Beispiel #1
0
    def get_collection_dict(self, collection_name):
        """Return a dict that maps each bin name of a collection to its splits.

        The rows are read from the collections-splits table of the database
        recorded for this collection in `self.collections_dict` (keys
        `source_db_path` and `source_db_version`).

        Parameters
        ----------
        collection_name : str
            Name of the collection; checked with `self.sanity_check` first.

        Returns
        -------
        dict
            `{bin_name: [split, split, ...]}`, with splits listed in the order
            the rows were returned by the database.
        """
        self.sanity_check(collection_name)

        c = self.collections_dict[collection_name]

        database = db.DB(c['source_db_path'], c['source_db_version'])
        try:
            collection_dict_from_db = database.get_some_rows_from_table_as_dict(
                t.collections_splits_table_name,
                'collection_name="%s"' % collection_name)
        finally:
            # release the connection even when the query fails
            database.disconnect()

        collection_dict_to_return = {}

        # group splits by bin; the parameter `collection_name` is no longer
        # rebound inside the loop
        for entry in collection_dict_from_db.values():
            collection_dict_to_return.setdefault(entry['bin_name'], []).append(entry['split'])

        return collection_dict_to_return
Beispiel #2
0
    def list_data_keys(self):
        """Print the data keys found in `self.table_name`.

        When the table has no data keys a single red notice is printed.
        Otherwise a header is printed, followed by one line per key (sorted)
        with the data type of its first row and, unless the target is
        `layer_orders`, the number of rows that carry the key.
        """
        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        additional_data_keys = sorted(
            database.get_single_column_from_table(self.table_name,
                                                  'data_key',
                                                  unique=True))

        if len(additional_data_keys):
            self.run.warning('',
                             'AVAILABLE DATA KEYS FOR %s (%d FOUND)' %
                             (self.target.upper(), len(additional_data_keys)),
                             lc='yellow')

            last_key = additional_data_keys[-1]
            for data_key in additional_data_keys:
                rows = database.get_some_rows_from_table_as_dict(
                    self.table_name, 'data_key="%s"' % data_key)
                data_type = list(rows.values())[0]['data_type']

                if self.target == 'layer_orders':
                    line = '%s (%s)' % (data_key, data_type)
                else:
                    line = '%s (%s, describes %d %s)' % (
                        data_key, data_type, len(rows), self.target)

                # only the final line is followed by an empty line
                self.run.info_single(line,
                                     nl_after=1 if data_key == last_key else 0)
        else:
            self.run.info_single(
                'There are no additional data for %s in this database :/' %
                self.target,
                nl_before=1,
                nl_after=1,
                mc='red')

        database.disconnect()
Beispiel #3
0
def migrate(db_path):
    """Upgrade a contigs database from `current_version` to `next_version`.

    The upgrade sets the `project_name` meta value to "NO_NAME", bumps the
    version, renames three gene-name fragments in the HMM hits info table for
    the source `Rinke_et_al`, and finally writes a default description via
    `dbops.update_description_in_db`.

    Parameters
    ----------
    db_path : str
        Path to the contigs database to upgrade.

    Raises
    ------
    ConfigError
        If `db_path` is None, or if the version stored in the database is not
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    # (presumably this raises when the file is not a contigs db -- confirm in utils)
    utils.is_contigs_db(db_path)

    # make sure the current version matches
    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    progress.new("Trying to upgrade the contigs database")
    progress.update('...')

    # drop any existing project name before writing the placeholder one
    contigs_db.remove_meta_key_value_pair('project_name')
    contigs_db.set_meta_value('project_name', "NO_NAME")
    contigs_db.commit()

    # set the version
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    # gene name changes:
    # each statement rewrites a substring of the `genes` column, only for rows
    # whose source matches 'Rinke_et_al'
    contigs_db._exec('''UPDATE %s SET genes = replace(genes, '%s', '%s') WHERE source LIKE 'Rinke_et_al';''' % (t.hmm_hits_info_table_name, 'Ribosomal_S12', 'Ribosom_S12_S23'))
    contigs_db._exec('''UPDATE %s SET genes = replace(genes, '%s', '%s') Where source LIKE 'Rinke_et_al';''' % (t.hmm_hits_info_table_name, 'UPF0027', 'RtcB'))
    contigs_db._exec('''UPDATE %s SET genes = replace(genes, '%s', '%s') Where source LIKE 'Rinke_et_al';''' % (t.hmm_hits_info_table_name, '‐', '-')) # the first hyphen is a non-ASCII character, the second one is the ASCII hyphen-minus

    # bye
    contigs_db.disconnect()

    # bye to you too
    progress.end()

    # done after the connection above is closed; the helper receives only the path
    dbops.update_description_in_db(db_path, 'No description is given')

    run.info_single("The contigs database is now %s! All this upgrade did was to associate your contigs db with a "
                    "project name (which happened to be 'NO_NAME', because anvi'o likes you very much)" \
                                            % (next_version), nl_after=1, nl_before=1, mc='green')
Beispiel #4
0
    def populate_collections_dict(self, db_path, version):
        """Register every collection found in a database in `self.collections_dict`.

        Each row of the collections info table is stored under its collection
        name and extended with three keys: `read_only`, `source_db_path` and
        `source_db_version`.

        Parameters
        ----------
        db_path : str
            Path of the database to read collections from.
        version : str
            Version passed to `db.DB` when opening `db_path`.

        Raises
        ------
        ConfigError
            If the `db_type` meta value is neither 'contigs' nor 'profile'.
        """
        database = db.DB(db_path, version)
        try:
            db_type = database.get_meta_value('db_type')
            collections_info_table = database.get_table_as_dict(
                t.collections_info_table_name)
        finally:
            # release the connection even when one of the reads fails
            database.disconnect()

        # collections info must be read only if its coming from the contigs database.
        if db_type == 'contigs':
            read_only = True
        elif db_type == 'profile':
            read_only = False
        else:
            # call syntax: the Python 2 `raise X, msg` form is a SyntaxError on Python 3
            raise ConfigError(
                'Collections class does not know about this "%s" database type :/' % db_type)

        for collection_name in collections_info_table:
            collection_info = collections_info_table[collection_name]
            collection_info['read_only'] = read_only
            collection_info['source_db_path'] = db_path
            collection_info['source_db_version'] = version
            self.collections_dict[collection_name] = collection_info
Beispiel #5
0
    def get_genome_hash_for_internal_genome(self, entry):
        """Compute the 12-character hash that identifies an internal genome.

        The hash is the first 12 hex digits of the SHA-224 digest of the
        concatenated split names of interest, an underscore, and the
        `contigs_db_hash` meta value of the genome's contigs database.

        If the hash is already present in `self.genome_hash_to_genome_name`:
        with `self.skip_checking_genome_hashes` set, both genome names are
        recorded in `self.internal_genomes_with_identical_hashes`; otherwise a
        ConfigError is raised.

        Parameters
        ----------
        entry : dict
            Genome description; the keys read here are `contigs_db_path` and
            `name`.

        Returns
        -------
        str
            The genome hash.

        Raises
        ------
        ConfigError
            When another genome already has the same hash and hash checking is
            not skipped.
        """
        self.is_proper_db(entry['contigs_db_path'], db_type='contigs')
        split_names_of_interest = self.get_split_names_of_interest_for_internal_genome(
            entry)

        # read the meta value through a named connection so that it can be
        # closed explicitly rather than left for garbage collection
        contigs_db = db.DB(entry['contigs_db_path'], None, ignore_version=True)
        try:
            contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        finally:
            contigs_db.disconnect()

        genome_hash = hashlib.sha224('_'.join(
            [''.join(split_names_of_interest),
             contigs_db_hash]).encode('utf-8')).hexdigest()[0:12]

        if genome_hash not in self.genome_hash_to_genome_name:
            return genome_hash

        known_genome_name = self.genome_hash_to_genome_name[genome_hash]

        if self.skip_checking_genome_hashes:
            # remember both names under the shared hash
            self.internal_genomes_with_identical_hashes.setdefault(
                genome_hash, set()).update((known_genome_name, entry['name']))
            return genome_hash

        self.progress.reset()
        genome_1, genome_2 = known_genome_name, entry['name']
        raise ConfigError(
            "According to hash values anvi'o has been generating for your internal genomes, not all genomes you have seem to be uniuqe. "
            "It is most likely you unintentionally listed the same information for different genome names. If you would like "
            "to double check, genome %s (in '%s') and genome %s (in '%s') seem to have the same hash (so they are basically the same genomes). "
            "If you are aware of this and/or if you would like anvi'o to not check genome hashes, please use the flag "
            "`--skip-checking-genome-hashes`." %
            (genome_1, self.genomes[genome_1]['collection_id'],
             genome_2, self.genomes[genome_2]['collection_id']))
Beispiel #6
0
    def __init__(self,
                 db_path,
                 num_threads_to_use=1,
                 run=run,
                 progress=progress):
        """Set up table access for HMM hits in a contigs database.

        Reads the `contigs_db_hash` meta value, initializes the `Table` base
        class, loads the gene calls, warns when there are none, and reserves
        the next available ids for the HMM hits tables.

        Parameters
        ----------
        db_path : str
            Path to the contigs database; checked with `utils.is_contigs_db`.
        num_threads_to_use : int
            Stored on the instance as-is.
        run, progress
            Forwarded to `Table.__init__`.
        """
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path

        utils.is_contigs_db(self.db_path)

        # open the connection under a name so it is closed explicitly once
        # the meta value has been read
        contigs_db = db.DB(self.db_path,
                           utils.get_required_version_for_db(self.db_path))
        try:
            self.contigs_db_hash = contigs_db.get_meta_value('contigs_db_hash')
        finally:
            contigs_db.disconnect()

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run,
                       progress)

        self.init_gene_calls_dict()

        # no gene calls at all: explain why, depending on whether gene calling
        # was performed for this database
        if not len(self.gene_calls_dict):
            if self.genes_are_called:
                self.run.warning(
                    "Tables in this contigs database that should contain gene calls are empty despite the fact that\
                                  you didn't skip the gene calling step while generating this contigs database. This probably means\
                                  that the gene caller did not find any genes among contigs. This is OK for now. But might explode\
                                  later. If it does explode and you decide to let us know about that problem, please remember to mention\
                                  this warning. By the way, this warning probably has been seen by like only 2 people on the planet. Who\
                                  works with contigs with no gene calls? A better implementation of anvi'o will unite researchers who\
                                  study weird stuff.")
            else:
                self.run.warning(
                    "It seems you have skipped gene calling step while generating your contigs database, and you have no\
                                  genes calls in tables that should contain gene calls. Anvi'o will let you go with this since some HMM\
                                  sources only operate on DNA sequences, and at this point it doesn't know which HMMs you wish to run.\
                                  If the lack of genes causes a problem, you will get another error message later probably :/"
                )

        self.set_next_available_id(t.hmm_hits_table_name)
        self.set_next_available_id(t.hmm_hits_splits_table_name)
Beispiel #7
0
    def __init__(self, args):
        """Resolve the database path from `args` and load existing data keys.

        The path is the first truthy value among the `pan_or_profile_db`,
        `profile_db` and `pan_db` attributes of `args`. `self.table_name`,
        `self.run` and `self.progress` are read here, so they must already be
        set on the instance when this runs.

        Raises
        ------
        ConfigError
            If no database path is found in `args`, or if `self.table_name`
            is falsy.
        """
        def arg_or_none(name):
            # attribute of `args` when present, None otherwise
            return args.__dict__.get(name)

        self.db_path = (arg_or_none('pan_or_profile_db')
                        or arg_or_none('profile_db')
                        or arg_or_none('pan_db'))
        self.just_do_it = arg_or_none('just_do_it')

        if not self.db_path:
            raise ConfigError(
                "The AdditionalAndOrderDataBaseClass is inherited with an args object that did not "
                "contain any database path :/ Even though any of the following would "
                "have worked: `pan_or_profile_db`, `profile_db`, `pan_db` :(")

        if not self.table_name:
            raise ConfigError(
                "The AdditionalAndOrderDataBaseClass does not know anything about the table it should "
                "be working with.")

        # keep the data keys that are already stored in the table
        database = db.DB(self.db_path, None, ignore_version=True)
        existing_keys = database.get_single_column_from_table(
            self.table_name, 'data_key')
        self.additional_data_keys = existing_keys
        database.disconnect()

        Table.__init__(self, self.db_path, None, self.run, self.progress)
Beispiel #8
0
    def add(self, keys_list, data_dict, skip_check_names=False):
        """Insert additional data into `self.table_name`.

        A type is predicted for every key in `keys_list` ("stackedbar" when
        the key contains '!', otherwise the name of the class returned by
        `utils.get_predicted_type_of_items_in_a_dict`, or None), and one row
        per (item, key) pair of `data_dict` is written to the table.

        Parameters
        ----------
        keys_list : list
            Data keys to report and to predict types for.
        data_dict : dict
            `{item_name: {key: value}}`.
        skip_check_names : bool
            Not used in this method body.

        Raises
        ------
        ConfigError
            If `keys_list` is not a list or `data_dict` is not a dict.
        """
        if not isinstance(keys_list, list):
            raise ConfigError("List of keys must be of type `list`. Go away.")

        if not isinstance(data_dict, dict):
            raise ConfigError("Nope. Your data must be of type `dict`.")

        self.run.warning(None, 'New additional data...', lc="yellow")

        key_types = {}
        for key in keys_list:
            if '!' in key:
                key_types[key] = "stackedbar"
            else:
                type_class = utils.get_predicted_type_of_items_in_a_dict(
                    data_dict, key)
                key_types[key] = type_class.__name__ if type_class else None

            # an empty line follows only the last reported key
            self.run.info('Key "%s"' % key,
                          'Predicted type: %s' % (key_types[key]),
                          nl_after=1 if key == keys_list[-1] else 0)

        self.set_next_available_id(self.table_name)

        # one row per (item, key); the id is drawn first, as each row is built
        db_entries = [
            (self.next_id(self.table_name), item_name, key,
             data_dict[item_name][key], key_types[key])
            for item_name in data_dict
            for key in data_dict[item_name]
        ]

        database = db.DB(self.db_path, None, ignore_version=True)
        insert_statement = '''INSERT INTO %s VALUES (?,?,?,?,?)''' % self.table_name
        database._exec_many(insert_statement, db_entries)
        database.disconnect()

        self.run.info('New data added to the db',
                      '%s.' % (', '.join(keys_list)))
Beispiel #9
0
    def create(self, functions_dict, drop_previous_annotations_first=False):
        """Write gene function calls into the gene function calls table.

        Parameters
        ----------
        functions_dict : dict
            `{entry_id: row}` where each row holds every column named in
            `t.gene_function_calls_table_structure`, including `source` and
            `gene_callers_id`.
        drop_previous_annotations_first : bool
            Forwarded to `self.add_new_sources_to_functional_sources`.
        """
        self.sanity_check()

        table_structure = t.gene_function_calls_table_structure

        # open connection
        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        try:
            # Add the new sources to existing sources
            gene_function_sources = set(
                v['source'] for v in functions_dict.values())
            self.add_new_sources_to_functional_sources(
                gene_function_sources,
                database,
                drop_previous_annotations_first=drop_previous_annotations_first)

            unique_num_genes = len(
                set(v['gene_callers_id'] for v in functions_dict.values()))

            # push the data; one placeholder per column, so the statement
            # always matches the width of the rows built from the structure
            db_entries = [
                tuple(row[h] for h in table_structure)
                for row in functions_dict.values()
            ]
            placeholders = ','.join('?' * len(table_structure))
            database._exec_many(
                '''INSERT INTO %s VALUES (%s)''' %
                (t.gene_function_calls_table_name, placeholders), db_entries)
        finally:
            # disconnect like a pro, also when something above fails
            database.disconnect()

        sources_string = ", ".join(gene_function_sources)
        self.run.info(
            'Gene functions',
            f"{len(functions_dict)} function calls from {len(gene_function_sources)} sources ({sources_string}) "
            f"for {unique_num_genes} unique gene calls has\
                                         been added to the contigs database.")
Beispiel #10
0
def migrate(db_path):
    """Upgrade a profile database from `current_version` to `next_version`.

    For the layer and item additional data tables, the current content is
    passed through `divide_stackedbar_to_multiple_entries`, the table is
    emptied, and the resulting entries are inserted again with ids that count
    up from zero.

    Raises
    ------
    ConfigError
        If `db_path` is None, or if the version stored in the database is not
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is accurate
    profile_db = db.DB(db_path, None, ignore_version=True)
    version_in_db = str(profile_db.get_version())
    if version_in_db != current_version:
        raise ConfigError(
            "Version of this profile database is not %s (hence, this script cannot really do anything)."
            % current_version)

    for table_name in ('layer_additional_data', 'item_additional_data'):
        old_table = profile_db.get_table_as_dict(table_name)
        new_table = divide_stackedbar_to_multiple_entries(old_table)

        # replace the table content with the re-numbered entries
        profile_db._exec("DELETE FROM '%s'" % table_name)
        for new_entry_counter, entry_id in enumerate(new_table):
            row = (new_entry_counter, *new_table[entry_id].values())
            profile_db.insert(table_name, row)

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)

    # bye
    profile_db.disconnect()
    progress.end()

    run.info_single(
        "Your profile db is now %s (and anvi'o is as surprised as you are)." %
        next_version,
        nl_after=1,
        nl_before=1,
        mc='green')
Beispiel #11
0
    def remove_source(self, source):
        """Remove an HMM source from the database.

        Rows whose `source` column equals `source` are deleted from the HMM
        and gene tables listed below. Gene calls in `self.gene_calls_dict`
        that carry the same source are then removed, by gene caller id, from
        the amino acid sequences, genes taxonomy and genes-in-splits tables.
        """
        tables_with_source = [
            t.hmm_hits_info_table_name,
            t.hmm_hits_table_name,
            t.hmm_hits_splits_table_name,
            t.genes_in_contigs_table_name,
            t.gene_function_calls_table_name,
        ]
        tables_with_gene_callers_id = [
            t.gene_amino_acid_sequences_table_name,
            t.genes_taxonomy_table_name,
            t.genes_in_splits_table_name,
        ]

        # delete entries from tables with 'source' column
        self.delete_entries_for_key('source', source, tables_with_source)

        # gene caller ids that were added to the db via the HMM source
        gene_caller_ids_to_remove = {
            gene_callers_id
            for gene_callers_id, gene_call in self.gene_calls_dict.items()
            if gene_call['source'] == source
        }

        # nothing else to clean up when the source added no gene calls
        if not gene_caller_ids_to_remove:
            return

        id_list = ','.join(str(x) for x in gene_caller_ids_to_remove)
        where_clause = "gene_callers_id in (%s)" % id_list

        database = db.DB(self.db_path,
                         utils.get_required_version_for_db(self.db_path))
        for table in tables_with_gene_callers_id:
            database.remove_some_rows_from_table(table, where_clause)
        database.disconnect()

        run.warning(
            '%d gene caller ids that were added via the HMM source have been removed from "%s"'
            % (len(gene_caller_ids_to_remove),
               ', '.join(tables_with_gene_callers_id)))
Beispiel #12
0
def migrate(db_path):
    """Upgrade a pan database from `current_version` to `next_version`.

    Every item order and layer order of type 'newick' is parsed with
    `Tree(..., format=1)`, written back out with `format=2`, and stored in
    place of the original value.

    Raises
    ------
    ConfigError
        If `db_path` is None, or if the version stored in the database is not
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_pan_db(db_path)

    # make sure the version is accurate
    pan_db = db.DB(db_path, None, ignore_version = True)
    version_in_db = str(pan_db.get_version())
    if version_in_db != current_version:
        raise ConfigError("Version of this pan database is not %s (hence, this script cannot really do anything)." % current_version)

    # migrate item orders
    for order_name, order in pan_db.get_table_as_dict(item_orders_table_name).items():
        if order['type'] != 'newick':
            continue

        converted = Tree(order['data'], format=1).write(format=2)
        pan_db._exec("""UPDATE %s SET "data" = ? WHERE "name" LIKE ?""" % item_orders_table_name, (converted, order_name))

    # migrate layer orders
    for order_name, order in pan_db.get_table_as_dict(layer_orders_table_name).items():
        if order['data_type'] != 'newick':
            continue

        converted = Tree(order['data_value'], format=1).write(format=2)
        pan_db._exec("""UPDATE %s SET "data_value" = ? WHERE "data_key" LIKE ?""" % layer_orders_table_name, (converted, order_name))

    # set the version
    pan_db.remove_meta_key_value_pair('version')
    pan_db.set_version(next_version)

    # now bye for real!
    pan_db.disconnect()

    progress.end()

    run.info_single('Your pan db is now %s.' % next_version, nl_after=1, nl_before=1, mc='green')
Beispiel #13
0
    def __init__(self,
                 contigs_db_path,
                 sources=set([]),
                 run=run,
                 progress=progress):
        """Load HMM hit tables from a contigs database, optionally per source.

        Parameters
        ----------
        contigs_db_path : str
            Path to the contigs database, opened with
            `anvio.__contigs__version__`.
        sources : set
            HMM sources to keep. Falsy members are dropped. When the resulting
            set is empty, all sources in the HMM hits info table are used. The
            default set is never mutated here; a new set is built from it.
        run, progress
            Not used in this method body.

        Raises
        ------
        ConfigError
            If `sources` is not a set, or if a requested source is missing
            from the HMM hits info table.
        """
        if not isinstance(sources, set):
            # call syntax: the Python 2 `raise X, msg` form is a SyntaxError on Python 3
            raise ConfigError("'sources' variable has to be a set instance.")

        self.sources = set([s for s in sources if s])

        # take care of contigs db related stuff and move on:
        contigs_db = db.DB(contigs_db_path, anvio.__contigs__version__)
        self.hmm_hits = contigs_db.get_table_as_dict(t.hmm_hits_table_name)
        self.hmm_hits_info = contigs_db.get_table_as_dict(
            t.hmm_hits_info_table_name)
        self.hmm_hits_splits = contigs_db.get_table_as_dict(
            t.hmm_hits_splits_table_name)
        self.contig_sequences = contigs_db.get_table_as_dict(
            t.contig_sequences_table_name, string_the_key=True)
        self.genes_in_contigs = contigs_db.get_table_as_dict(
            t.genes_in_contigs_table_name)
        contigs_db.disconnect()

        missing_sources = [
            s for s in self.sources if s not in self.hmm_hits_info
        ]
        if len(missing_sources):
            raise ConfigError('Some of the requested sources were not found in the contigs database :/\
                                Here is a list of the ones that are missing: %s' % ', '.join(
                missing_sources))

        if len(self.sources):
            # keep only the hits that belong to the requested sources
            self.hmm_hits_splits = utils.get_filtered_dict(
                self.hmm_hits_splits, 'source', self.sources)
            self.hmm_hits = utils.get_filtered_dict(self.hmm_hits, 'source',
                                                    self.sources)
        else:
            # no explicit request: fall back to every source in the database
            self.sources = self.hmm_hits_info.keys()
Beispiel #14
0
    def __init__(self,
                 db_path,
                 version,
                 run=run,
                 progress=progress,
                 quiet=False,
                 simple=False):
        """Store the constructor arguments and read the database type.

        Sets the bookkeeping attributes to their initial values, opens the
        database at `db_path` with version checking disabled, and stores the
        `db_type` meta value in `self.db_type`.

        NOTE(review): in the lines visible here `simple` is never read and the
        connection opened below is never disconnected -- this body may be
        incomplete; confirm against the full class before relying on it.
        """
        self.quiet = quiet
        self.db_type = None
        self.db_path = db_path
        self.version = version
        # per-table id counters; starts empty
        self.next_available_id = {}

        # placeholders, all None until filled in elsewhere
        self.splits_info = None
        self.contigs_info = None
        self.split_length = None
        self.genes_are_called = None

        self.run = run
        self.progress = progress

        # `ignore_version=True` is passed even though `version` is given
        database = db.DB(self.db_path, version, ignore_version=True)
        self.db_type = database.get_meta_value('db_type')
Beispiel #15
0
    def get_bins_info_dict(self, collection_name):
        """Return color and source information for each bin of a collection.

        Returns
        -------
        dict
            `{bin_name: {'html_color': ..., 'source': ...}}` built from the
            rows of the collections bins info table whose `collection_name`
            matches.
        """
        self.sanity_check(collection_name)

        collection = self.collections_dict[collection_name]

        database = db.DB(collection['source_db_path'],
                         collection['source_db_version'])
        all_bins_info = database.get_table_as_dict(
            t.collections_bins_info_table_name)
        database.disconnect()

        # FIXME: this could be resolved with a WHERE clause in the SQL query:
        bins_info_for_collection = utils.get_filtered_dict(
            all_bins_info, 'collection_name', {collection_name})

        return {
            row['bin_name']: {
                'html_color': row['html_color'],
                'source': row['source']
            }
            for row in bins_info_for_collection.values()
        }
Beispiel #16
0
    def add(self, blastp_search_output):
        """Incrementally add new hits to a contigs database.

        Each item of `blastp_search_output` is a
        `(gene_callers_id, scg_name, scg_hits)` triple. Every hit becomes one
        row: the gene caller id, the SCG name, and then the hit's values for
        the remaining columns of `t.scg_taxonomy_table_structure`.

        It is essential to run the member function
        `update_db_self_table_values` once all new hits have been added.
        """
        self.database = db.DB(self.db_path,
                              utils.get_required_version_for_db(self.db_path))

        entries = []
        for gene_callers_id, scg_name, scg_hits in blastp_search_output:
            # only genes that have at least one hit contribute rows
            if len(scg_hits):
                row_prefix = [gene_callers_id, scg_name]
                entries.extend(
                    row_prefix +
                    [scg_hit[f] for f in t.scg_taxonomy_table_structure[2:]]
                    for scg_hit in scg_hits)

        self.database.insert_many(t.scg_taxonomy_table_name, entries)
        self.database.disconnect()
Beispiel #17
0
    def populate_genes_in_contigs_table(self, gene_calls_dict, amino_acid_sequences, append_to_the_db=False):
        """Write gene calls and their amino acid sequences into the contigs db.

        Without `append_to_the_db`, the genes-in-contigs and amino acid
        sequence tables are emptied first. With it, only rows that belong to
        gene caller ids of the incoming sources are removed beforehand, from
        the genes-in-contigs and genes-in-splits tables.

        Parameters
        ----------
        gene_calls_dict : dict
            `{gene_callers_id: gene_call}`; each gene call holds the columns
            of `t.genes_in_contigs_table_structure` after the first one.
        amino_acid_sequences : dict
            `{gene_callers_id: sequence}`; ids that are missing get an empty
            string.
        append_to_the_db : bool
            Keep existing rows of other sources when True.
        """
        utils.is_contigs_db(self.db_path)
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))

        if append_to_the_db:
            # append mode: previous entries that match the incoming sources must go,
            # otherwise the db may end up with many duplicates.
            sources = {v['source'] for v in gene_calls_dict.values()}

            for source in sources:
                gene_caller_ids_for_source = database.get_single_column_from_table(t.genes_in_contigs_table_name,
                                                                                   'gene_callers_id',
                                                                                   where_clause="""source='%s'""" % source)
                if not gene_caller_ids_for_source:
                    continue

                id_list = ','.join([str(g) for g in gene_caller_ids_for_source])
                for table_name in (t.genes_in_contigs_table_name, t.genes_in_splits_table_name):
                    database._exec('''DELETE FROM %s WHERE gene_callers_id IN (%s)''' % (table_name, id_list))
        else:
            # start from scratch
            database._exec('''DELETE FROM %s''' % (t.genes_in_contigs_table_name))
            database._exec('''DELETE FROM %s''' % (t.gene_amino_acid_sequences_table_name))

        self.progress.new('Processing')
        self.progress.update('Entering %d gene calls into the db ...' % (len(gene_calls_dict)))

        gene_call_rows = []
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            gene_call_rows.append(tuple([gene_callers_id] + [gene_call[h] for h in t.genes_in_contigs_table_structure[1:]]))
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?,?,?,?,?)''' % t.genes_in_contigs_table_name, gene_call_rows)

        sequence_rows = []
        for gene_callers_id in gene_calls_dict:
            sequence = amino_acid_sequences[gene_callers_id] if gene_callers_id in amino_acid_sequences else ''
            sequence_rows.append((gene_callers_id, sequence))
        database._exec_many('''INSERT INTO %s VALUES (?,?)''' % t.gene_amino_acid_sequences_table_name, sequence_rows)

        self.progress.end()

        database.disconnect()
Beispiel #18
0
    def check_sources(self, sources):
        """Make sure the HMM sources about to be run do not clash with the db.

        With `self.add_to_functions_table` set, the sources are compared with
        the `gene_function_sources` meta value and any overlap is an error.
        Otherwise they are compared with the sources in the HMM hits info
        table: an old `Ribosomal_RNAs` model combined with new
        `Ribosomal_RNA_*` sources is an error, and overlapping sources are
        removed when `self.just_do_it` is set, or reported as an error when
        it is not.

        Parameters
        ----------
        sources : dict
            HMM sources keyed by source name.

        Raises
        ------
        ConfigError
            In the clash situations described above.
        """

        if self.add_to_functions_table: # check that source is not already in gene_functions table
            # named connection so that it is closed once the value is read
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            try:
                gene_function_sources_in_db = database.get_meta_value('gene_function_sources')
            finally:
                database.disconnect()

            sources_in_db = set(gene_function_sources_in_db.split(',') if gene_function_sources_in_db else [])
            sources_need_to_be_removed = set(sources.keys()).intersection(sources_in_db)

            if len(sources_need_to_be_removed):
                source_string = ', '.join(sources_need_to_be_removed)
                raise ConfigError("Some of the HMM sources are already in the gene functions table in the database and anvi'o "
                                  "doesn't want to overwrite them. If YOU want to overwrite them, however, (because you do you, "
                                  "friend) you can do that by "
                                  "running `anvi-delete-functions` first, and then re-running this program. Here are the sources "
                                  f"that you would need to delete: {source_string}")
        else: # default checks for hmm_hits table
            sources_in_db = list(hmmops.SequencesForHMMHits(self.db_path).hmm_hits_info.keys())

            if 'Ribosomal_RNAs' in sources_in_db and any(s.startswith('Ribosomal_RNA_') for s in sources):
                raise ConfigError("Here is one more additional step we need to you take care of before we can go forward: Your contigs database "
                                  "already contains HMMs from an older `Ribosomal_RNAs` model anvi'o no longer uses AND you are about to run "
                                  "its newer models that do the same thing (but better). Since Ribosomal RNA models add new gene calls to the "
                                  "database, running newer models without first cleaning up the old ones will result in duplication of gene calls "
                                  "as examplified here: https://github.com/merenlab/anvio/issues/1598. Anvi'o could've removed the `Ribosomal_RNAs` "
                                  "model for you automatically, but the wisdom tells us that the person who passes the sentence should swing the "
                                  "sword. Here it is for your grace: \"anvi-delete-hmms -c CONTIGS.db --hmm-source Ribosomal_RNAs\".")

            sources_need_to_be_removed = set(sources.keys()).intersection(sources_in_db)

            if len(sources_need_to_be_removed):
                if self.just_do_it:
                    for source_name in sources_need_to_be_removed:
                        self.remove_source(source_name)
                else:
                    raise ConfigError("Some of the HMM sources you wish to run on this database are already in the database and anvi'o "
                                      "refuses to overwrite them without your explicit input. You can either use `anvi-delete-hmms` "
                                      "to remove them first, or run this program with `--just-do-it` flag so anvi'o would remove all "
                                      "for you. Here are the list of HMM sources that need to be removed: '%s'." % (', '.join(sources_need_to_be_removed)))
Beispiel #19
0
    def get(self):
        """Will return the additional data keys and the dict.

        Returns
        -------
        tuple
            `(additional_data_keys, d)` where `d` is
            `{item_name: {data_key: value}}`. Values whose `data_type` is
            'int' or 'float' are converted with the matching builtin (falsy
            stored values are first replaced by `self.nulls_per_type` for
            that type); keys an item has no row for are set to None. When the
            table holds no item names, `([], {})` is returned.
        """

        self.progress.new('Recovering additional keys and data for %s' % self.target)
        self.progress.update('...')

        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        try:
            additional_data = database.get_table_as_dict(self.table_name)
            additional_data_keys = database.get_single_column_from_table(self.table_name, 'data_key', unique=True)
            additional_data_item_names = database.get_single_column_from_table(self.table_name, 'item_name', unique=True)
        finally:
            # release the connection even when one of the reads fails
            database.disconnect()

        if not len(additional_data_item_names):
            self.progress.end()
            return [], {}

        # explicit mapping from the stored type name to its converter; this
        # replaces calling eval() on a string read from the database
        converters = {'int': int, 'float': float}

        d = {item_name: {} for item_name in additional_data_item_names}

        for entry in additional_data.values():
            data_type = entry['data_type']
            value = entry['data_value']

            if data_type in converters:
                value = converters[data_type](value or self.nulls_per_type[data_type])

            d[entry['item_name']][entry['data_key']] = value

        # every item gets every key; the missing ones are None
        for item_data in d.values():
            for key in additional_data_keys:
                item_data.setdefault(key, None)

        self.progress.end()

        return additional_data_keys, d
Beispiel #20
0
def migrate(db_path):
    """Drop HMM hits of three retired sources and bump the contigs db version.

    Rows whose `source` is "Rinke_et_al", "Campbell_et_al" or "BUSCO_83_Protista"
    are removed from the `hmm_hits_info`, `hmm_hits` and `hmm_hits_in_splits`
    tables, and the stored version is then set to `next_version`.

    Parameters
    ==========
    db_path: str
        Path to the contigs database to migrate.

    Raises
    ======
    ConfigError
        If `db_path` is None, or if the version stored in the database is not
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # presumably complains when the file is not a contigs db -- see utils
    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version=True)
    if str(contigs_db.get_version()) != current_version:
        # nothing has been changed yet; close the connection before bailing out
        contigs_db.disconnect()
        raise ConfigError(
            "Version of this contigs database is not %s (hence, this script cannot really do anything)."
            % current_version)

    progress.new("Dropping the HMMs ")
    progress.update("...")
    for table_name in ['hmm_hits_info', 'hmm_hits', 'hmm_hits_in_splits']:
        contigs_db.remove_some_rows_from_table(
            table_name,
            'source IN ("Rinke_et_al", "Campbell_et_al", "BUSCO_83_Protista")')

    progress.update("Updating version")
    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)

    progress.update("Committing changes")
    contigs_db.disconnect()

    progress.end()
    run.info_single(
        "The contigs database is now %s. Unfortunately this update removed ALL SINGLE-COPY CORE GENE\
                     HMMs FROM YOUR CONTIGS DATABASE :( We are very sorry about this, but we only did it to be\
                     able to offer you nicer things. It is best if you re-run `anvi-run-hmms` program from scratch.\
                     Doing that will not remove any 'non-default' HMM profiles you may have added in this contigs\
                     database, so you have nothing to worry about." %
        (next_version),
        nl_after=1,
        nl_before=1,
        mc='green')
Beispiel #21
0
    def check_params(self):
        """Make sure params to generate gene-level stats match across the board.

        Every entry of `self.parameters` is compared with the meta value stored
        under the same name in the database at `self.db_path`. Values are
        compared as floats when both can be converted, and as-is otherwise.

        Raises
        ======
        ConfigError
            If a parameter cannot be read from the database, or if at least one
            stored value differs from the value currently set (the first
            mismatch is reported as an example).
        """
        database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
        non_matching_parameters = []

        try:
            for parameter in self.parameters:
                try:
                    parameter_in_db = database.get_meta_value(parameter)
                except Exception:
                    raise ConfigError("Bad news of the day: You have a genes database for the collection %s and bin %s. But "
                                      "clearly the parameters you used to generate these gene-level coverage data has little "
                                      "to do with the parameters you are using now. For instance, parameter '%s' was not even "
                                      "stored in the database :/" % \
                                            (self.collection_name, self.bin_name , str(parameter)))

                parameter_user_set = self.parameters[parameter]
                try:
                    # compare numerically when both sides look like numbers, so that
                    # e.g. '10' and 10.0 do not count as a mismatch
                    parameter_in_db, parameter_user_set = float(parameter_in_db), float(parameter_user_set)
                except (TypeError, ValueError, OverflowError):
                    # not numbers; fall back to comparing the raw values
                    pass

                if parameter_in_db != parameter_user_set:
                    non_matching_parameters.append((parameter, parameter_in_db, parameter_user_set))
        finally:
            # single exit point for the connection, on success and on failure alike
            database.disconnect()

        if len(non_matching_parameters):
            e = non_matching_parameters[0]
            raise ConfigError("OK. You have a genes database for the collection %s and bin %s. But %d "
                              "of the parameters you used to generate these gene-level coverage data is not "
                              "matching the matching parameters you are using now. For instance, the database "
                              "has %s for %s, but the same parameter is currently set to %s in your workflow. "
                              "The best solution to this is to remove this database (which is at '%s'), and let "
                              "anvi'o generate another one for you." % \
                                    (self.collection_name, self.bin_name, len(non_matching_parameters), str(e[1]),
                                    e[0], str(e[2]), self.db_path))
Beispiel #22
0
    def __init__(self, db_path, version, run=terminal.Run(), progress=terminal.Progress(), quiet=False, simple=False):
        """Set up the state shared by table classes for the database at `db_path`.

        Parameters
        ==========
        db_path: str
            Path to an existing database.
        version
            Version passed on to `db.DB` when opening the database.
        run, progress
            Objects stored on the instance as `self.run` and `self.progress`.
        quiet: bool, False
            Stored on the instance as `self.quiet`.
        simple: bool, False
            When True, the contigs-specific information (split length, contigs
            and splits info, etc.) is not loaded even for a contigs database.

        Raises
        ======
        ConfigError
            If `db_path` is empty or does not exist on disk.
        """
        if not db_path:
            raise ConfigError("Table superclass is being initiated without a db path, and it is very "
                               "very concerning :( Anvi'o needs an adult.")

        if not os.path.exists(db_path):
            raise ConfigError("Database ('%s') does not exist. You must create one first." % db_path)

        self.quiet = quiet
        self.db_type = None
        self.db_path = db_path
        self.version = version
        self.next_available_id = {}

        self.splits_info = None
        self.contigs_info = None
        self.split_length = None
        self.genes_are_called = None

        self.run = run
        self.progress = progress

        database = db.DB(self.db_path, version)
        try:
            self.db_type = database.get_meta_value('db_type')

            if not simple and self.db_type == 'contigs':
                # FIXME: a better design is required. the salient point is, "Table" must serve for both profile db
                # and contigs db calls.
                self.split_length = database.get_meta_value('split_length')
                self.genes_are_called = database.get_meta_value('genes_are_called')
                self.contigs_info = database.get_table_as_dict(t.contigs_info_table_name, string_the_key=True)
                self.splits_info = database.get_table_as_dict(t.splits_info_table_name)
                self.contig_name_to_splits = utils.get_contig_name_to_splits_dict(self.splits_info, self.contigs_info)
                self.gene_calls_dict = None
        finally:
            # do not leave the connection open if any of the reads above fails
            database.disconnect()
Beispiel #23
0
def migrate(db_path):
    """Add a `data_group` column to the additional data tables of a profile db.

    A text column named `data_group` is added to both `item_additional_data`
    and `layer_additional_data`, every row gets the value 'default', and the
    stored version is then set to `next_version`.

    Parameters
    ==========
    db_path: str
        Path to the profile database to migrate.

    Raises
    ======
    ConfigError
        If `db_path` is None, or if the version stored in the database is not
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is accurate
    profile_db = db.DB(db_path, None, ignore_version=True)
    if str(profile_db.get_version()) != current_version:
        # nothing has been changed yet; close the connection before bailing out
        profile_db.disconnect()
        raise ConfigError(
            "Version of this profile database is not %s (hence, this script cannot really do anything)."
            % current_version)

    table_names = ('item_additional_data', 'layer_additional_data')

    for table_name in table_names:
        profile_db._exec(
            'ALTER TABLE "%s" ADD COLUMN "data_group" text' % table_name)

    for table_name in table_names:
        # the value is a string literal, hence single quotes: double quotes denote
        # identifiers in SQL and only work as strings through a legacy fallback
        profile_db._exec(
            """UPDATE "%s" SET "data_group" = 'default'""" % table_name)

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)

    # bye
    profile_db.disconnect()
    # NOTE(review): no progress.new() is issued in this function -- confirm that
    # calling end() on an idle progress object is intended
    progress.end()

    run.info_single('Your profile db is now %s, and you rock.' % next_version,
                    nl_after=1,
                    nl_before=1,
                    mc='green')
Beispiel #24
0
    def __init__(self,
                 db_path,
                 db_hash,
                 create_new=False,
                 ignore_hash=False,
                 run=run,
                 progress=progress,
                 quiet=False):
        """Open (or create) the auxiliary data database for coverages.

        Parameters
        ==========
        db_path: str
            Path of the auxiliary database.
        db_hash
            Hash associated with this database; stored as a string.
        create_new: bool, False
            When True, the database is opened as a new one and its tables are
            created via `self.create_tables()`.
        ignore_hash: bool, False
            When True, `self.check_hash()` is not called.
        run, progress, quiet
            Stored on the instance as they are.
        """
        # what this database is
        self.db_type = 'auxiliary data for coverages'
        self.db_hash = str(db_hash)
        self.version = anvio.__auxiliary_data_version__
        self.db_path = db_path

        # how to talk to the user
        self.run, self.progress, self.quiet = run, progress, quiet

        self.coverage_entries = []

        self.db = db.DB(self.db_path, self.version, new_database=create_new)

        if create_new:
            self.create_tables()

        if ignore_hash:
            return

        self.check_hash()
Beispiel #25
0
def migrate(db_path):
    """Add an `additional` column to the `item_orders` table of a profile db.

    A text column named `additional` is added to `item_orders`, every row gets
    the value '{}', and the stored version is then set to `next_version`.

    Parameters
    ==========
    db_path: str
        Path to the profile database to migrate.

    Raises
    ======
    ConfigError
        If `db_path` is None, or if the version stored in the database is not
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    # make sure someone is not being funny
    utils.is_profile_db(db_path)

    # make sure the version is accurate
    profile_db = db.DB(db_path, None, ignore_version=True)
    if str(profile_db.get_version()) != current_version:
        # nothing has been changed yet; close the connection before bailing out
        profile_db.disconnect()
        raise ConfigError(
            "Version of this profile database is not %s (hence, this script cannot really do anything)."
            % current_version)

    profile_db._exec('ALTER TABLE "item_orders" ADD COLUMN "additional" text')
    # the value is a string literal, hence single quotes: double quotes denote
    # identifiers in SQL and only work as strings through a legacy fallback
    profile_db._exec("""UPDATE "item_orders" SET "additional" = '{}'""")

    # set the version
    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)

    # bye
    profile_db.disconnect()
    # NOTE(review): no progress.new() is issued in this function -- confirm that
    # calling end() on an idle progress object is intended
    progress.end()
Beispiel #26
0
    def __init__(self, args):
        """Resolve the target database from `args` and read its data keys.

        The database path is taken from the first non-empty attribute of `args`
        among `pan_or_profile_db`, `profile_db` and `pan_db`. `self.table_name`,
        `self.run` and `self.progress` are read here and therefore must already
        be set on the instance before this is called.

        Raises
        ======
        ConfigError
            If none of the three attributes yields a database path, or if
            `self.table_name` is empty.
        """
        arg_values = args.__dict__

        # .get() yields None for attributes the args object does not carry
        self.db_path = (arg_values.get('pan_or_profile_db')
                        or arg_values.get('profile_db')
                        or arg_values.get('pan_db'))
        self.just_do_it = arg_values.get('just_do_it')

        if not self.db_path:
            raise ConfigError(
                "The AdditionalAndOrderDataBaseClass is inherited with an args object that did not\
                               contain any database path :/ Even though any of the following would\
                               have worked: `pan_or_profile_db`, `profile_db`, `pan_db` :("
            )

        if not self.table_name:
            raise ConfigError(
                "The AdditionalAndOrderDataBaseClass does not know anything about the table it should\
                               be working with.")

        utils.is_pan_or_profile_db(self.db_path)
        self.db_type = utils.get_db_type(self.db_path)
        self.db_version = utils.get_required_version_for_db(self.db_path)

        # data keys currently stored in the table this class works with
        database = db.DB(self.db_path, self.db_version)
        self.additional_data_keys = database.get_single_column_from_table(
            self.table_name, 'data_key')
        database.disconnect()

        Table.__init__(self, self.db_path, self.db_version, self.run,
                       self.progress)

        # per data type: the value that stands in for a missing one
        self.nulls_per_type = dict(str='',
                                   int=0,
                                   float=0,
                                   stackedbar=None,
                                   unknown=None)
Beispiel #27
0
    def update_db_self_table_values(self,
                                    taxonomy_was_run=False,
                                    database_version=None):
        """Record the state of SCG taxonomy in the meta values of the database.

        Two meta values are updated in the database at `self.db_path`:
        `scg_taxonomy_was_run` and `scg_taxonomy_database_version`.

        Parameters
        ==========
        taxonomy_was_run: bool, False
            Set True if taxonomy was run successfully.
        database_version: str, None
            This should be read from the ctx.target_database_release in taxonomyops.
        """

        required_version = utils.get_required_version_for_db(self.db_path)
        self.database = db.DB(self.db_path, required_version)

        new_meta_values = (
            ("scg_taxonomy_was_run", taxonomy_was_run),
            ("scg_taxonomy_database_version", database_version),
        )
        for meta_key, meta_value in new_meta_values:
            self.database.update_meta_value(meta_key, meta_value)

        self.database.disconnect()
Beispiel #28
0
    def add(self, data_dict, data_keys_list, skip_check_names=False):
        """Insert the contents of `data_dict` into `self.table_name`.

        Parameters
        ==========
        data_dict: dict
            `{item_name: {key: value}}`; one row is written per item/key pair.
        data_keys_list: list
            Keys to predict a data type for. A key that contains '!' is typed as
            "stackedbar"; the rest are typed by
            `utils.get_predicted_type_of_items_in_a_dict`.
        skip_check_names: bool, False
            Accepted, but not used by this method.
        """
        def predicted_type_name(key):
            if '!' in key:
                return "stackedbar"

            type_class = utils.get_predicted_type_of_items_in_a_dict(data_dict, key)
            return type_class.__name__ if type_class else None

        key_types = {key: predicted_type_name(key) for key in data_keys_list}

        self.set_next_available_id(self.table_name)

        # row layout: (entry id, item name, data key, data value, data type)
        db_entries = [(self.next_id(self.table_name), item_name, key, value, key_types[key])
                      for item_name, item_data in data_dict.items()
                      for key, value in item_data.items()]

        database = db.DB(self.db_path, None, ignore_version=True)
        database._exec_many('''INSERT INTO %s VALUES (?,?,?,?,?)''' % self.table_name, db_entries)
        database.disconnect()
Beispiel #29
0
    def populate_collections_dict(self, db_path):
        """Load the collections stored in `db_path` into `self.collections_dict`.

        Each entry of the collections info table is stored under its collection
        name, and is extended with `read_only`, `source_db_path` and
        `source_db_version`. `self.db_path` and `self.db_type` are updated as
        well.

        Parameters
        ==========
        db_path: str
            Path to a contigs, profile, or pan database.

        Raises
        ======
        ConfigError
            If the `db_type` stored in the database is not 'contigs', 'profile',
            or 'pan'.
        """
        filesnpaths.is_file_exists(db_path)
        self.db_path = db_path

        # looked up once and reused for every collection below
        db_version = dbops.get_required_version_for_db(db_path)

        database = db.DB(db_path, db_version)
        try:
            self.db_type = database.get_meta_value('db_type')
            collections_info_table = database.get_table_as_dict(t.collections_info_table_name)
        finally:
            # release the connection even if one of the reads above fails
            database.disconnect()

        if self.db_type not in ('contigs', 'profile', 'pan'):
            raise ConfigError('Collections class does not know about this "%s" database type :/' % self.db_type)

        # collections info must be read only if its coming from the contigs database.
        read_only = self.db_type == 'contigs'

        for collection_name, collection_info in collections_info_table.items():
            collection_info['read_only'] = read_only
            collection_info['source_db_path'] = db_path
            collection_info['source_db_version'] = db_version
            self.collections_dict[collection_name] = collection_info
Beispiel #30
0
    def __init__(self, db_path, num_threads_to_use=1, run=run, progress=progress, initializing_for_deletion=False, just_do_it=False,
                 hmm_program_to_use='hmmscan', hmmer_output_directory=None, get_domain_table_output=False, add_to_functions_table=False):
        self.num_threads_to_use = num_threads_to_use
        self.db_path = db_path
        self.just_do_it = just_do_it
        self.hmm_program = hmm_program_to_use or 'hmmscan'
        self.hmmer_output_dir = hmmer_output_directory
        self.hmmer_desired_output = ('table', 'domtable') if get_domain_table_output else 'table'
        self.add_to_functions_table = add_to_functions_table

        utils.is_contigs_db(self.db_path)
        filesnpaths.is_program_exists(self.hmm_program)

        self.contigs_db_hash = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path)).get_meta_value('contigs_db_hash')

        Table.__init__(self, self.db_path, anvio.__contigs__version__, run, progress)

        self.init_gene_calls_dict()

        if not len(self.gene_calls_dict):
            if self.genes_are_called:
                self.run.warning("Tables in this contigs database that should contain gene calls are empty despite the fact that "
                                 "you didn't skip the gene calling step while generating this contigs database. This probably means "
                                 "that the gene caller did not find any genes among contigs. This is OK for now. But might explode "
                                 "later. If it does explode and you decide to let us know about that problem, please remember to mention "
                                 "this warning. By the way, this warning probably has been seen by like only 2 people on the planet. Who "
                                 "works with contigs with no gene calls? A better implementation of anvi'o will unite researchers who "
                                 "study weird stuff.")
            else:
                self.run.warning("It seems you have skipped gene calling step while generating your contigs database, and you have no "
                                 "genes calls in tables that should contain gene calls. Anvi'o will let you go with this since some HMM "
                                 "sources only operate on DNA sequences, and at this point it doesn't know which HMMs you wish to run. "
                                 "If the lack of genes causes a problem, you will get another error message later probably :/")

        if not initializing_for_deletion:
            self.set_next_available_id(t.hmm_hits_table_name)