Example #1
0
def migrate(db_path):
    """Move nucleotide position info from the '.h5' file into the contigs db.

    Every dataset found under '/data/nt_position_info' in the HDF5 file that
    sits next to the contigs database is converted to a binary blob and
    written into a newly created table of the contigs database. The table
    'gene_protein_sequences' is then renamed to 'gene_amino_acid_sequences',
    the database version is set to `next_version`, and the HDF5 file is
    deleted.

    Parameters
    ----------
    db_path : str
        Path to the contigs database. The auxiliary file path is obtained by
        replacing the last three characters of this path with '.h5'.

    Raises
    ------
    ConfigError
        If `db_path` is None, if the database version differs from
        `current_version`, or if the auxiliary file does not exist.
    """

    utils.check_h5py_module()
    import h5py

    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version = True)
    if str(contigs_db.get_version()) != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    auxiliary_path = db_path[:-3] + '.h5'

    if not os.path.exists(auxiliary_path):
        raise ConfigError("%s, the target of this script does not seem to be where it should have been :/" % auxiliary_path)

    # the context manager releases the HDF5 handle even when the copy below
    # fails half way through
    with h5py.File(auxiliary_path, 'r') as fp:
        contigs_db.create_table(nt_position_info_table_name, nt_position_info_table_structure, nt_position_info_table_types)

        contig_names_in_db = list(fp['/data/nt_position_info'].keys())

        run.info("Auxiliary data file found", auxiliary_path)
        run.info("Contigs found", len(contig_names_in_db))

        progress.new('Processing the auxiliary data file')
        total = len(contig_names_in_db)

        entries = []
        for counter, contig_name in enumerate(contig_names_in_db, start=1):
            # `dataset[()]` reads the full dataset; it replaces the
            # deprecated `dataset.value` property
            position_info = fp['/data/nt_position_info/%s' % (contig_name)][()]
            entries.append((contig_name, convert_numpy_array_to_binary_blob(position_info),))

            progress.update('contig %d of %d ...' % (counter, total))

            # flush every 10 contigs so the buffer does not grow unbounded
            if counter % 10 == 0:
                progress.update("Writing buffer to the new database ...")
                contigs_db.insert_many(nt_position_info_table_name, entries=entries)
                entries = []

        # whatever is left in the buffer after the last full batch
        contigs_db.insert_many(nt_position_info_table_name, entries=entries)

        progress.end()

    # we also want to upgrade this table name, which was renamed within #654 re:
    # merenlab/pc_to_gene_cluster PR
    contigs_db._exec('ALTER TABLE gene_protein_sequences RENAME TO gene_amino_acid_sequences;')

    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)
    contigs_db.disconnect()

    # the data now lives in the contigs database, so the HDF5 file is obsolete
    os.remove(auxiliary_path)

    run.info_single("The contigs database is now %s, and the now-obsolete '.h5' file is gone forever "
                    "and ever." % (next_version), nl_after=1, nl_before=1, mc='green')
Example #2
0
def migrate(db_path):
    """Upgrade an HDF5 genome storage in place to `next_version`.

    For every gene caller id group under every genome in '/data/genomes/',
    the 'sequence' member is moved to 'aa_sequence' and a 'dna_sequence'
    member holding an empty string is added. The 'version' attribute of the
    file is then set to `next_version`.

    Parameters
    ----------
    db_path : str
        Path to the HDF5 genome storage file; it is opened in append mode.

    Raises
    ------
    ConfigError
        If `db_path` is None or the stored version differs from
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.check_h5py_module()
    import h5py

    # the context manager closes the file on every exit path, including an
    # exception raised while members are being moved
    with h5py.File(db_path, 'a') as fp:
        if int(fp.attrs['version']) != int(current_version):
            raise ConfigError("Genome storage version is not %s." %
                              current_version)

        progress.new('Upgrading genome storage')
        # snapshot the names up front: the loop adds and moves members, so it
        # should not walk live views of the file
        for genome in list(fp['/data/genomes/'].keys()):
            for genome_caller_id in list(fp['/data/genomes/%s/' % genome].keys()):
                fp.move(
                    '/data/genomes/%s/%s/sequence' % (genome, genome_caller_id),
                    '/data/genomes/%s/%s/aa_sequence' % (genome, genome_caller_id))
                fp['/data/genomes/%s/%s/dna_sequence' %
                   (genome, genome_caller_id)] = ''
                progress.update('Upgrading genome "%s" and gene caller id "%s"' %
                                (genome, genome_caller_id))
        progress.end()

        fp.attrs['version'] = next_version

    run.info_single(
        'Your pan db is now %s  (if this process seems to be stuck here, and you are not seeing new lines,\
                     you can kill this process by pressing CTRL + C once and things will likely continue just as expected\
                     --for some reason in some cases the process just hangs, and we have not been able to identify the\
                     problem).' % next_version,
        nl_after=1,
        nl_before=1,
        mc='green')
Example #3
0
#!/usr/bin/env python
# -*- coding: utf-8

import os
import sys
import gzip
import argparse

import anvio.db as db
import anvio.utils as utils
import anvio.terminal as terminal

from anvio.errors import ConfigError

utils.check_h5py_module()
import h5py

# version strings; presumably the database version this script upgrades
# from and the version it upgrades to -- confirm against the migrate function
current_version = "9"
next_version = "10"

# terminal helpers for status lines (`run`) and progress reporting (`progress`)
run = terminal.Run()
progress = terminal.Progress()

# name of the `nt_position_info` table, with what look like its column names
# and matching column types
nt_position_info_table_name = 'nt_position_info'
nt_position_info_table_structure = ['contig_name', 'position_info']
nt_position_info_table_types = ['str', 'blob']


def convert_numpy_array_to_binary_blob(array, compress=True):
    """Return the raw buffer of `array` as bytes, gzip-compressed by default.

    Parameters
    ----------
    array : buffer-like
        Any object that supports the buffer protocol (it is wrapped in a
        `memoryview`).
    compress : bool
        When True (the default) the buffer is gzip-compressed at compression
        level 1. When False the uncompressed bytes are returned.

    Returns
    -------
    bytes
        The (optionally compressed) contents of the buffer.
    """
    buffer = memoryview(array)

    if compress:
        return gzip.compress(buffer, compresslevel=1)

    # without this branch the function fell off the end and handed back None
    return buffer.tobytes()
Example #4
0
def migrate(db_path):
    """Upgrade a profile database and convert its HDF5 auxiliary data file.

    If 'AUXILIARY-DATA.h5' exists in the directory of the profile database,
    every dataset under '/data/coverages/<split>/<sample>' is converted to a
    binary blob and written into a new 'AUXILIARY-DATA.db' database in the
    same directory, after which the HDF5 file is deleted. Whether or not the
    auxiliary file exists, a new item additional data table is created in the
    profile database and its version is set to `next_version`.

    Parameters
    ----------
    db_path : str
        Path to the profile database.

    Raises
    ------
    ConfigError
        If `db_path` is None or the database version differs from
        `current_version`.
    """

    utils.check_h5py_module()
    import h5py

    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_profile_db(db_path)

    profile_db = db.DB(db_path, None, ignore_version=True)
    if str(profile_db.get_version()) != current_version:
        raise ConfigError(
            "Version of this profile database is not %s (hence, this script cannot really do anything)."
            % current_version)

    db_directory = os.path.dirname(db_path)
    auxiliary_path = os.path.join(db_directory, 'AUXILIARY-DATA.h5')
    new_auxiliary_path = os.path.join(db_directory, 'AUXILIARY-DATA.db')

    fully_upgraded = os.path.exists(auxiliary_path)

    if fully_upgraded:
        # the context manager releases the HDF5 handle even when the
        # conversion fails half way through
        with h5py.File(auxiliary_path, 'r') as fp:
            auxiliary_db = db.DB(new_auxiliary_path, '2', new_database=True)

            auxiliary_db.set_meta_value('db_type', 'auxiliary data for coverages')

            # the attribute may come back as bytes; store it as text
            contigs_db_hash = fp.attrs['hash']
            if isinstance(contigs_db_hash, bytes):
                contigs_db_hash = contigs_db_hash.decode('utf-8')
            auxiliary_db.set_meta_value('contigs_db_hash', contigs_db_hash)

            auxiliary_db.set_meta_value('creation_date', time.time())
            auxiliary_db.create_table(split_coverages_table_name,
                                      split_coverages_table_structure,
                                      split_coverages_table_types)
            auxiliary_db._exec(
                """CREATE INDEX IF NOT EXISTS covering_index ON %s(split_name, sample_name)"""
                % (split_coverages_table_name))

            # sample names are read from the first split only
            sample_names_in_db = set(
                list(fp['/data/coverages'].values())[0].keys())
            split_names_in_db = list(fp['/data/coverages'].keys())

            run.info("Auxiliary data file found", auxiliary_path)
            run.info("Splits found", len(split_names_in_db))
            run.info("Samples found", len(sample_names_in_db))
            run.info("New auxiliary data path", new_auxiliary_path)

            progress.new('Processing auxiliary')
            total = len(sample_names_in_db)

            entries = []
            for counter, sample_name in enumerate(sample_names_in_db, start=1):
                for split_name in split_names_in_db:
                    # `dataset[()]` reads the full dataset; it replaces the
                    # deprecated `dataset.value` property
                    coverage = fp['/data/coverages/%s/%s' %
                                  (split_name, sample_name)][()]
                    entries.append((
                        split_name,
                        sample_name,
                        convert_numpy_array_to_binary_blob(coverage),
                    ))

                progress.update('sample %d of %d ...' % (counter, total))

                # flush every 10 samples so the buffer does not grow unbounded
                if counter % 10 == 0:
                    progress.update("Writing buffer into a new database file ...")
                    auxiliary_db.insert_many(split_coverages_table_name,
                                             entries=entries)
                    entries = []

            # whatever is left in the buffer after the last full batch
            auxiliary_db.insert_many(split_coverages_table_name, entries=entries)

            progress.end()
            auxiliary_db.disconnect()

        os.remove(auxiliary_path)

    # we also added a totally new table to this version:
    profile_db.create_table(item_additional_data_table_name,
                            item_additional_data_table_structure,
                            item_additional_data_table_types)

    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)
    profile_db.disconnect()

    if fully_upgraded:
        run.info_single("Your profile db is now version %s. Anvi'o just created a new, up-to-date auxiliary data file (which ends with "
                        "extension .db), and deleted the old one (the one that ended with the extension .h5))" \
                                                            % next_version, nl_after=1, nl_before=1, mc='green')
    else:
        run.info_single("Your profile db is now version %s. BUT THERE WAS THIS: the actual purpose of this script was to upgrade your "
                        "AUXILIARY-DATA.h5 file, but it was not where it was supposed to be. Anvi'o upgraded your profile.db alone, "
                        "but as a consequence you will not be able to use its auxiliary data with this profile database. If you care "
                        "about it, you should find the old profile database, and upgrade it along with its auxiliary data" \
                                                            % next_version, nl_after=1, nl_before=1, mc='green')
Example #5
0
def migrate(db_path):
    """Convert an HDF5 genome storage into a new '.db' database file.

    Genome info (from '/info/genomes'), gene entries and function annotations
    (both from '/data/genomes') are read from the HDF5 file and inserted into
    three tables of a new database whose path is `db_path` with its last
    three characters replaced by '.db'. The HDF5 file is removed afterwards.

    Parameters
    ----------
    db_path : str
        Path to the HDF5 genome storage file.

    Raises
    ------
    ConfigError
        If `db_path` is None or the stored version differs from
        `current_version`.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.check_h5py_module()
    import h5py

    # the context manager makes sure the HDF5 handle is closed on every exit
    # path, and in particular before the file is removed below
    with h5py.File(db_path, 'r') as fp:
        if int(fp.attrs['version']) != int(current_version):
            raise ConfigError("Genome storage version is not %s." % current_version)

        old_storage_hash = str(fp.attrs['hash'])
        functions_are_available = fp.attrs['functions_are_available']

        run.info("Outdated genomes storage found (%s)" % old_storage_hash, db_path)

        genome_storage_db_path = db_path[:-3] + '.db'
        filesnpaths.is_output_file_writable(genome_storage_db_path, ok_if_exists=False)

        genomes_db = db.DB(genome_storage_db_path, next_version, new_database=True)
        genomes_db.create_table(genome_info_table_name, genome_info_table_structure, genome_info_table_types)
        genomes_db.create_table(gene_info_table_name, gene_info_table_structure, gene_info_table_types)
        genomes_db.create_table(genome_gene_function_calls_table_name, genome_gene_function_calls_table_structure, genome_gene_function_calls_table_types)
        genomes_db._exec("CREATE INDEX covering_index ON %s (gene_callers_id, genome_name);" % genome_gene_function_calls_table_name)

        genomes_db.set_meta_value('db_type', 'genomestorage')
        genomes_db.set_meta_value('creation_date', time.time())
        genomes_db.set_meta_value('hash', old_storage_hash)
        genomes_db.set_meta_value('functions_are_available', functions_are_available)

        genome_names = list(fp['/info/genomes'])

        progress.new('Bleep bloop')
        progress.update('Adding genomes')
        genome_info_entries = []
        for genome_name in genome_names:
            values = [genome_name]

            for column_name in genome_info_table_structure[1:]:
                # dirty workaround for backwards compatibility,
                # "percent_completion" may be "percent_complete" in some old genome storages,
                # because ozcan forgot to add that into upgrade script :(
                if column_name == 'percent_completion' and '/info/genomes/%s/percent_completion' % genome_name not in fp:
                    column_name = 'percent_complete'

                attr = fp['/info/genomes/%s/%s' % (genome_name, column_name)]

                # `dataset[()]` replaces the deprecated `dataset.value`;
                # int64 / float64 values are cast to plain Python numbers
                if attr.dtype == 'int64':
                    values.append(int(attr[()]))
                elif attr.dtype == 'float64':
                    values.append(float(attr[()]))
                else:
                    values.append(attr[()])

            genome_info_entries.append(tuple(values))
        genomes_db.insert_many(genome_info_table_name, entries=genome_info_entries)
        del genome_info_entries

        progress.update('Adding genes')
        gene_entries = []
        for genome_name in genome_names:
            for gene_callers_id in fp['/data/genomes/%s' % genome_name]:
                gene_path = '/data/genomes/%s/%s' % (genome_name, gene_callers_id)
                gene_entries.append((
                    genome_name,
                    gene_callers_id,
                    fp[gene_path + '/aa_sequence'][()],
                    fp[gene_path + '/dna_sequence'][()],
                    int(fp[gene_path + '/partial'][()]),
                    int(fp[gene_path + '/length'][()]),
                ))
        genomes_db.insert_many(gene_info_table_name, entries=gene_entries)
        del gene_entries

        progress.update('Adding functions')
        functions_entries = []
        entry_id_counter = 0
        for genome_name in genome_names:
            for gene_callers_id in fp['/data/genomes/%s' % genome_name]:
                functions_path = '/data/genomes/%s/%s/functions' % (genome_name, gene_callers_id)
                if functions_path not in fp:
                    continue

                for source in fp[functions_path]:
                    # each annotation is stored as a single '|||'-separated
                    # string; the first two fields are kept
                    annotation_list = str(fp['/data/genomes/%s/%s/functions/%s' % (genome_name, gene_callers_id, source)][()]).split('|||')

                    functions_entries.append((genome_name, entry_id_counter, gene_callers_id, source, annotation_list[0], annotation_list[1], 0, ))
                    entry_id_counter += 1
        genomes_db.insert_many(genome_gene_function_calls_table_name, entries=functions_entries)

    genomes_db.disconnect()

    progress.end()

    os.remove(db_path)

    run.info_single("Your genomes storage is now at version %s. The new on is at %s, and anvi'o just removed\
                     the old one, which was at %s from your disk." % (next_version, genome_storage_db_path, db_path), nl_after=1, nl_before=1, mc='green')