def migrate(db_path):
    """Upgrade a contigs database from the current version to the next.

    Moves the per-contig 'nt_position_info' arrays out of the companion
    auxiliary '.h5' file into a new table inside the contigs database
    itself, renames the 'gene_protein_sequences' table, bumps the db
    version, and deletes the now-obsolete '.h5' file.

    Raises ConfigError if db_path is None, if the database is not at the
    expected version, or if the auxiliary '.h5' file is missing.
    """
    utils.check_h5py_module()
    import h5py

    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_contigs_db(db_path)

    contigs_db = db.DB(db_path, None, ignore_version=True)
    if str(contigs_db.get_version()) != current_version:
        raise ConfigError("Version of this contigs database is not %s (hence, this script cannot really do anything)." % current_version)

    # the auxiliary file lives next to the database: same path with the
    # '.db' suffix swapped for '.h5' (the original wrapped the slice in a
    # pointless ''.join() -- db_path[:-3] is already a string)
    auxiliary_path = db_path[:-3] + '.h5'

    if not os.path.exists(auxiliary_path):
        raise ConfigError("%s, the target of this script does not seem to be where it should have been :/" % auxiliary_path)

    fp = h5py.File(auxiliary_path, 'r')

    contigs_db.create_table(nt_position_info_table_name, nt_position_info_table_structure, nt_position_info_table_types)

    contig_names_in_db = list(fp['/data/nt_position_info'].keys())

    run.info("Auxiliary data file found", auxiliary_path)
    run.info("Contigs found", len(contig_names_in_db))

    progress.new('Processing the auxiliary data file')
    counter, total = 0, len(contig_names_in_db)

    entries = []
    for contig_name in contig_names_in_db:
        entries.append((contig_name, convert_numpy_array_to_binary_blob(fp['/data/nt_position_info/%s' % (contig_name)].value),))
        counter += 1

        progress.update('contig %d of %d ...' % (counter, total))

        # flush the buffer every 10 contigs to keep memory use modest
        if counter % 10 == 0:
            progress.update("Writing buffer to the new database ...")
            contigs_db.insert_many(nt_position_info_table_name, entries=entries)
            entries = []

    # write whatever is left in the buffer
    contigs_db.insert_many(nt_position_info_table_name, entries=entries)
    progress.end()
    fp.close()

    # we also want to upgrade this table name, which was renamed within #654 re:
    # merenlab/pc_to_gene_cluster PR
    contigs_db._exec('ALTER TABLE gene_protein_sequences RENAME TO gene_amino_acid_sequences;')

    contigs_db.remove_meta_key_value_pair('version')
    contigs_db.set_version(next_version)
    contigs_db.disconnect()

    os.remove(auxiliary_path)

    run.info_single("The contigs database is now %s, and the now-obsolete '.h5' file is gone forever "
                    "and ever." % (next_version), nl_after=1, nl_before=1, mc='green')
def migrate(db_path):
    """Upgrade a genome storage ('.h5' file) in place.

    For every gene call in every genome, renames the 'sequence' dataset
    to 'aa_sequence' and creates an empty 'dna_sequence' placeholder,
    then bumps the 'version' attribute of the HDF5 file.

    Raises ConfigError if db_path is None or the storage is not at the
    expected version.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.check_h5py_module()
    import h5py

    fp = h5py.File(db_path, 'a')

    if int(fp.attrs['version']) != int(current_version):
        fp.close()
        raise ConfigError("Genome storage version is not %s." % current_version)

    progress.new('Upgrading genome storage')
    genomes = fp['/data/genomes/'].keys()

    for genome in genomes:
        gene_caller_ids = fp['/data/genomes/%s/' % genome].keys()

        # loop variable renamed from the misleading 'genome_caller_id' --
        # these are gene caller ids, not genome ids
        for gene_caller_id in gene_caller_ids:
            # amino acid sequences used to live under plain 'sequence'
            fp.move('/data/genomes/%s/%s/sequence' % (genome, gene_caller_id),
                    '/data/genomes/%s/%s/aa_sequence' % (genome, gene_caller_id))

            # older storages carried no DNA sequences; add an empty placeholder
            fp['/data/genomes/%s/%s/dna_sequence' % (genome, gene_caller_id)] = ''

            progress.update('Upgrading genome "%s" and gene caller id "%s"' % (genome, gene_caller_id))

    progress.end()

    fp.attrs['version'] = next_version
    fp.close()

    # NOTE: the original message said "Your pan db is now %s", but this script
    # upgrades a genome storage -- fixed to say what it actually did.
    run.info_single('Your genome storage is now %s (if this process seems to be stuck here, and you are not seeing new lines, '
                    'you can kill this process by pressing CTRL + C once and things will likely continue just as expected '
                    '--for some reason in some cases the process just hangs, and we have not been able to identify the '
                    'problem).' % next_version, nl_after=1, nl_before=1, mc='green')
#!/usr/bin/env python
# -*- coding: utf-8

import os
import sys
import gzip
import argparse

import anvio.db as db
import anvio.utils as utils
import anvio.terminal as terminal

from anvio.errors import ConfigError

utils.check_h5py_module()
import h5py

current_version = "9"
next_version = "10"

run = terminal.Run()
progress = terminal.Progress()

# schema for the new in-database nt position info table
nt_position_info_table_name = 'nt_position_info'
nt_position_info_table_structure = ['contig_name', 'position_info']
nt_position_info_table_types = ['str', 'blob']


def convert_numpy_array_to_binary_blob(array, compress=True):
    """Return the raw bytes of a numpy array as a blob for SQLite storage.

    When `compress` is True (the default) the bytes are gzip-compressed at
    level 1 (fast, modest ratio -- these blobs are written in bulk).
    Otherwise a zero-copy memoryview over the array's buffer is returned.

    FIX: the original body only handled the `compress=True` branch and
    silently returned None for `compress=False`; the uncompressed path now
    returns the array's buffer.
    """
    if compress:
        return gzip.compress(memoryview(array), compresslevel=1)
    else:
        return memoryview(array)
def migrate(db_path):
    """Upgrade a profile database and its companion AUXILIARY-DATA.h5 file.

    Converts the HDF5 auxiliary coverage data into a new SQLite
    'AUXILIARY-DATA.db' next to the profile database, adds the new item
    additional data table to the profile database, and bumps its version.
    If no '.h5' file is present, only the profile database is upgraded and
    the user is warned that the auxiliary data is lost.
    """
    utils.check_h5py_module()
    import h5py

    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.is_profile_db(db_path)

    profile_db = db.DB(db_path, None, ignore_version=True)
    if str(profile_db.get_version()) != current_version:
        raise ConfigError(
            "Version of this profile database is not %s (hence, this script cannot really do anything)." % current_version)

    # both the old ('.h5') and the new ('.db') auxiliary files live in the
    # same directory as the profile database
    auxiliary_path = os.path.join(os.path.dirname(db_path), 'AUXILIARY-DATA.h5')
    new_auxiliary_path = os.path.join(os.path.dirname(db_path), 'AUXILIARY-DATA.db')

    if os.path.exists(auxiliary_path):
        fp = h5py.File(auxiliary_path, 'r')

        # HDF5 attributes may come back as bytes; decode to str transparently
        G = lambda x: fp.attrs[x].decode('utf-8') if isinstance(fp.attrs[x], bytes) else fp.attrs[x]

        auxiliary_db = db.DB(new_auxiliary_path, '2', new_database=True)
        auxiliary_db.set_meta_value('db_type', 'auxiliary data for coverages')
        auxiliary_db.set_meta_value('contigs_db_hash', G('hash'))
        auxiliary_db.set_meta_value('creation_date', time.time())
        auxiliary_db.create_table(split_coverages_table_name, split_coverages_table_structure, split_coverages_table_types)
        auxiliary_db._exec("""CREATE INDEX IF NOT EXISTS covering_index ON %s(split_name, sample_name)""" % (split_coverages_table_name))

        # presumably every split carries the same set of samples, so sample
        # names are read from the first split entry only -- TODO confirm
        sample_names_in_db = set(list(list(fp['/data/coverages'].values())[0].keys()))
        split_names_in_db = list(fp['/data/coverages'].keys())

        run.info("Auxiliary data file found", auxiliary_path)
        run.info("Splits found", len(split_names_in_db))
        run.info("Samples found", len(sample_names_in_db))
        run.info("New auxiliary data path", new_auxiliary_path)

        progress.new('Processing auxiliary')
        counter, total = 0, len(sample_names_in_db)

        entries = []
        for sample_name in sample_names_in_db:
            for split_name in split_names_in_db:
                entries.append((
                    split_name,
                    sample_name,
                    convert_numpy_array_to_binary_blob(fp['/data/coverages/%s/%s' % (split_name, sample_name)].value),
                ))

            counter += 1
            progress.update('sample %d of %d ...' % (counter, total))

            # flush the buffer periodically to keep memory use bounded
            if counter % 10 == 0:
                progress.update("Writing buffer into a new database file ...")
                auxiliary_db.insert_many(split_coverages_table_name, entries=entries)
                entries = []

        # write whatever remains in the buffer
        auxiliary_db.insert_many(split_coverages_table_name, entries=entries)
        progress.end()
        auxiliary_db.disconnect()
        fp.close()

        os.remove(auxiliary_path)
        fully_upgraded = True
    else:
        fully_upgraded = False

    # we also added a totally new table to this version:
    profile_db.create_table(item_additional_data_table_name, item_additional_data_table_structure, item_additional_data_table_types)

    profile_db.remove_meta_key_value_pair('version')
    profile_db.set_version(next_version)
    profile_db.disconnect()

    if fully_upgraded:
        run.info_single("Your profile db is now version %s. Anvi'o just created a new, up-to-date auxiliary data file (which ends with "
                        "extension .db), and deleted the old one (the one that ended with the extension .h5))" \
                            % next_version, nl_after=1, nl_before=1, mc='green')
    else:
        run.info_single("Your profile db is now version %s. BUT THERE WAS THIS: the actual purpose of this script was to upgrade your "
                        "AUXILIARY-DATA.h5 file, but it was not where it was supposed to be. Anvi'o upgraded your profile.db alone, "
                        "but as a consequence you will not be able to use its auxiliary data with this profile database. If you care "
                        "about it, you should find the old profile database, and upgrade it along with its auxiliary data" \
                            % next_version, nl_after=1, nl_before=1, mc='green')
def migrate(db_path):
    """Convert an HDF5-based genomes storage ('.h5') into an SQLite one ('.db').

    Reads genome-level info, per-gene sequences, and gene function calls
    from the old HDF5 file, writes them into a brand new genomes storage
    database next to it, and removes the old file.

    Raises ConfigError if db_path is None or the storage is not at the
    expected version.
    """
    if db_path is None:
        raise ConfigError("No database path is given.")

    utils.check_h5py_module()
    import h5py

    fp = h5py.File(db_path, 'r')

    if int(fp.attrs['version']) != int(current_version):
        fp.close()
        raise ConfigError("Genome storage version is not %s." % current_version)

    old_storage_hash = str(fp.attrs['hash'])
    functions_are_available = fp.attrs['functions_are_available']

    run.info("Outdated genomes storage found (%s)" % old_storage_hash, db_path)

    # the new storage lives next to the old one: '.db' instead of '.h5'
    genome_storage_db_path = db_path[:-3] + '.db'
    filesnpaths.is_output_file_writable(genome_storage_db_path, ok_if_exists=False)

    genomes_db = db.DB(genome_storage_db_path, next_version, new_database=True)
    genomes_db.create_table(genome_info_table_name, genome_info_table_structure, genome_info_table_types)
    genomes_db.create_table(gene_info_table_name, gene_info_table_structure, gene_info_table_types)
    genomes_db.create_table(genome_gene_function_calls_table_name, genome_gene_function_calls_table_structure, genome_gene_function_calls_table_types)
    genomes_db._exec("CREATE INDEX covering_index ON %s (gene_callers_id, genome_name);" % genome_gene_function_calls_table_name)

    genomes_db.set_meta_value('db_type', 'genomestorage')
    genomes_db.set_meta_value('creation_date', time.time())
    genomes_db.set_meta_value('hash', old_storage_hash)
    genomes_db.set_meta_value('functions_are_available', functions_are_available)

    # shortcut into the per-genome info datasets of the HDF5 file
    I = lambda genome_name, key: fp['/info/genomes/%s/%s' % (genome_name, key)]

    genome_names = [d for d in fp['/info/genomes']]

    progress.new('Bleep bloop')
    progress.update('Adding genomes')
    genome_info_entries = []
    for genome_name in genome_names:
        values = (genome_name, )

        for column_name in genome_info_table_structure[1:]:
            # dirty workaround for backwards compatibility,
            # "percent_completion" may be "percent_complete" in some old genome storages,
            # because ozcan forgot to add that into upgrade script :(
            if column_name == 'percent_completion' and '/info/genomes/%s/percent_completion' % genome_name not in fp:
                column_name = 'percent_complete'

            attr = I(genome_name, column_name)

            # coerce numpy scalar datasets into native Python types
            if attr.dtype == 'int64':
                values += (int(attr.value), )
            elif attr.dtype == 'float64':
                values += (float(attr.value), )
            else:
                values += ((attr.value), )

        genome_info_entries.append(values)
    genomes_db.insert_many(genome_info_table_name, entries=genome_info_entries)
    del genome_info_entries

    progress.update('Adding genes')
    gene_entries = []
    for genome_name in genome_names:
        for gene_callers_id in fp['/data/genomes/%s' % genome_name]:
            G = lambda key: fp['/data/genomes/%s/%s/%s' % (genome_name, gene_callers_id, key)].value
            gene_entries.append((genome_name, gene_callers_id, G('aa_sequence'), G('dna_sequence'), int(G('partial')), int(G('length')), ))
    genomes_db.insert_many(gene_info_table_name, entries=gene_entries)
    del gene_entries

    progress.update('Adding functions')
    functions_entries = []
    entry_id_counter = 0
    for genome_name in genome_names:
        for gene_callers_id in fp['/data/genomes/%s' % genome_name]:
            functions_path = '/data/genomes/%s/%s/functions' % (genome_name, gene_callers_id)
            if functions_path in fp:
                for source in fp[functions_path]:
                    # annotations are stored as 'accession|||function' strings
                    annotation_list = str(fp['/data/genomes/%s/%s/functions/%s' % (genome_name, gene_callers_id, source)].value).split('|||')

                    functions_entries.append((genome_name, entry_id_counter, gene_callers_id, source, annotation_list[0], annotation_list[1], 0, ))
                    entry_id_counter += 1
    genomes_db.insert_many(genome_gene_function_calls_table_name, entries=functions_entries)

    genomes_db.disconnect()
    progress.end()

    os.remove(db_path)

    # FIX: typo in the original final message ("The new on is at")
    run.info_single("Your genomes storage is now at version %s. The new one is at %s, and anvi'o just removed "
                    "the old one, which was at %s from your disk." % (next_version, genome_storage_db_path, db_path),
                    nl_after=1, nl_before=1, mc='green')