def get(self, split_name, sample_names=None):
    """Return per-sample coverage data for a single split.

    Parameters
    ----------
    split_name : str
        Name of the split; validated with `self.is_known_split`.
    sample_names : set, optional
        Samples to return. When empty or omitted, every sample stored
        under the split is returned.

    Returns
    -------
    dict
        Maps each sample name to whatever `self.get_integer_list` returns
        for '/data/coverages/<split_name>/<sample_name>'.

    Raises
    ------
    HDF5Error
        If `sample_names` is non-empty but not a `set`, or if it contains
        names that are not stored under the split.
    """
    self.is_known_split(split_name)

    if sample_names and not isinstance(sample_names, set):
        raise HDF5Error('The type of sample names must be a "set".')

    # let's learn what we have
    sample_names_in_db = list(self.fp['/data/coverages/%s' % split_name].keys())

    if sample_names:
        # a single pass is enough; sorting keeps the error message stable
        # across runs, since set iteration order is not
        known_samples = set(sample_names_in_db)
        missing_samples = sorted(name for name in sample_names if name not in known_samples)

        if missing_samples:
            raise HDF5Error("Some sample names you requested are missing from the auxiliary data file. "
                            "Here they are: '%s'" % (', '.join(missing_samples)))
    else:
        sample_names = sample_names_in_db

    return {
        sample_name: self.get_integer_list('/data/coverages/%s/%s' % (split_name, sample_name))
        for sample_name in sample_names
    }
def __init__(self, file_path, unique_hash, create_new=False, open_in_append_mode=False, ignore_hash=False):
    """Create a new HDF5 file, or open an existing one read-only and validate it.

    Reads `self.version` (and `self.db_type` for the version error message),
    so both must already be set on the instance before this runs.

    Parameters
    ----------
    file_path : str
        Path of the HDF5 file.
    unique_hash : str
        Hash stored in a new file, or expected in an existing one.
    create_new : bool
        When True the file is opened for writing ('w', or 'a' together with
        `open_in_append_mode`) and stamped with 'hash' and 'version'
        attributes. When False the file is opened read-only and validated.
    open_in_append_mode : bool
        Only valid together with `create_new`.
    ignore_hash : bool
        Skip the hash comparison when opening an existing file. Not allowed
        together with `create_new`.

    Raises
    ------
    HDF5Error
        On an invalid flag combination, a missing hash when creating, a
        version mismatch, or a hash mismatch.
    """
    self.file_path = file_path

    if open_in_append_mode and not create_new:
        raise HDF5Error("The 'open_in_append_mode' flag can only be used along with the flag 'create_new'.")

    if create_new:
        if ignore_hash:
            raise HDF5Error("When creating (or appending to) a database, you can't use the 'ignore_hash' flag.")

        if not unique_hash:
            raise HDF5Error("When creating (or appending to) a database, the 'unique_hash' cannot be None.")

        self.fp = h5py.File(self.file_path, 'a' if open_in_append_mode else 'w')
        self.fp.attrs['hash'] = unique_hash
        self.fp.attrs['version'] = self.version
    else:
        # presumably raises when the file is missing -- the helper is defined elsewhere
        filesnpaths.is_file_exists(self.file_path)
        self.fp = h5py.File(self.file_path, 'r')

        def read_attr(name):
            # attributes may come back as bytes or str; normalize to str
            value = self.fp.attrs[name]
            return value.decode('utf-8') if isinstance(value, bytes) else value

        fp_version = read_attr('version')
        fp_hash = read_attr('hash')

        if fp_version != self.version:
            # do not leave the handle open on a file we refuse to work with
            self.fp.close()
            # use the normalized `fp_version`: decoding the raw attribute here
            # would fail whenever it is already a str
            raise HDF5Error("The data file for %s ('%s') is at version '%s', however, your client is at "
                            "version '%s'. This is bad news, because your version of anvi'o can't work with "
                            "this file. You can regenerate the data file using the current version of anvi'o, "
                            "or look around to see whether there is an upgrade script is available (a good start "
                            "would be to type 'anvi-script-upgrade-' and then click TAB key twice). Otherwise you "
                            "may want to consider sending an e-mail to the anvi'o developers to find out what's up. "
                            "We heard that they love them some e-mails."
                            % (self.db_type, self.file_path, fp_version, self.version))

        if not ignore_hash and fp_hash != unique_hash:
            self.fp.close()
            raise HDF5Error("The database at '%s' does not seem to be compatible with the client :/ "
                            "(i.e., the hash values do not match)." % self.file_path)

        self.unique_hash = fp_hash
def check_sample_names(self, sample_names, split_name=None):
    """Validate requested sample names against `self.sample_names_in_db`.

    Parameters
    ----------
    sample_names : set or None
        Requested sample names. An empty or None value means "all samples".
    split_name : str, optional
        Unused here; kept so existing callers keep working.

    Returns
    -------
    The validated `sample_names` set, or `self.sample_names_in_db` when no
    names were requested.

    Raises
    ------
    HDF5Error
        If `sample_names` is non-empty but not a `set`, or if it contains
        names that are not in `self.sample_names_in_db`.
    """
    if not sample_names:
        return self.sample_names_in_db

    if not isinstance(sample_names, set):
        raise HDF5Error('The type of sample names must be a "set".')

    # sorting keeps the error message stable across runs (set order is not)
    missing_samples = sorted(name for name in sample_names if name not in self.sample_names_in_db)

    if missing_samples:
        raise HDF5Error("Some sample names you requested are missing from the auxiliary data file. "
                        "Here they are: '%s'" % (', '.join(missing_samples)))

    # hand back the whole validated collection, not a single loop variable
    return sample_names
def is_known_genome(self, genome_name, throw_exception=True):
    """Tell whether '/info/genomes/<genome_name>' exists in the storage.

    Parameters
    ----------
    genome_name : str
        Name of the genome to look up.
    throw_exception : bool
        When True (default), an unknown genome raises instead of
        returning False.

    Returns
    -------
    bool
        True when the genome is known, False when it is not and
        `throw_exception` is False.

    Raises
    ------
    HDF5Error
        If the genome is unknown and `throw_exception` is True.
    """
    if self.path_exists('/info/genomes/%s' % genome_name):
        # an explicit True lets callers use the result in a boolean test;
        # falling off the end would yield a falsy None for known genomes
        return True

    if throw_exception:
        raise HDF5Error('The database at "%s" does not know anything about "%s" :(' % (self.file_path, genome_name))

    return False
def add_genome(self, genome_name, info_dict):
    """Store the essential info entries of a genome under '/info/genomes/<genome_name>/'.

    Parameters
    ----------
    genome_name : str
        Name of the genome to add.
    info_dict : dict
        Must contain every key listed in `self.essential_genome_info`;
        other keys are ignored.

    Raises
    ------
    HDF5Error
        If '/info/genomes/<genome_name>' already exists.
    KeyError
        If `info_dict` lacks one of the essential keys.
    """
    # check the path directly so the duplicate guard does not depend on the
    # truthiness of whatever `is_known_genome` returns
    if self.path_exists('/info/genomes/%s' % genome_name):
        raise HDF5Error("Genome '%s' is already in this data storage :/" % genome_name)

    for key in self.essential_genome_info:
        value = info_dict[key]
        # store -1 in place of None so an absent value can still be written
        self.fp['/info/genomes/%s/%s' % (genome_name, key)] = -1 if value is None else value
def add_genome(self, genome_name, info_dict):
    """Store the essential info entries of a genome under '/info/genomes/<genome_name>/'.

    Parameters
    ----------
    genome_name : str
        Name of the genome to add.
    info_dict : dict
        Must contain every key listed in `self.essential_genome_info`;
        other keys are ignored. `None` values are stored as -1.

    Raises
    ------
    HDF5Error
        If '/info/genomes/<genome_name>' already exists.
    KeyError
        If `info_dict` lacks one of the essential keys.
    """
    # check the path directly so the duplicate guard does not depend on the
    # truthiness of whatever `is_known_genome` returns
    if self.path_exists('/info/genomes/%s' % genome_name):
        raise HDF5Error("Genome '%s' is already in this data storage :/" % genome_name)

    for key in self.essential_genome_info:
        value = info_dict[key]
        # the following line will add a -1 for any `key` that has the value of `None`. the reason
        # we added this was to be able to work with contigs databases without any hmm hits for SCGs
        # which is covered in https://github.com/merenlab/anvio/issues/573
        self.fp['/info/genomes/%s/%s' % (genome_name, key)] = value if value is not None else -1
def get_gene_functions(self, genome_name, gene_caller_id):
    """Return the functional annotations stored for one gene call.

    Parameters
    ----------
    genome_name : str
        Genome the gene belongs to.
    gene_caller_id : int
        Gene caller id; formatted with '%d' into the storage path.

    Returns
    -------
    dict
        Maps each annotation source found under
        '/data/genomes/<genome_name>/<gene_caller_id>/functions' to its
        stored value. Empty when the gene has no 'functions' entry.

    Raises
    ------
    HDF5Error
        If `self.functions_are_available` is falsy.
    """
    if not self.functions_are_available:
        raise HDF5Error("Functions are not available for this genome storage, and you are calling "
                        "GenomesStorage::get_gene_functions when you really shouldn't :/")

    gene_entry = self.fp['/data/genomes/%s/%d' % (genome_name, gene_caller_id)]

    if 'functions' not in gene_entry:
        # no sources provided any annotation for this poor gene
        return {}

    annotations = gene_entry['functions']

    # `dataset[()]` reads the full value; it replaces the `Dataset.value`
    # property, which h5py removed in 3.0
    return {source: annotations[source][()] for source in annotations}
def __init__(self, file_path, db_hash, genome_names_to_focus=None, create_new=False, ignore_hash=False, run=run, progress=progress, quiet=False):
    """Open or create a genomes storage and, for existing files, select the genomes to use.

    Parameters
    ----------
    file_path : str
        Path of the HDF5 file; forwarded to `HDF5_IO.__init__`.
    db_hash : str
        Storage hash; forwarded to `HDF5_IO.__init__`.
    genome_names_to_focus : iterable of str, optional
        When given (and the storage is not being created), only these
        genomes are used; all of them must exist in the storage.
    create_new : bool
        Forwarded to `HDF5_IO.__init__`. When True, none of the
        genome-name bookkeeping below is performed.
    ignore_hash : bool
        Forwarded to `HDF5_IO.__init__`.
    run, progress
        Reporting objects stored on the instance; `run.info` is used to
        print a short summary.
    quiet : bool
        Stored on the instance as `self.quiet`.

    Raises
    ------
    HDF5Error
        If any name in `genome_names_to_focus` is missing from the storage.
    """
    # these two are set first so the base initializer can find them on the instance
    self.db_type = 'genomes data storage'
    self.version = anvio.__genomes_storage_version__
    self.genome_names_to_focus = genome_names_to_focus

    HDF5_IO.__init__(self, file_path, db_hash, create_new=create_new, ignore_hash=ignore_hash)

    self.run = run
    self.progress = progress
    self.quiet = quiet

    self.essential_genome_info = constants.essential_genome_info + ['genome_hash', 'external_genome']

    # shortcuts: D looks up a genome's entry, G looks up a gene inside such an entry
    self.D = lambda genome_name: self.fp['/data/genomes/%s' % genome_name]
    self.G = lambda gene_callers_id, genome_data: genome_data['%d' % gene_callers_id]

    if not create_new:
        self.genome_names = self.get_genome_names_in_db()

        if self.genome_names_to_focus:
            genome_names_to_focus_missing_from_db = [g for g in self.genome_names_to_focus if g not in self.genome_names]

            # make sure the user knows what they're doing
            if genome_names_to_focus_missing_from_db:
                # take the example name from `self.genome_names`; fall back to
                # None when the storage holds no genomes at all
                example_genome_name_in_db = next(iter(self.genome_names), None)

                raise HDF5Error("%d of %d genome names you wanted to focus are missing from the genomes storage. "
                                "Although this may not be a show-stopper, anvi'o likes to be explicit, so here we "
                                "are. Not going anywhere until you fix this. For instance this is one of the missing "
                                "genome names: '%s', and this is one random genome name from the database: '%s'"
                                % (len(genome_names_to_focus_missing_from_db), len(self.genome_names_to_focus),
                                   genome_names_to_focus_missing_from_db[0], example_genome_name_in_db))

            self.genome_names = self.genome_names_to_focus

        self.num_genomes = len(self.genome_names)
        self.functions_are_available = self.fp.attrs['functions_are_available']

        self.run.info('Genomes storage', 'Initialized (storage hash: %s)' % (self.unique_hash))
        self.run.info('Num genomes in storage', len(self.get_genome_names_in_db()))
        self.run.info('Num genomes will be used', len(self.genome_names), mc='green')
def is_known_gene_call(self, genome_name, gene_caller_id):
    """Raise HDF5Error unless '/data/genomes/<genome_name>/<gene_caller_id>' exists.

    `gene_caller_id` is formatted with '%d'. Returns None when the path exists.
    """
    gene_path = '/data/genomes/%s/%d' % (genome_name, gene_caller_id)

    if self.path_exists(gene_path):
        return

    complaint = 'The genome "%s" does not know anything about the gene caller id "%d" :(' % (genome_name, gene_caller_id)
    raise HDF5Error(complaint)
def is_known_split(self, split_name):
    """Raise HDF5Error unless '/data/coverages/<split_name>' exists.

    Returns None when the path exists.
    """
    split_path = '/data/coverages/%s' % split_name

    if self.path_exists(split_path):
        return

    complaint = 'The database at "%s" does not know anything about "%s" :(' % (self.file_path, split_name)
    raise HDF5Error(complaint)