def create_empty_dataset(shape, h5_group, name='nDIM_Data'):
    """
    Writes a zero-filled dataset of the requested shape into an HDF5 group.

    Parameters
    ----------
    shape : list
        Integers denoting the size of each dimension of the main dataset
    h5_group : h5py.Group
        HDF5 group that will receive the dataset
    name : str, optional. Default: "nDIM_Data"
        Name of the main HDF5 dataset

    Returns
    -------
    h5py.Dataset
        Object returned by ``write_nsid_dataset`` for the zero-filled data

    Raises
    ------
    ValueError
        If ``shape`` does not pass the ``contains_integers`` check
    TypeError
        If ``h5_group`` is not a h5py.Group object
    """
    shape_ok = contains_integers(shape)
    if not shape_ok:
        raise ValueError('dimensions of shape need to be all integers')

    group_ok = isinstance(h5_group, h5py.Group)
    if not group_ok:
        raise TypeError('h5_group should be a h5py.Group object')

    zeros = np.zeros(shape)
    blank_dataset = Dataset.from_array(zeros)
    return write_nsid_dataset(blank_dataset, h5_group, name)
def validate_dims_against_main(main_shape, dims, is_spectroscopic=True):
    """
    Checks Dimension objects against a given shape for main datasets.
    Errors in parameters will result in Exceptions

    Parameters
    ----------
    main_shape : array-like
        Tuple or list with the shape of the main data. Must hold exactly two
        integers that are each >= 1
    dims : iterable
        List of Dimension objects. A single Dimension object is also accepted
    is_spectroscopic : bool, Optional. Default = True
        set to True if ``dims`` correspond to Spectroscopic Dimensions.
        False otherwise.

    Raises
    ------
    TypeError
        If ``main_shape`` is not a list / tuple, or ``dims`` is not a
        Dimension or a list / tuple made up solely of Dimension objects
    ValueError
        If ``main_shape`` is not two integers >= 1, or the product of the
        lengths of the dimensions does not match the relevant axis of
        ``main_shape``
    """
    if not isinstance(main_shape, (list, tuple)):
        raise TypeError('main_shape should be a list or tuple. Provided object'
                        ' was of type: {}'.format(type(main_shape)))
    if len(main_shape) != 2:
        raise ValueError('"main_shape" should be of length 2')
    # The outcome of this check used to be discarded, which let non-integer
    # or non-positive shapes slip through unnoticed.
    if not contains_integers(main_shape, min_val=1):
        raise ValueError('"main_shape" should contain integers >= 1. '
                         'Provided: {}'.format(main_shape))

    if isinstance(dims, Dimension):
        dims = [dims]
    elif not isinstance(dims, (list, tuple)):
        raise TypeError('"dims" must be a list or tuple of usid.Dimension '
                        'objects. Provided object was of type: {}'
                        ''.format(type(dims)))
    if not all(isinstance(obj, Dimension) for obj in dims):
        raise TypeError('One or more objects in "dims" was not usid.Dimension')

    if is_spectroscopic:
        main_dim = 1
        dim_category = 'Spectroscopic'
    else:
        main_dim = 0
        dim_category = 'Position'

    # TODO: This is where the dimension type will need to be taken into account
    lhs = main_shape[main_dim]
    # np.product was an alias of np.prod that has been removed in NumPy 2.0
    rhs = np.prod([len(x.values) for x in dims])
    if lhs != rhs:
        raise ValueError(dim_category +
                         ' dimensions in main data of size: {} do not match '
                         'with product of values in provided Dimension objects'
                         ': {}'.format(lhs, rhs))
def make_indices_matrix(num_steps, is_position=True):
    """
    Builds an ancillary indices matrix from the number of steps in each
    dimension. Every row of the (un-transposed) matrix is one unique
    combination of the indices of the provided dimensions.

    Parameters
    ------------
    num_steps : List / numpy array
        Number of steps in each spatial or spectral dimension
        Note that the axes must be ordered from fastest varying to slowest
        varying
    is_position : bool, optional, default = True
        Whether the returned matrix is meant for position (True) indices
        (tall and skinny) or spectroscopic (False) indices (short and wide)

    Returns
    --------------
    indices_matrix : 2D numpy array of dtype ``INDICES_DTYPE``
        arranged as [steps, spatial dimension]; transposed when
        ``is_position`` is False

    Raises
    ------
    TypeError
        If ``num_steps`` is not a list / tuple / numpy array
    ValueError
        If ``num_steps`` does not pass the ``contains_integers`` check
    """
    if not isinstance(num_steps, (tuple, list, np.ndarray)):
        raise TypeError('num_steps should be a list / tuple / numpy array')

    smallest_allowed = 1 + int(len(num_steps) > 0)
    if not contains_integers(num_steps, min_val=smallest_allowed):
        raise ValueError(
            'num_steps should contain integers greater than equal to 1 (empty dimension) or 2'
        )

    num_steps = np.array(num_steps)
    # Only dimensions with more than one step get their own column; always
    # keep at least one column
    non_trivial = np.where(num_steps > 1)[0]
    spat_dims = max(1, len(non_trivial))

    indices_matrix = np.zeros(shape=(np.prod(num_steps), spat_dims),
                              dtype=INDICES_DTYPE)

    total_dims = len(num_steps)
    column = 0
    for position, steps_here in enumerate(num_steps):
        if not steps_here > 1:
            continue

        # Length of one full cycle of this dimension, including faster ones
        cycle_len = np.prod(num_steps[:position + 1])
        # Number of consecutive rows over which this index stays constant
        hold_len = np.prod(num_steps[:position]) if position > 0 else 1
        # Number of times the cycle repeats because of slower dimensions
        if position + 1 == total_dims:
            repeats = 1
        else:
            repeats = np.prod(num_steps[position + 1:])

        one_cycle = np.floor(np.arange(cycle_len) / hold_len)
        indices_matrix[:, column] = np.tile(one_cycle, repeats)
        column += 1

    return indices_matrix if is_position else indices_matrix.T
def get_dimensionality(ds_index, index_sort=None):
    """
    Get the size of each index dimension in a specified sort order

    Parameters
    ----------
    ds_index : 2D HDF5 Dataset or numpy array
        Row matrix of indices. Dask arrays are computed first. A matrix with
        more rows than columns is transposed, and a 1D array is treated as a
        matrix with a single row
    index_sort : Iterable of unsigned integers (Optional)
        Sort that can be applied to dimensionality.
        For example - Order of rows sorted from fastest to slowest

    Returns
    -------
    sorted_dims : list of unsigned integers
        Dimensionality (number of unique values) of each row in ds_index.
        If index_sort is supplied, it will be in the sorted order

    Raises
    ------
    TypeError
        If ``ds_index`` is not a numpy array, dask array or h5py.Dataset
    ValueError
        If ``index_sort`` is not a 1D permutation of the row numbers of
        ``ds_index``
    """
    if isinstance(ds_index, da.core.Array):
        ds_index = ds_index.compute()
    if not isinstance(ds_index, (np.ndarray, h5py.Dataset)):
        raise TypeError(
            'ds_index should either be a numpy array or h5py.Dataset')

    # Promote to (at least) 2D up front so that 1D inputs do not crash on
    # the shape[1] lookup below
    ds_index = np.array(ds_index, ndmin=2)

    if ds_index.shape[0] > ds_index.shape[1]:
        # must be spectroscopic like in shape (few rows, more cols)
        ds_index = np.transpose(ds_index)

    num_dims = ds_index.shape[0]

    if index_sort is None:
        index_sort = np.arange(num_dims)
    else:
        if not contains_integers(index_sort, min_val=0):
            raise ValueError('index_sort should contain integers >= 0')
        index_sort = np.array(index_sort)
        if index_sort.ndim != 1:
            raise ValueError('index_sort should be a 1D array')
        num_unique = len(np.unique(index_sort))
        if num_unique > num_dims:
            raise ValueError(
                'length of index_sort ({}) should be smaller than number of dimensions in provided dataset'
                ' ({})'.format(num_unique, num_dims))
        if set(np.arange(num_dims)) != set(index_sort):
            raise ValueError(
                'Sort order of dimensions ({}) not matching with number of dimensions ({})'
                ''.format(index_sort, num_dims))

    sorted_dims = [len(np.unique(row)) for row in ds_index[index_sort]]
    return sorted_dims
def create_empty_dataset(shape, h5_group, name='nDIM_Data'):
    """
    Writes a NSID dataset filled with zeros of the requested shape.

    :param shape: list of integers, one per dimension of the dataset
    :param h5_group: h5py.Group that will receive the dataset
    :param name: -optional- name of NSID dataset
    :return: object returned by ``write_nsid_dataset`` for the zero-filled data
    :raises ValueError: if ``shape`` fails the ``contains_integers`` check
    :raises TypeError: if ``h5_group`` is not a h5py.Group object
    """
    # Reject bad arguments before anything is allocated or written
    if not contains_integers(shape):
        raise ValueError('dimensions of shape need to be all integers')
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group object')

    placeholder = Dataset.from_array(np.zeros(shape))
    return write_nsid_dataset(placeholder, h5_group, name)
def validate_anc_h5_dsets(h5_inds, h5_vals, main_shape, is_spectroscopic=True):
    """
    Verifies that a pair of ancillary HDF5 datasets is compatible with the
    shape of a main dataset. Any problem results in an Exception

    Parameters
    ----------
    h5_inds : h5py.Dataset
        HDF5 dataset corresponding to the ancillary Indices dataset
    h5_vals : h5py.Dataset
        HDF5 dataset corresponding to the ancillary Values dataset
    main_shape : array-like
        Shape of the main dataset expressed as a list or tuple of two
        integers >= 1
    is_spectroscopic : bool, Optional. Default = True
        set to True if the datasets correspond to Spectroscopic Dimensions.
        False otherwise. Also used as the axis (1 or 0) that is compared
        between ``h5_inds`` and ``main_shape``

    Raises
    ------
    TypeError
        If either dataset is not a h5py.Dataset or ``main_shape`` is not a
        list / tuple
    ValueError
        If the two datasets differ in shape, ``main_shape`` is not valid, or
        the compared axis differs in size
    """
    if not isinstance(h5_inds, h5py.Dataset):
        raise TypeError('h5_inds must be a h5py.Dataset object')
    if not isinstance(h5_vals, h5py.Dataset):
        raise TypeError('h5_vals must be a h5py.Dataset object')

    inds_shape = h5_inds.shape
    vals_shape = h5_vals.shape
    if inds_shape != vals_shape:
        raise ValueError('h5_inds: {} and h5_vals: {} should be of the same '
                         'shape'.format(inds_shape, vals_shape))

    if not isinstance(main_shape, (list, tuple)):
        raise TypeError('main_shape should be of the following types:'
                        'h5py.Dataset, tuple, or list. {} provided'
                        ''.format(type(main_shape)))
    if not (contains_integers(main_shape, min_val=1)
            and len(main_shape) == 2):
        raise ValueError("'main_shape' must be a valid HDF5 dataset shape")

    # The boolean flag doubles as the axis to compare: False -> 0, True -> 1
    axis = is_spectroscopic
    if h5_inds.shape[axis] != main_shape[axis]:
        raise ValueError('index {} in shape of h5_inds: {} and main_data: {} '
                         'should be equal'.format(int(axis), h5_inds.shape,
                                                  main_shape))
def translate(self, image_path, h5_path=None, bin_factor=None,
              interp_func=Image.BICUBIC, normalize=False, **image_args):
    """
    Translates the image in the provided file into a USID HDF5 file

    Parameters
    ----------------
    image_path : str
        Path to the image; resolved via ``self._parse_file_path``
    h5_path : str, optional
        Absolute path to where the HDF5 file should be located.
        Default is None
    bin_factor : uint or array-like of uint, optional
        Down-sampling factor for each dimension.  Default is None.
        If specifying different binning for each dimension, please specify
        as (height binning, width binning). Every factor must be >= 1
    interp_func : int, optional. Default = :attr:`PIL.Image.BICUBIC`
        How the image will be interpolated to provide the down-sampled or
        binned image. For more information see instructions for the
        `resample` argument for :meth:`PIL.Image.resize`.
        Only validated and used when ``bin_factor`` is provided
    normalize : boolean, optional. Default = False
        Should the raw image be normalized between the values of 0 and 1
    image_args : dict
        Arguments to be passed to read_image.  Arguments depend on the type
        of image.

    Returns
    ----------
    h5_path : str
        Path to the HDF5 file, as returned by the ``translate`` method of
        the parent class

    Raises
    ------
    TypeError
        If ``bin_factor`` is neither an integer nor a list / tuple of
        positive integers
    ValueError
        If ``bin_factor`` does not have exactly two entries, contains a
        factor smaller than 1, or ``interp_func`` is not one of the
        supported PIL resampling modes
    """
    image_path, h5_path = self._parse_file_path(image_path, h5_path=h5_path)

    image = read_image(image_path, **image_args)
    image_parms = dict()
    usize, vsize = image.shape[:2]

    # Check if a bin_factor is given.  Set up binning objects if it is.
    if bin_factor is not None:
        if isinstance(bin_factor, (list, tuple)):
            if not contains_integers(bin_factor, min_val=1):
                raise TypeError(
                    'bin_factor should contain positive whole integers')
            if len(bin_factor) == 2:
                bin_factor = tuple(bin_factor)
            else:
                raise ValueError(
                    'Input parameter `bin_factor` must be a length 2 array-like or an integer.\n'
                    + '{} was given.'.format(bin_factor))
        elif isinstance(bin_factor, int):
            bin_factor = (bin_factor, bin_factor)
        else:
            raise TypeError(
                'bin_factor should either be an integer or an iterable of positive integers'
            )

        # A factor of 0 must be rejected too - it would otherwise cause a
        # division by zero when the binned size is computed below
        if np.min(bin_factor) < 1:
            raise ValueError('bin_factor must consist of positive factors')

        if interp_func not in [
                Image.NEAREST, Image.BILINEAR, Image.BICUBIC, Image.LANCZOS
        ]:
            raise ValueError(
                "'interp_func' argument for ImageTranslator.translate must be one of "
                "PIL.Image.NEAREST, PIL.Image.BILINEAR, PIL.Image.BICUBIC, PIL.Image.LANCZOS"
            )

        image_parms.update({
            'image_binning_size': bin_factor,
            'image_PIL_resample_mode': interp_func
        })
        usize = int(usize // bin_factor[0])
        vsize = int(vsize // bin_factor[1])

        # Unfortunately, we need to make a round-trip through PIL for the
        # interpolation. Not possible with numpy
        img_obj = Image.fromarray(image)
        # PIL expects the size as (width, height)
        img_obj = img_obj.resize((vsize, usize), resample=interp_func)
        image = np.asarray(img_obj)

    # Working around occasional "cannot modify read-only array" error
    image = image.copy()

    # Normalize Raw Image
    if normalize:
        image -= np.min(image)
        image = image / np.float32(np.max(image))

    # NOTE(review): min / max are recorded after the optional normalization,
    # so they describe the data that is written - confirm this is intended
    image_parms.update({
        'normalized': normalize,
        'image_min': np.min(image),
        'image_max': np.max(image)
    })

    # Enable the line below if there is a need make the image "look" the
    # right side up. This would be manipulation of the original data.
    # Therefore it remains commented
    # image = np.flipud(image)

    # Ready to write to h5
    pos_dims = [
        Dimension('Y', 'a.u.', np.arange(usize)),
        Dimension('X', 'a.u.', np.arange(vsize))
    ]
    spec_dims = Dimension('arb', 'a.u.', 1)

    # Need to transpose to for correct reshaping
    image = image.transpose()

    h5_path = super(ImageTranslator, self).translate(
        h5_path, 'Raw_Data', image.reshape((-1, 1)), 'Intensity', 'a.u.',
        pos_dims, spec_dims, translator_name='ImageTranslator',
        parm_dict=image_parms)

    with h5py.File(h5_path, mode='r+') as h5_f:
        # For legacy reasons:
        write_simple_attrs(h5_f, {'data_type': 'ImageData'})

    return h5_path
def read(self, bin_factor=None, interp_func=Image.BICUBIC, normalize=False,
         **image_args):
    """
    Reads the image at ``self._input_file_path`` and returns it as a Dataset
    object, optionally binned and / or normalized

    Parameters
    ----------------
    bin_factor : uint or array-like of uint, optional
        Down-sampling factor for each dimension.  Default is None.
        If specifying different binning for each dimension, please specify
        as (height binning, width binning). Every factor must be >= 1
    interp_func : int, optional. Default = :attr:`PIL.Image.BICUBIC`
        How the image will be interpolated to provide the down-sampled or
        binned image. For more information see instructions for the
        `resample` argument for :meth:`PIL.Image.resize`.
        Only validated and used when ``bin_factor`` is provided
    normalize : boolean, optional. Default = False
        Should the raw image be normalized between the values of 0 and 1
    image_args : dict
        Arguments to be passed to read_image.  Arguments depend on the type
        of image.

    Returns
    ----------
    data_set : Dataset
        Object built with ``Dataset.from_array`` from the image, with
        ``data_type`` set to 'image' and two spatial dimensions 'y' and 'x'

    Raises
    ------
    TypeError
        If ``bin_factor`` is neither an integer nor a list / tuple of
        positive integers
    ValueError
        If ``bin_factor`` does not have exactly two entries, contains a
        factor smaller than 1, or ``interp_func`` is not one of the
        supported PIL resampling modes
    """
    image_path = self._parse_file_path(self._input_file_path)

    image = read_image(image_path, **image_args)
    image_parms = dict()
    usize, vsize = image.shape[:2]

    # Check if a bin_factor is given.  Set up binning objects if it is.
    if bin_factor is not None:
        if isinstance(bin_factor, (list, tuple)):
            if not contains_integers(bin_factor, min_val=1):
                raise TypeError(
                    'bin_factor should contain positive whole integers')
            if len(bin_factor) == 2:
                bin_factor = tuple(bin_factor)
            else:
                raise ValueError(
                    'Input parameter `bin_factor` must be a length 2 array-like or an integer.\n'
                    + '{} was given.'.format(bin_factor))
        elif isinstance(bin_factor, int):
            bin_factor = (bin_factor, bin_factor)
        else:
            raise TypeError(
                'bin_factor should either be an integer or an iterable of positive integers'
            )

        # A factor of 0 must be rejected too - it would otherwise cause a
        # division by zero when the binned size is computed below
        if np.min(bin_factor) < 1:
            raise ValueError('bin_factor must consist of positive factors')

        if interp_func not in [
                Image.NEAREST, Image.BILINEAR, Image.BICUBIC, Image.LANCZOS
        ]:
            raise ValueError(
                "'interp_func' argument for ImageTranslator.translate must be one of "
                "PIL.Image.NEAREST, PIL.Image.BILINEAR, PIL.Image.BICUBIC, PIL.Image.LANCZOS"
            )

        image_parms.update({
            'image_binning_size': bin_factor,
            'image_PIL_resample_mode': interp_func
        })
        usize = int(usize // bin_factor[0])
        vsize = int(vsize // bin_factor[1])

        # Unfortunately, we need to make a round-trip through PIL for the
        # interpolation. Not possible with numpy
        img_obj = Image.fromarray(image)
        # PIL expects the size as (width, height)
        img_obj = img_obj.resize((vsize, usize), resample=interp_func)
        image = np.asarray(img_obj)

    # Working around occasional "cannot modify read-only array" error
    image = image.copy()

    # Normalize Raw Image
    if normalize:
        image -= np.min(image)
        image = image / np.float32(np.max(image))

    # NOTE(review): image_parms is assembled but never attached to the
    # returned object - presumably meant to become metadata; confirm
    image_parms.update({
        'normalized': normalize,
        'image_min': np.min(image),
        'image_max': np.max(image)
    })

    data_set = Dataset.from_array(image, name='random')
    data_set.data_type = 'image'
    data_set.units = 'a. u.'
    data_set.quantity = 'Intensity'

    data_set.set_dimension(
        0,
        Dimension('y', np.arange(usize), units='a. u.', quantity='Length',
                  dimension_type='spatial'))
    data_set.set_dimension(
        1,
        Dimension('x', np.arange(vsize), units='a. u.', quantity='Length',
                  dimension_type='spatial'))

    return data_set
def write_main_dataset(h5_parent_group, main_data, main_data_name,
                       quantity, units, data_type, modality, source,
                       dim_dict, main_dset_attrs=None, verbose=False,
                       slow_to_fast=False, **kwargs):
    """
    #TODO: Suhas to think about this a lot more
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    The instructions for generating the dimensions should be provided as a
    dictionary containing pyNSID Dimension objects or existing HDF5 datasets.
    Existing dimension datasets can be shared with other main datasets; in
    this case, fresh datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        N-dimensional data or a list / tuple with the shape for an empty
        dataset. If creating an empty dataset - the dtype must be specified
        via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-'
        character; any '-' is replaced by '_'.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset.
        Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset.
        Example - 'A' for amperes
    data_type : String / Unicode
        What kind of data this is. Example - image, image stack, video,
        hyperspectral image, etc.
    modality : String / Unicode
        Experimental / simulation modality - scientific meaning of data.
        Example - photograph, TEM micrograph, SPM Force-Distance
        spectroscopy.
    source : String / Unicode
        Source for dataset like the kind of instrument.
    dim_dict : dict
        Maps the integer index of every dimension of the main data to a
        Dimension object or h5py.Dataset. The keys must be exactly
        0, 1, ..., N-1 for N-dimensional data.
        E.g. {0: position_X, 1: position_Y, 2: spectra}
    main_dset_attrs : dictionary, Optional, default = None
        flat dictionary of data to be added to the dataset,
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default=False
        Not used by this function at present
    kwargs will be passed onto the creation of the dataset. Please pass
    chunking, compression, dtype, and other arguments this way

    Returns
    -------
    h5_main : NSIDataset
        Object returned by ``link_as_main`` for the main dataset. ``None``
        is returned (after printing a message) when
        ``validate_main_dimensions`` reports a problem or when a dataset
        named ``main_data_name`` already exists in ``h5_parent_group``

    Raises
    ------
    TypeError
        If ``h5_parent_group``, ``main_data`` or any entry of ``dim_dict``
        is of an unsupported type
    ValueError
        If the file is not editable, the shape of an empty dataset is
        invalid, dtype is missing for an empty dataset, or the number of
        dimensions does not match the main data
    KeyError
        If the keys of ``dim_dict`` are not the integers 0 ... N-1
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    #####################
    # Validate Main Data
    #####################
    quantity, units, main_data_name, data_type, modality, source = \
        validate_string_args(
            [quantity, units, main_data_name, data_type, modality, source],
            ['quantity', 'units', 'main_data_name', 'data_type', 'modality',
             'source'])

    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) < 1:
            raise ValueError(
                'main_data if specified as a shape should contain at least 1 number for the singular dimension'
            )
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    ######################
    # Validate Dimensions
    ######################
    # An N dimensional dataset should have N items in the dimension dictionary
    if len(dim_dict) != len(main_shape):
        raise ValueError(
            'Incorrect number of dimensions: {} provided to support main data, of shape: {}'
            .format(len(dim_dict), main_shape))
    if set(range(len(main_shape))) != set(dim_dict.keys()):
        raise KeyError(
            'Keys of dim_dict should be the integers 0 to {}. Provided keys: {}'
            ''.format(len(main_shape) - 1, list(dim_dict.keys())))

    if False in validate_main_dimensions(main_shape, dim_dict,
                                         h5_parent_group):
        print('Dimensions incorrect')
        return

    # Reject unsupported dimension objects before anything is written.
    # Previously such an entry was only reported via print and then either
    # crashed on an unbound variable or silently re-linked the dataset of the
    # preceding dimension.
    for dim_key, this_dim in dim_dict.items():
        if not isinstance(this_dim, (h5py.Dataset, Dimension)):
            raise TypeError(
                'Dimension {} in dim_dict should be a h5py.Dataset or '
                'Dimension object. Provided object was of type: {}'
                ''.format(dim_key, type(this_dim)))

    if verbose:
        print('Dimensions are correct!')

    #####################
    # Write Main Dataset
    ####################
    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened wth the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if main_data_name in h5_parent_group:
        print('Oops, dataset exits')
        # del h5_parent_group[main_data_name]
        return

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}

    for i, this_dim in dim_dict.items():
        if isinstance(this_dim, h5py.Dataset):
            # Re-use the existing dataset; only stamp the version if missing
            this_dim_dset = this_dim
            if 'nsid_version' not in this_dim_dset.attrs:
                this_dim_dset.attrs['nsid_version'] = '0.0.1'
        else:
            # Guaranteed to be a Dimension by the check made before writing
            this_dim_dset = h5_parent_group.create_dataset(
                this_dim.name, data=this_dim.values)
            attrs_to_write = {
                'name': this_dim.name,
                'units': this_dim.units,
                'quantity': this_dim.quantity,
                'dimension_type': this_dim.dimension_type,
                'nsid_version': '0.0.1'
            }
            write_simple_attrs(this_dim_dset, attrs_to_write)

        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': quantity,
        'units': units,
        'nsid_version': '0.0.1',
        'main_data_name': main_data_name,
        'data_type': data_type,
        'modality': modality,
        'source': source
    }

    write_simple_attrs(h5_main, attrs_to_write)

    if verbose:
        print('Wrote dimensions and attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    # ToDo: check if we need write_book_keeping_attrs(h5_main)
    NSID_data_main = link_as_main(h5_main, dimensional_dict)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    return NSID_data_main  # NSIDataset(h5_main)
def write_main_dataset(h5_parent_group, main_data, main_data_name, quantity,
                       units, pos_dims, spec_dims, main_dset_attrs=None,
                       h5_pos_inds=None, h5_pos_vals=None, h5_spec_inds=None,
                       h5_spec_vals=None, aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_', verbose=False,
                       slow_to_fast=False, **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should
    be specified using the pos_dims and spec_dims arguments as Dimension
    objects. Alternatively, if both the indices and values datasets are
    already available for either/or the positions / spectroscopic, they can be
    specified using the keyword arguments. In this case, fresh datasets will
    not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with
        the shape for an empty dataset. If creating an empty dataset - the
        dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-'
        character; any '-' is replaced by '_'.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset.
        Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset.
        Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary
        instructions for building the Position indices and values datasets.
        Ignored when both ``h5_pos_inds`` and ``h5_pos_vals`` are provided
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary
        instructions for building the Spectroscopic indices and values
        datasets. Ignored when both ``h5_spec_inds`` and ``h5_spec_vals``
        are provided
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset.
        Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets.
        Default = "Spectroscopic_"
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = "Position_"
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to
        fastest varying. Set to False otherwise.
    kwargs will be passed onto the creation of the dataset. Please pass
    chunking, compression, dtype, and other arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset

    Raises
    ------
    TypeError
        If ``h5_parent_group`` or ``main_data`` is of an unsupported type
    ValueError
        If the file is not editable, ``main_data`` is not 2D, dtype is
        missing for an empty dataset, or provided ancillary datasets live
        in different HDF5 files
    KeyError
        If ancillary datasets need to be created but datasets of the same
        name already exist in ``h5_parent_group``
    """

    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        # Cleans up the prefix and makes sure that the ancillary datasets
        # about to be created do not already exist in the parent group
        aux_prefix = validate_single_string_arg(aux_prefix,
                                                'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn(
                'aux_' + dim_type +
                ' should not contain the "-" character. Reformatted name from:{} to '
                '{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
        aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                # TODO: What if the contained data was correct?
                raise KeyError(
                    'Dataset named: ' + dset_name +
                    ' already exists in group: '
                    '{}. Consider passing these datasets using kwargs (if they are correct) instead of providing the pos_dims and spec_dims arguments'
                    .format(h5_parent_group.name))
        return aux_prefix

    def __ensure_anc_in_correct_file(h5_inds, h5_vals, prefix):
        # Returns ancillary datasets that live in the same file as the
        # parent group, copying them over when necessary
        if h5_inds.file != h5_vals.file:
            raise ValueError('Provided ' + prefix +
                             ' datasets are present in different HDF5 files!')

        if h5_inds.file != h5_parent_group.file:
            # Need to copy over the anc datasets to the new group
            if verbose:
                print('Need to copy over ancillary datasets: {} and {} to '
                      'destination group: {} which is in a different HDF5 '
                      'file'.format(h5_inds, h5_vals, h5_parent_group))
            ret_vals = [
                copy_dataset(x, h5_parent_group, verbose=verbose)
                for x in [h5_inds, h5_vals]
            ]
        else:
            ret_vals = [h5_inds, h5_vals]
        return tuple(ret_vals)

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args(
        [quantity, units, main_data_name],
        ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) != 2:
            raise ValueError(
                'main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds, h5_pos_vals, main_shape,
                              is_spectroscopic=False)
        if verbose:
            print(
                'The shapes of the provided h5 position indices and values are OK'
            )
        h5_pos_inds, h5_pos_vals = __ensure_anc_in_correct_file(
            h5_pos_inds, h5_pos_vals, 'Position')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix,
                                                     dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape, pos_dims,
                                   is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
            h5_parent_group, pos_dims, is_spectral=False, verbose=verbose,
            slow_to_fast=slow_to_fast, base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds, h5_spec_vals, main_shape,
                              is_spectroscopic=True)
        if verbose:
            # This message used to (wrongly) refer to the position datasets
            print('The shapes of the provided h5 spectroscopic indices and '
                  'values are OK')
        h5_spec_inds, h5_spec_vals = __ensure_anc_in_correct_file(
            h5_spec_inds, h5_spec_vals, 'Spectroscopic')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix,
                                                      dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape, spec_dims,
                                   is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
            h5_parent_group, spec_dims, is_spectral=True, verbose=verbose,
            slow_to_fast=slow_to_fast, base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened wth the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data, **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds,
                 h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    # Imported here rather than at module level, as in the original code
    from ..usi_data import USIDataset

    return USIDataset(h5_main)