Example #1
def create_empty_dataset(shape, h5_group, name='nDIM_Data'):
    """
    Returns an h5py.Dataset filled with zeros according to the required shape list.

    Parameters
    ----------
    shape: list
        List of integers denoting the shape of the main dataset
    h5_group: h5py.Group
        HDF5 group into which the dataset will be written
    name: str, optional. Default: "nDIM_Data"
        Name of the main HDF5 dataset

    Returns
    -------
    h5py.Dataset
        HDF5 dataset of desired shape written according to NSID format
    """
    if not contains_integers(shape):
        raise ValueError('all dimensions in shape must be integers')
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group object')

    return write_nsid_dataset(Dataset.from_array(np.zeros(shape)), h5_group,
                              name)
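
A minimal usage sketch for the function above, assuming it and its pyNSID dependencies are importable; the file and group names below are hypothetical:

import h5py

with h5py.File('example.h5', mode='w') as h5_f:        # hypothetical file name
    h5_group = h5_f.create_group('Measurement_000')    # hypothetical group name
    h5_dset = create_empty_dataset([5, 16, 16], h5_group)
    print(h5_dset.shape)                               # expected: (5, 16, 16)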
Example #2
def validate_dims_against_main(main_shape, dims, is_spectroscopic=True):
    """
    Checks Dimension objects against a given shape for main datasets.
    Errors in parameters will result in Exceptions

    Parameters
    ----------
    main_shape : array-like
        Tuple or list with the shape of the main data
    dims : iterable
        List of Dimension objects
    is_spectroscopic : bool, Optional. Default = True
        set to True if ``dims`` correspond to Spectroscopic Dimensions.
        False otherwise.
    """
    if not isinstance(main_shape, (list, tuple)):
        raise TypeError('main_shape should be a list or tuple. Provided object'
                        ' was of type: {}'.format(type(main_shape)))
    if len(main_shape) != 2:
        raise ValueError('"main_shape" should be of length 2')
    if not contains_integers(main_shape, min_val=1):
        raise ValueError('main_shape should contain integers >= 1')

    if isinstance(dims, Dimension):
        dims = [dims]
    elif not isinstance(dims, (list, tuple)):
        raise TypeError('"dims" must be a list or tuple of usid.Dimension '
                        'objects. Provided object was of type: {}'
                        ''.format(type(dims)))
    if not all([isinstance(obj, Dimension) for obj in dims]):
        raise TypeError('One or more objects in "dims" was not usid.Dimension')

    if is_spectroscopic:
        main_dim = 1
        dim_category = 'Spectroscopic'
    else:
        main_dim = 0
        dim_category = 'Position'

    # TODO: This is where the dimension type will need to be taken into account
    lhs = main_shape[main_dim]
    rhs = np.prod([len(x.values) for x in dims])
    if lhs != rhs:
        raise ValueError(dim_category +
                         ' dimensions in main data of size: {} do not match '
                         'with product of values in provided Dimension objects'
                         ': {}'.format(lhs, rhs))
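
A short sketch of how this validator might be called, using the usid Dimension constructor form (name, units, values) seen in Example #7 below; the sizes are made up:

import numpy as np

spec_dims = [Dimension('Frequency', 'Hz', np.linspace(0, 1, 7)),
             Dimension('Bias', 'V', np.arange(3))]
# main data shaped (positions, spectral) = (100, 21); 7 * 3 == 21, so this passes silently
validate_dims_against_main((100, 21), spec_dims, is_spectroscopic=True)
# a mismatched product, e.g. a main shape of (100, 20), would raise a ValueError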
Example #3
def make_indices_matrix(num_steps, is_position=True):
    """
    Makes an ancillary indices matrix given the number of steps in each dimension. In other words, this function builds
    a matrix whose rows correspond to unique combinations of the multiple dimensions provided.

    Parameters
    ------------
    num_steps : List / numpy array
        Number of steps in each spatial or spectral dimension
        Note that the axes must be ordered from fastest varying to slowest varying
    is_position : bool, optional, default = True
        Whether the returned matrix is meant for position (True) indices (tall and skinny) or spectroscopic (False)
        indices (short and wide)

    Returns
    --------------
    indices_matrix : 2D unsigned int numpy array
        arranged as [steps, spatial dimension]
    """
    if not isinstance(num_steps, (tuple, list, np.ndarray)):
        raise TypeError('num_steps should be a list / tuple / numpy array')
    if not contains_integers(num_steps, min_val=1 + int(len(num_steps) > 0)):
        raise ValueError(
            'num_steps should contain integers greater than or equal to 1 (empty dimension) or 2'
        )

    num_steps = np.array(num_steps)
    spat_dims = max(1, len(np.where(num_steps > 1)[0]))

    indices_matrix = np.zeros(shape=(np.prod(num_steps), spat_dims),
                              dtype=INDICES_DTYPE)
    dim_ind = 0

    for indx, curr_steps in enumerate(num_steps):
        if curr_steps > 1:

            # part1: length of one full cycle covering this dimension and all faster ones
            part1 = np.prod(num_steps[:indx + 1])

            # part2: how many consecutive steps each index value is held for
            # (product of all faster-varying dimensions)
            if indx > 0:
                part2 = np.prod(num_steps[:indx])
            else:
                part2 = 1

            # part3: how many times the whole cycle repeats (product of all slower-varying dimensions)
            if indx + 1 == len(num_steps):
                part3 = 1
            else:
                part3 = np.prod(num_steps[indx + 1:])

            # hold each index for part2 steps, count up to curr_steps, then tile over slower dimensions
            indices_matrix[:, dim_ind] = np.tile(
                np.floor(np.arange(part1) / part2), part3)
            dim_ind += 1

    if not is_position:
        indices_matrix = indices_matrix.T

    return indices_matrix
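
For illustration (not from the source), a sketch of what the function should return for two position dimensions of 2 and 3 steps, fastest varying first:

inds = make_indices_matrix([2, 3], is_position=True)
# expected: 6 rows (all 2 * 3 combinations) x 2 columns, first column cycling fastest
# [[0 0]
#  [1 0]
#  [0 1]
#  [1 1]
#  [0 2]
#  [1 2]]
print(inds)
# with is_position=False the same matrix is returned transposed (2 rows x 6 columns)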
Example #4
def get_dimensionality(ds_index, index_sort=None):
    """
    Get the size of each index dimension in a specified sort order

    Parameters
    ----------
    ds_index : 2D HDF5 Dataset or numpy array
        Row matrix of indices
    index_sort : Iterable of unsigned integers (Optional)
        Sort that can be applied to dimensionality.
        For example - Order of rows sorted from fastest to slowest

    Returns
    -------
    sorted_dims : list of unsigned integers
        Dimensionality of each row in ds_index.  If index_sort is supplied, it will be in the sorted order

    """
    if isinstance(ds_index, da.core.Array):
        ds_index = ds_index.compute()
    if not isinstance(ds_index, (np.ndarray, h5py.Dataset)):
        raise TypeError(
            'ds_index should either be a numpy array or h5py.Dataset')

    if ds_index.shape[0] > ds_index.shape[1]:
        # tall and skinny (position-like) matrix - transpose so each row is a dimension
        ds_index = np.transpose(ds_index)

    if index_sort is None:
        index_sort = np.arange(ds_index.shape[0])
    else:
        if not contains_integers(index_sort, min_val=0):
            raise ValueError('index_sort should contain integers >= 0')
        index_sort = np.array(index_sort)
        if index_sort.ndim != 1:
            raise ValueError('index_sort should be a 1D array')
        if len(np.unique(index_sort)) > ds_index.shape[0]:
            raise ValueError(
                'length of index_sort ({}) should not exceed the number of dimensions in the provided dataset'
                ' ({})'.format(len(np.unique(index_sort)), ds_index.shape[0]))
        if set(np.arange(ds_index.shape[0])) != set(index_sort):
            raise ValueError(
                'Sort order of dimensions ({}) does not match the number of dimensions ({})'
                ''.format(index_sort, ds_index.shape[0]))

    sorted_dims = [
        len(np.unique(row)) for row in np.array(ds_index, ndmin=2)[index_sort]
    ]
    return sorted_dims
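
A small sketch pairing this helper with make_indices_matrix from Example #3; the values are illustrative:

spec_inds = make_indices_matrix([2, 3], is_position=False)   # shape (2, 6)
print(get_dimensionality(spec_inds))                         # expected: [2, 3]
print(get_dimensionality(spec_inds, index_sort=[1, 0]))      # reordered: [3, 2]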
Example #5
def create_empty_dataset(shape, h5_group, name='nDIM_Data'):
    """
    Returns an NSID dataset filled with zeros according to the required shape list.

    :param shape: list of integers denoting the shape of the main dataset
    :param h5_group: h5py.Group into which the dataset will be written
    :param name: (optional) name of the NSID dataset

    :return:
        NSID dataset
    """
    if not contains_integers(shape):
        raise ValueError('all dimensions in shape must be integers')
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group object')

    return write_nsid_dataset(Dataset.from_array(np.zeros(shape)), h5_group,
                              name)
Example #6
def validate_anc_h5_dsets(h5_inds, h5_vals, main_shape, is_spectroscopic=True):
    """
    Checks ancillary HDF5 datasets against shape of a main dataset.
    Errors in parameters will result in Exceptions

    Parameters
    ----------
    h5_inds : h5py.Dataset
        HDF5 dataset corresponding to the ancillary Indices dataset
    h5_vals : h5py.Dataset
        HDF5 dataset corresponding to the ancillary Values dataset
    main_shape : array-like
        Shape of the main dataset expressed as a tuple or similar
    is_spectroscopic : bool, Optional. Default = True
        Set to True if the provided datasets are Spectroscopic Indices / Values;
        set to False if they are Position datasets.
    """
    if not isinstance(h5_inds, h5py.Dataset):
        raise TypeError('h5_inds must be a h5py.Dataset object')
    if not isinstance(h5_vals, h5py.Dataset):
        raise TypeError('h5_vals must be a h5py.Dataset object')
    if h5_inds.shape != h5_vals.shape:
        raise ValueError('h5_inds: {} and h5_vals: {} should be of the same '
                         'shape'.format(h5_inds.shape, h5_vals.shape))
    if isinstance(main_shape, (list, tuple)):
        if not contains_integers(main_shape, min_val=1) or \
                len(main_shape) != 2:
            raise ValueError("'main_shape' must be a valid HDF5 dataset shape")
    else:
        raise TypeError('main_shape should be a tuple or list. {} provided'
                        ''.format(type(main_shape)))

    if h5_inds.shape[is_spectroscopic] != main_shape[is_spectroscopic]:
        raise ValueError('index {} in shape of h5_inds: {} and main_data: {} '
                         'should be equal'.format(int(is_spectroscopic),
                                                  h5_inds.shape, main_shape))
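
A sketch of typical use, assuming h5_spec_inds and h5_spec_vals are ancillary spectroscopic datasets already open in an HDF5 file; the names and shapes are hypothetical:

# main dataset shaped (positions, spectral) = (256, 14); for spectroscopic ancillary
# datasets the column count (index 1) must match the spectral size
validate_anc_h5_dsets(h5_spec_inds, h5_spec_vals, (256, 14), is_spectroscopic=True)
# passes silently when the shapes agree; raises TypeError / ValueError otherwise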
Example #7
    def translate(self,
                  image_path,
                  h5_path=None,
                  bin_factor=None,
                  interp_func=Image.BICUBIC,
                  normalize=False,
                  **image_args):
        """
        Translates the image in the provided file into a USID HDF5 file

        Parameters
        ----------------
        image_path : str
            Absolute path to the image file
        h5_path : str, optional
            Absolute path to where the HDF5 file should be located.
            Default is None
        bin_factor : uint or array-like of uint, optional
            Down-sampling factor for each dimension.  Default is None.
            If specifying different binning for each dimension, please specify as (height binning, width binning)
        interp_func : int, optional. Default = :attr:`PIL.Image.BICUBIC`
            How the image will be interpolated to provide the down-sampled or binned image.
            For more information see instructions for the `resample` argument for :meth:`PIL.Image.resize`
        normalize : boolean, optional. Default = False
            Should the raw image be normalized between the values of 0 and 1
        image_args : dict
            Arguments to be passed to read_image.  Arguments depend on the type of image.

        Returns
        ----------
        h5_path : str
            Absolute path to the written USID HDF5 file

        """
        image_path, h5_path = self._parse_file_path(image_path,
                                                    h5_path=h5_path)

        image = read_image(image_path, **image_args)
        image_parms = dict()
        usize, vsize = image.shape[:2]
        '''
        Check if a bin_factor is given.  Set up binning objects if it is.
        '''
        if bin_factor is not None:
            if isinstance(bin_factor, (list, tuple)):
                if not contains_integers(bin_factor, min_val=1):
                    raise TypeError(
                        'bin_factor should contain positive whole integers')
                if len(bin_factor) == 2:
                    bin_factor = tuple(bin_factor)
                else:
                    raise ValueError(
                        'Input parameter `bin_factor` must be a length 2 array-like or an integer.\n'
                        + '{} was given.'.format(bin_factor))

            elif isinstance(bin_factor, int):
                bin_factor = (bin_factor, bin_factor)
            else:
                raise TypeError(
                    'bin_factor should either be an integer or an iterable of positive integers'
                )

            if np.min(bin_factor) <= 0:
                raise ValueError('bin_factor must consist of positive factors')

            if interp_func not in [
                    Image.NEAREST, Image.BILINEAR, Image.BICUBIC, Image.LANCZOS
            ]:
                raise ValueError(
                    "'interp_func' argument for ImageTranslator.translate must be one of "
                    "PIL.Image.NEAREST, PIL.Image.BILINEAR, PIL.Image.BICUBIC, PIL.Image.LANCZOS"
                )

            image_parms.update({
                'image_binning_size': bin_factor,
                'image_PIL_resample_mode': interp_func
            })
            usize = int(usize / bin_factor[0])
            vsize = int(vsize / bin_factor[1])

            # Unfortunately, we need to make a round-trip through PIL for the interpolation. Not possible with numpy
            img_obj = Image.fromarray(image)
            img_obj = img_obj.resize((vsize, usize), resample=interp_func)
            image = np.asarray(img_obj)

        # Working around occasional "cannot modify read-only array" error
        image = image.copy()
        '''
        Normalize Raw Image
        '''
        if normalize:
            image -= np.min(image)
            image = image / np.float32(np.max(image))

        image_parms.update({
            'normalized': normalize,
            'image_min': np.min(image),
            'image_max': np.max(image)
        })
        """
        Enable the line below if there is a need to make the image "look" right side up.
        This would be a manipulation of the original data; therefore it remains commented.
        """
        # image = np.flipud(image)
        '''
        Ready to write to h5
        '''

        pos_dims = [
            Dimension('Y', 'a.u.', np.arange(usize)),
            Dimension('X', 'a.u.', np.arange(vsize))
        ]
        spec_dims = Dimension('arb', 'a.u.', 1)

        # Need to transpose for correct reshaping
        image = image.transpose()

        h5_path = super(ImageTranslator,
                        self).translate(h5_path,
                                        'Raw_Data',
                                        image.reshape((-1, 1)),
                                        'Intensity',
                                        'a.u.',
                                        pos_dims,
                                        spec_dims,
                                        translator_name='ImageTranslator',
                                        parm_dict=image_parms)

        with h5py.File(h5_path, mode='r+') as h5_f:

            # For legacy reasons:
            write_simple_attrs(h5_f, {'data_type': 'ImageData'})

        return h5_path
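
A hedged sketch of calling this translator, assuming it is pyUSID's ImageTranslator and that the image and HDF5 paths below are valid (both are hypothetical):

from PIL import Image

translator = ImageTranslator()
h5_path = translator.translate('my_image.png', h5_path='my_image.h5',   # hypothetical paths
                               bin_factor=2, interp_func=Image.BICUBIC,
                               normalize=True)
# the returned string is the path to the USID-formatted HDF5 file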
Example #8
    def read(self,
             bin_factor=None,
             interp_func=Image.BICUBIC,
             normalize=False,
             **image_args):
        """
        Reads the image in the provided file and returns it as a Dataset object

        Parameters
        ----------------
        bin_factor : uint or array-like of uint, optional
            Down-sampling factor for each dimension.  Default is None.
            If specifying different binning for each dimension, please specify as (height binning, width binning)
        interp_func : int, optional. Default = :attr:`PIL.Image.BICUBIC`
            How the image will be interpolated to provide the down-sampled or binned image.
            For more information see instructions for the `resample` argument for :meth:`PIL.Image.resize`
        normalize : boolean, optional. Default = False
            Should the raw image be normalized between the values of 0 and 1
        image_args : dict
            Arguments to be passed to read_image.  Arguments depend on the type of image.

        Returns
        ----------
        data_set : Dataset
            Dataset object containing the image intensity and its two spatial dimensions

        """
        image_path = self._parse_file_path(self._input_file_path)

        image = read_image(image_path, **image_args)
        image_parms = dict()
        usize, vsize = image.shape[:2]
        '''
        Check if a bin_factor is given.  Set up binning objects if it is.
        '''
        if bin_factor is not None:
            if isinstance(bin_factor, (list, tuple)):
                if not contains_integers(bin_factor, min_val=1):
                    raise TypeError(
                        'bin_factor should contain positive whole integers')
                if len(bin_factor) == 2:
                    bin_factor = tuple(bin_factor)
                else:
                    raise ValueError(
                        'Input parameter `bin_factor` must be a length 2 array-like or an integer.\n'
                        + '{} was given.'.format(bin_factor))

            elif isinstance(bin_factor, int):
                bin_factor = (bin_factor, bin_factor)
            else:
                raise TypeError(
                    'bin_factor should either be an integer or an iterable of positive integers'
                )

            if np.min(bin_factor) <= 0:
                raise ValueError('bin_factor must consist of positive factors')

            if interp_func not in [
                    Image.NEAREST, Image.BILINEAR, Image.BICUBIC, Image.LANCZOS
            ]:
                raise ValueError(
                    "'interp_func' argument must be one of PIL.Image.NEAREST, "
                    "PIL.Image.BILINEAR, PIL.Image.BICUBIC, PIL.Image.LANCZOS"
                )

            image_parms.update({
                'image_binning_size': bin_factor,
                'image_PIL_resample_mode': interp_func
            })
            usize = int(usize / bin_factor[0])
            vsize = int(vsize / bin_factor[1])

            # Unfortunately, we need to make a round-trip through PIL for the interpolation. Not possible with numpy
            img_obj = Image.fromarray(image)
            img_obj = img_obj.resize((vsize, usize), resample=interp_func)
            image = np.asarray(img_obj)

        # Working around occasional "cannot modify read-only array" error
        image = image.copy()
        '''
        Normalize Raw Image
        '''
        if normalize:
            image -= np.min(image)
            image = image / np.float32(np.max(image))

        image_parms.update({
            'normalized': normalize,
            'image_min': np.min(image),
            'image_max': np.max(image)
        })

        data_set = Dataset.from_array(image, name='random')

        data_set.data_type = 'image'
        data_set.units = 'a. u.'
        data_set.quantity = 'Intensity'

        data_set.set_dimension(
            0,
            Dimension('y',
                      np.arange(usize),
                      units='a. u.',
                      quantity='Length',
                      dimension_type='spatial'))
        data_set.set_dimension(
            1,
            Dimension('x',
                      np.arange(vsize),
                      units='a. u.',
                      quantity='Length',
                      dimension_type='spatial'))

        return data_set
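
A sketch of how this reader might be used, assuming a sidpy-style Reader whose constructor takes the input file path; the ImageReader class name and file path are assumptions:

reader = ImageReader('my_image.png')   # hypothetical reader class wrapping this read()
data_set = reader.read(bin_factor=2, normalize=True)
print(data_set.shape)                  # binned (height, width)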
Example #9
def write_main_dataset(h5_parent_group,
                       main_data,
                       main_data_name,
                       quantity,
                       units,
                       data_type,
                       modality,
                       source,
                       dim_dict,
                       main_dset_attrs=None,
                       verbose=False,
                       slow_to_fast=False,
                       **kwargs):
    """

    #TODO: Suhas to think about this a lot more

    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the dimensions should be provided as a dictionary
    containing pyNSID Dimensions or 1D datasets.
    The dimension datasets can be shared with other main datasets; in this case, fresh datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape for an empty dataset.
        If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    data_type : str
        What kind of data this is. Example - image, image stack, video, hyperspectral image, etc.
    modality : str
        Experimental / simulation modality - scientific meaning of the data. Example - photograph, TEM micrograph, SPM Force-Distance spectroscopy.
    source : str
        Source of the dataset, such as the kind of instrument.
    dim_dict : dict
        Dictionary mapping each dimension index of the main dataset to a Dimension object or a 1D h5py.Dataset. E.g.
        {0: position_X, 1: position_Y, 2: spectra} where position_X, position_Y, spectra can be either Dimensions or h5py datasets.
        Dimension datasets may be shared with other main datasets; in that case fresh datasets will not be generated.
    main_dset_attrs : dictionary, Optional, default = None
        Flat dictionary of attributes to be added to the main dataset
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    kwargs will be passed onto the creation of the dataset. Please pass chunking, compression, dtype, and other
        arguments this way

    Returns
    -------
    h5_main : NSIDataset
        Reference to the main dataset

    """

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    #####################
    # Validate Main Data
    #####################
    quantity, units, main_data_name, data_type, modality, source = validate_string_args(
        [quantity, units, main_data_name, data_type, modality, source], [
            'quantity', 'units', 'main_data_name', 'data_type', 'modality',
            'source'
        ])

    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) < 1:
            raise ValueError(
                'main_data if specified as a shape should contain at least 1 number for the singular dimension'
            )
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    ######################
    # Validate Dimensions
    ######################
    # An N dimensional dataset should have N items in the dimension dictionary
    if len(dim_dict) != len(main_shape):
        raise ValueError(
            'Incorrect number of dimensions: {} provided to support main data, of shape: {}'
            .format(len(dim_dict), main_shape))
    if set(range(len(main_shape))) != set(dim_dict.keys()):
        raise KeyError('dim_dict must contain one integer key per dimension of '
                       'main_data, i.e. 0 to {}'.format(len(main_shape) - 1))

    if False in validate_main_dimensions(main_shape, dim_dict,
                                         h5_parent_group):
        print('Dimensions incorrect')
        return
    if verbose:
        print('Dimensions are correct!')

    #####################
    # Write Main Dataset
    ####################
    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened with the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if main_data_name in h5_parent_group:
        print('A dataset named "{}" already exists in this group'
              ''.format(main_data_name))
        #del h5_parent_group[main_data_name]
        return

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data,
                                                 **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}
    for i, this_dim in dim_dict.items():
        if isinstance(this_dim, h5py.Dataset):
            this_dim_dset = this_dim
            if 'nsid_version' not in this_dim_dset.attrs:
                this_dim_dset.attrs['nsid_version'] = '0.0.1'
            #this_dim_dset[i] = this_dim
        elif isinstance(this_dim, Dimension):
            this_dim_dset = h5_parent_group.create_dataset(
                this_dim.name, data=this_dim.values)
            attrs_to_write = {
                'name': this_dim.name,
                'units': this_dim.units,
                'quantity': this_dim.quantity,
                'dimension_type': this_dim.dimension_type,
                'nsid_version': '0.0.1'
            }
            write_simple_attrs(this_dim_dset, attrs_to_write)

        else:
            warn('Dimension {} is neither a Dimension object nor an h5py.Dataset; '
                 'it will not be linked'.format(i))
            continue
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': quantity,
        'units': units,
        'nsid_version': '0.0.1'
    }
    attrs_to_write['main_data_name'] = main_data_name
    attrs_to_write['data_type'] = data_type
    attrs_to_write['modality'] = modality
    attrs_to_write['source'] = source

    write_simple_attrs(h5_main, attrs_to_write)

    if verbose:
        print('Wrote dimensions and attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    #ToDo: check if we need  write_book_keeping_attrs(h5_main)
    NSID_data_main = link_as_main(h5_main, dimensional_dict)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    return NSID_data_main  #NSIDataset(h5_main)
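
A minimal sketch of calling this pyNSID-style writer, reusing the Dimension constructor form shown in Example #8; the file name, group name, and dimension_type strings are assumptions:

import numpy as np
import h5py

data = np.random.normal(size=(3, 32, 32))
dim_dict = {0: Dimension('bias', np.arange(3), units='V',
                         quantity='Voltage', dimension_type='spectral'),
            1: Dimension('y', np.arange(32), units='nm',
                         quantity='Length', dimension_type='spatial'),
            2: Dimension('x', np.arange(32), units='nm',
                         quantity='Length', dimension_type='spatial')}

with h5py.File('nsid_example.h5', mode='w') as h5_f:    # hypothetical file name
    h5_group = h5_f.create_group('Measurement_000/Channel_000')
    h5_main = write_main_dataset(h5_group, data, 'Current_Map', 'Current', 'nA',
                                 'image_stack', 'STM', 'simulation', dim_dict)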
Example #10
def write_main_dataset(h5_parent_group,
                       main_data,
                       main_data_name,
                       quantity,
                       units,
                       pos_dims,
                       spec_dims,
                       main_dset_attrs=None,
                       h5_pos_inds=None,
                       h5_pos_vals=None,
                       h5_spec_inds=None,
                       h5_spec_vals=None,
                       aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_',
                       verbose=False,
                       slow_to_fast=False,
                       **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should be specified using the pos_dims and
    spec_dims arguments as dictionary objects. Alternatively, if both the indices and values datasets are already
    available for either/or the positions / spectroscopic, they can be specified using the keyword arguments. In this
    case, fresh datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape for an empty dataset.
        If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
        Object specifying the instructions necessary for building the Position indices and values datasets
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
        Object specifying the instructions necessary for building the Spectroscopic indices and values datasets
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset. Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets. Default = "Spectroscopic"
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = "Position"
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to fastest varying.
        Set to False otherwise.
    kwargs will be passed onto the creation of the dataset. Please pass chunking, compression, dtype, and other
        arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset

    """
    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        aux_prefix = validate_single_string_arg(aux_prefix,
                                                'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn(
                'aux_' + dim_type +
                ' should not contain the "-" character. Reformatted name from:{} to '
                '{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
        aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                # TODO: What if the contained data was correct?
                raise KeyError(
                    'Dataset named: ' + dset_name +
                    ' already exists in group: '
                    '{}. Consider passing these datasets using kwargs (if they are correct) instead of providing the pos_dims and spec_dims arguments'
                    .format(h5_parent_group.name))
        return aux_prefix

    def __ensure_anc_in_correct_file(h5_inds, h5_vals, prefix):
        if h5_inds.file != h5_vals.file:
            raise ValueError('Provided ' + prefix +
                             ' datasets are present in different HDF5 files!')

        if h5_inds.file != h5_parent_group.file:
            # Need to copy over the anc datasets to the new group
            if verbose:
                print('Need to copy over ancillary datasets: {} and {} to '
                      'destination group: {} which is in a different HDF5 '
                      'file'.format(h5_inds, h5_vals, h5_parent_group))
            ret_vals = [
                copy_dataset(x, h5_parent_group, verbose=verbose)
                for x in [h5_inds, h5_vals]
            ]
        else:
            ret_vals = [h5_inds, h5_vals]
        return tuple(ret_vals)

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args(
        [quantity, units, main_data_name],
        ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) != 2:
            raise ValueError(
                'main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds,
                              h5_pos_vals,
                              main_shape,
                              is_spectroscopic=False)
        if verbose:
            print(
                'The shapes of the provided h5 position indices and values are OK'
            )
        h5_pos_inds, h5_pos_vals = __ensure_anc_in_correct_file(
            h5_pos_inds, h5_pos_vals, 'Position')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix,
                                                     dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape,
                                   pos_dims,
                                   is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
            h5_parent_group,
            pos_dims,
            is_spectral=False,
            verbose=verbose,
            slow_to_fast=slow_to_fast,
            base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds,
                              h5_spec_vals,
                              main_shape,
                              is_spectroscopic=True)
        if verbose:
            print('The shapes of the provided h5 spectroscopic indices and '
                  'values are OK')
        h5_spec_inds, h5_spec_vals = __ensure_anc_in_correct_file(
            h5_spec_inds, h5_spec_vals, 'Spectroscopic')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix,
                                                      dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape,
                                   spec_dims,
                                   is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
            h5_parent_group,
            spec_dims,
            is_spectral=True,
            verbose=verbose,
            slow_to_fast=slow_to_fast,
            base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened with the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data,
                                                 **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    from ..usi_data import USIDataset
    return USIDataset(h5_main)
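
A hedged usage sketch for this USID-style writer, mirroring the Dimension (name, units, values) form from Example #7; the file and group names are illustrative:

import numpy as np
import h5py

raw_2d = np.random.normal(size=(10 * 10, 7))            # (positions, spectral)
pos_dims = [Dimension('X', 'nm', np.arange(10)),
            Dimension('Y', 'nm', np.arange(10))]
spec_dims = Dimension('Bias', 'V', np.linspace(-1, 1, 7))

with h5py.File('usid_example.h5', mode='w') as h5_f:    # hypothetical file name
    h5_group = h5_f.create_group('Measurement_000/Channel_000')
    h5_main = write_main_dataset(h5_group, raw_2d, 'Raw_Data',
                                 'Current', 'nA', pos_dims, spec_dims)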