Example #1
def is_editable_h5(h5_obj):
    """
    Returns True if the file containing the provided h5 object is in w or r+ modes

    Parameters
    ----------
    h5_obj : h5py.File, h5py.Group, or h5py.Dataset object
        h5py object

    Returns
    -------
    mode : bool
        True if the file containing the provided h5 object is in w or r+ modes

    """
    warn('pyUSID.io.hdf_utils.is_editable_h5 has been moved to '
         'sidpy.hdf.hdf_utils.is_editable_h5. This copy in pyUSID will '
         'be removed in a future release. Please update your import statements')
    return hut.is_editable_h5(h5_obj)
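
A minimal usage sketch for the wrapper above (the import path and the throwaway file name are assumptions):

import h5py
from pyUSID.io.hdf_utils import is_editable_h5

# File opened in write mode -> editable
with h5py.File('scratch_example.h5', 'w') as h5_file:
    print(is_editable_h5(h5_file))   # expected: True

# Same file reopened read-only -> not editable
with h5py.File('scratch_example.h5', 'r') as h5_file:
    print(is_editable_h5(h5_file))   # expected: False
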
Example #2
def write_nsid_dataset(dataset,
                       h5_group,
                       main_data_name='',
                       verbose=False,
                       **kwargs):
    """
    Writes the provided sidpy dataset as a 'Main' dataset with all appropriate
    linking.

    Parameters
    ----------
    dataset : sidpy.Dataset
        Dataset to be written to HDF5 in NSID format
    h5_group : class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character
        Use this to provide better context about the dataset in the HDF5 file
    verbose : bool, Optional. Default = False
        Whether or not to write logs to standard out
    kwargs: dict
        additional keyword arguments passed on to h5py when writing data

    Returns
    -------
    h5_main : h5py.Dataset
        HDF5 dataset written and linked as an NSID main dataset
    """
    if not isinstance(dataset, Dataset):
        raise TypeError('data to write should be sidpy Dataset')
    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or h5py.Group '
                        'object')
    if not isinstance(main_data_name, str):
        raise TypeError('main_data_name should be a string, but instead it '
                        'is {}'.format(type(main_data_name)))

    if not is_editable_h5(h5_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    if main_data_name == '':
        if dataset.title.strip() == '':
            main_data_name = 'nDim_Data'
        else:
            main_data_name = dataset.title.split('/')[-1]

    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn('main_data_name should not contain the "-" character. Reformatted'
             ' name from: {} to '
             '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    h5_group = h5_group.create_group(main_data_name)

    write_book_keeping_attrs(h5_group)
    write_pynsid_book_keeping_attrs(h5_group)

    #####################
    # Write Main Dataset
    ####################
    if h5_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened with the "mpio" communicator. '
                 'mpi4py does not allow creation of compressed datasets. '
                 'Compression kwarg has been removed')

    if main_data_name in h5_group:
        raise ValueError('h5 dataset of that name already exists, choose '
                         'different name or delete first')

    _ = kwargs.pop('dtype', None)

    # step 1 - create the empty dataset:
    h5_main = h5_group.create_dataset(main_data_name,
                                      shape=dataset.shape,
                                      dtype=dataset.dtype,
                                      **kwargs)
    if verbose:
        print('Created empty dataset: {} for writing Dask dataset: {}'
              ''.format(h5_main, dataset))
        print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
              ''.format(h5_main.name, h5_main.file.filename))
    # Step 2 - now ask Dask to dump data to disk
    da.to_hdf5(h5_main.file.filename, {h5_main.name: dataset})

    if verbose:
        print('Created dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}

    for i, this_dim in dataset._axes.items():
        if not isinstance(this_dim, Dimension):
            raise ValueError('Dimension {} is not a sidpy Dimension'.format(i))

        this_dim_dset = h5_group.create_dataset(this_dim.name,
                                                data=this_dim.values)
        attrs_to_write = {
            'name': this_dim.name,
            'units': this_dim.units,
            'quantity': this_dim.quantity,
            'dimension_type': this_dim.dimension_type.name
        }

        write_simple_attrs(this_dim_dset, attrs_to_write)
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': dataset.quantity,
        'units': dataset.units,
        'main_data_name': dataset.title,
        'data_type': dataset.data_type.name,
        'modality': dataset.modality,
        'source': dataset.source
    }

    write_simple_attrs(h5_main, attrs_to_write)
    write_pynsid_book_keeping_attrs(h5_main)

    for attr_name in dir(dataset):
        attr_val = getattr(dataset, attr_name)
        if isinstance(attr_val, dict):
            if verbose:
                print('Writing attributes from property: {} of the '
                      'sidpy.Dataset'.format(attr_name))
            write_dict_to_h5_group(h5_group, attr_val, attr_name)

    # This will attach the dimensions
    nsid_data_main = link_as_main(h5_main, dimensional_dict)

    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    dataset.h5_dataset = nsid_data_main

    return nsid_data_main
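
A hedged usage sketch for this writer, assuming pyNSID exposes it from pyNSID.io.hdf_io and that sidpy.Dataset.from_array attaches default Dimension objects; the file path and group names are placeholders:

import h5py
import numpy as np
import sidpy
from pyNSID.io.hdf_io import write_nsid_dataset

# Wrap a small array in a sidpy.Dataset; from_array attaches generic Dimensions automatically
data = sidpy.Dataset.from_array(np.random.rand(4, 128))
data.title = 'current_spectra'
data.quantity = 'Current'
data.units = 'nA'

with h5py.File('nsid_write_example.h5', 'w') as h5_file:
    h5_group = h5_file.create_group('Measurement_000/Channel_000')
    h5_main = write_nsid_dataset(data, h5_group, verbose=True)
    print(h5_main.shape)   # should match data.shape
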
Example #3
def write_ind_val_dsets(h5_parent_group, dimensions, is_spectral=True, verbose=False, base_name=None,
                        slow_to_fast=False):
    """
    Creates h5py.Datasets for the position OR spectroscopic indices and values of the data.
    Remember that the contents of the dataset can be changed if need be after the creation of the datasets.
    For example, if one of the spectroscopic dimensions (e.g. Bias) was sinusoidal and not linear, the specific
    dimension in the Spectroscopic_Values dataset can be manually overwritten.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Group under which the indices and values datasets will be created
    dimensions : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
    is_spectral : bool, optional. default = True
        Spectroscopic (True) or Position (False)
    verbose : Boolean, optional
        Whether or not to print statements for debugging purposes
    base_name : str or unicode, optional
        Prefix for the datasets. Default: 'Position' when is_spectral is False, 'Spectroscopic' otherwise
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to fastest varying.
        Set to False otherwise.

    Returns
    -------
    h5_inds : h5py.Dataset
        Dataset containing the position or spectroscopic indices
    h5_vals : h5py.Dataset
        Dataset containing the values along each dimension at every index

    Notes
    -----
    `steps`, `initial_values`, `labels`, and `units` must be the same length as
    `dimensions` when they are specified.

    Dimensions should be in the order from fastest varying to slowest.

    """
    if isinstance(dimensions, Dimension):
        dimensions = [dimensions]
    if not isinstance(dimensions, (list, np.ndarray, tuple)):
        raise TypeError('dimensions should be array-like ')
    if not np.all([isinstance(x, Dimension) for x in dimensions]):
        raise TypeError('dimensions should be a sequence of Dimension objects')

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided h5 object is not valid / open')

    if base_name is not None:
        base_name = validate_single_string_arg(base_name, 'base_name')
        if not base_name.endswith('_'):
            base_name += '_'
    else:
        base_name = 'Position_'
        if is_spectral:
            base_name = 'Spectroscopic_'

    if not slow_to_fast:
        warn('In the future write_ind_val_dsets will default to requiring dimensions to be arranged from slowest to fastest varying')

    # check if the datasets already exist. If they do, there's no point in going any further
    for sub_name in ['Indices', 'Values']:
        if base_name + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: {}'.format(base_name + sub_name,
                                                                                     h5_parent_group.name))
    modes = [dim.mode for dim in dimensions]
    sing_mode = np.unique(modes)

    if sing_mode.size > 1:
        raise NotImplementedError('Cannot yet work on combinations of modes for Dimensions. Consider doing manually')

    sing_mode = sing_mode[0]

    if sing_mode == DimType.DEFAULT:
        if slow_to_fast:
            # Ensure that the dimensions are arranged from fast to slow instead
            dimensions = dimensions[::-1]
        indices, values = build_ind_val_matrices([dim.values for dim in dimensions],
                                                 is_spectral=is_spectral)

        # At this point, dimensions and unit values are arranged from fastest to slowest
        # We want dimensions to be arranged from slowest to fastest:
        rev_func = np.flipud if is_spectral else np.fliplr
        dimensions = dimensions[::-1]
        indices = rev_func(indices)
        values = rev_func(values)

    elif sing_mode == DimType.INCOMPLETE:
        lengths = np.unique([len(dim.values) for dim in dimensions])
        if len(lengths) > 1:
            raise ValueError('Values for dimensions not of same length')
        single_dim = np.arange(lengths[0], dtype=INDICES_DTYPE)
        indices = np.tile(single_dim, (2, 1)).T
        values = np.dstack(tuple([dim.values for dim in dimensions])).squeeze()

        if is_spectral:
            indices = indices.T
            values = values.T
    else:
        raise NotImplementedError('Cannot yet work on Dependent dimensions')

    if verbose:
        print('Indices:')
        print(indices)
        print('Values:')
        print(values)

    # Create the Datasets for both Indices and Values
    h5_indices = h5_parent_group.create_dataset(base_name + 'Indices', data=INDICES_DTYPE(indices), dtype=INDICES_DTYPE)
    h5_values = h5_parent_group.create_dataset(base_name + 'Values', data=VALUES_DTYPE(values), dtype=VALUES_DTYPE)

    for h5_dset in [h5_indices, h5_values]:
        write_simple_attrs(h5_dset, {'units': [x.units for x in dimensions], 'labels': [x.name for x in dimensions],
                                     'type': [dim.mode.value for dim in dimensions]})

    warn('pyUSID.io.hdf_utils.simple.write_ind_val_dsets no longer creates '
         'region references for each dimension. Please use '
         'pyUSID.io.reg_ref.write_region_references to manually create region '
         'references')

    return h5_indices, h5_values
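
A hedged usage sketch for the ancillary-dataset writer above. The classic Dimension(name, units, values) signature and the import paths are assumptions; the file name is a placeholder:

import h5py
import numpy as np
from pyUSID import Dimension
from pyUSID.io.hdf_utils import write_ind_val_dsets

# Two position dimensions, fastest varying (X) listed first (slow_to_fast=False default)
pos_dims = [Dimension('X', 'um', np.arange(5)),
            Dimension('Y', 'um', np.arange(3))]

with h5py.File('ancillary_example.h5', 'w') as h5_file:
    h5_group = h5_file.create_group('Measurement_000')
    h5_pos_inds, h5_pos_vals = write_ind_val_dsets(h5_group, pos_dims,
                                                   is_spectral=False)
    # expected (15, 2): one row per position, one column per dimension
    print(h5_pos_inds.shape)
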
Example #4
def write_main_dataset(h5_parent_group,
                       main_data,
                       main_data_name,
                       quantity,
                       units,
                       data_type,
                       modality,
                       source,
                       dim_dict,
                       main_dset_attrs=None,
                       verbose=False,
                       slow_to_fast=False,
                       **kwargs):
    """

    #TODO: Suhas to think about this a lot more

    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the dimensions should be provided as a dictionary containing pyNSID Dimensions or 1D datasets.
    The dimension datasets can be shared with other main datasets; in this case, fresh datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape for an empty dataset.
        If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    data_type : String / Unicode
        What kind of data this is. Example - image, image stack, video, hyperspectral image, etc.
    modality : String / Unicode
        Experimental / simulation modality - scientific meaning of data. Example - photograph, TEM micrograph, SPM Force-Distance spectroscopy.
    source : String / Unicode
        Source for the dataset, such as the kind of instrument.
    dim_dict : dict
        Dictionary mapping each dimension index to either a Dimension object or a 1D h5py.Dataset, e.g.
        {0: position_X, 1: position_Y, 2: spectra}. These objects provide all necessary instructions for
        constructing (or reusing) the dimension datasets.
    main_dset_attrs : dictionary, Optional, default = None
        Flat dictionary of attributes to be added to the main dataset
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default = False
        Set to True if the dimensions are arranged from slowest varying to fastest varying.
        Set to False otherwise.
    kwargs will be passed onto the creation of the dataset. Please pass chunking, compression, dtype, and other
        arguments this way

    Returns
    -------
    h5_main : NSIDataset
        Reference to the main dataset

    """

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    #####################
    # Validate Main Data
    #####################
    quantity, units, main_data_name, data_type, modality, source = validate_string_args(
        [quantity, units, main_data_name, data_type, modality, source], [
            'quantity', 'units', 'main_data_name', 'data_type', 'modality',
            'source'
        ])

    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from: {} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) < 1:
            raise ValueError(
                'main_data if specified as a shape should contain at least 1 number for the singular dimension'
            )
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    ######################
    # Validate Dimensions
    ######################
    # An N dimensional dataset should have N items in the dimension dictionary
    if len(dim_dict) != len(main_shape):
        raise ValueError(
            'Incorrect number of dimensions: {} provided to support main data, of shape: {}'
            .format(len(dim_dict), main_shape))
    if set(range(len(main_shape))) != set(dim_dict.keys()):
        raise KeyError('dim_dict must contain one key per dimension of the main '
                       'dataset: expected keys {}, got {}'.format(list(range(len(main_shape))),
                                                                  list(dim_dict.keys())))

    if False in validate_main_dimensions(main_shape, dim_dict,
                                         h5_parent_group):
        raise ValueError('The provided dimensions do not match the main '
                         'dataset of shape: {}'.format(main_shape))
    if verbose:
        print('Dimensions are correct!')

    #####################
    # Write Main Dataset
    ####################
    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened with the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if main_data_name in h5_parent_group:
        raise ValueError('A dataset named: {} already exists in group: {}. '
                         'Choose a different name or delete the existing '
                         'dataset first'.format(main_data_name,
                                                h5_parent_group.name))

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data,
                                                 **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}
    for i, this_dim in dim_dict.items():
        if isinstance(this_dim, h5py.Dataset):
            this_dim_dset = this_dim
            if 'nsid_version' not in this_dim_dset.attrs:
                this_dim_dset.attrs['nsid_version'] = '0.0.1'
            #this_dim_dset[i] = this_dim
        elif isinstance(this_dim, Dimension):
            this_dim_dset = h5_parent_group.create_dataset(
                this_dim.name, data=this_dim.values)
            attrs_to_write = {
                'name': this_dim.name,
                'units': this_dim.units,
                'quantity': this_dim.quantity,
                'dimension_type': this_dim.dimension_type,
                'nsid_version': '0.0.1'
            }
            write_simple_attrs(this_dim_dset, attrs_to_write)

        else:
            raise TypeError('Items in dim_dict should be either h5py.Dataset '
                            'or Dimension objects. Item at key: {} was of '
                            'type: {}'.format(i, type(this_dim)))
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': quantity,
        'units': units,
        'nsid_version': '0.0.1',
        'main_data_name': main_data_name,
        'data_type': data_type,
        'modality': modality,
        'source': source
    }

    write_simple_attrs(h5_main, attrs_to_write)

    if verbose:
        print('Wrote dimensions and attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    #ToDo: check if we need  write_book_keeping_attrs(h5_main)
    NSID_data_main = link_as_main(h5_main, dimensional_dict)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    return NSID_data_main  #NSIDataset(h5_main)
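
A hedged usage sketch calling the pyNSID-style writer defined above. The sidpy.Dimension argument order (values first) and the string dimension types are assumptions that may differ between sidpy releases; file, group, and attribute values are placeholders:

import h5py
import numpy as np
import sidpy

data = np.random.rand(4, 128)   # 4 positions x 128 spectral points
dim_dict = {0: sidpy.Dimension(np.arange(4), name='x', units='um',
                               quantity='Length', dimension_type='spatial'),
            1: sidpy.Dimension(np.linspace(-2, 2, 128), name='bias', units='V',
                               quantity='Voltage', dimension_type='spectral')}

with h5py.File('nsid_main_example.h5', 'w') as h5_file:
    h5_group = h5_file.create_group('Measurement_000/Channel_000')
    # Call the write_main_dataset defined above
    h5_main = write_main_dataset(h5_group, data, 'Raw_Data',
                                 'Current', 'nA', 'spectral_image',
                                 'STS', 'simulation', dim_dict,
                                 verbose=True)
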
Example #5
def write_main_dataset(h5_parent_group,
                       main_data,
                       main_data_name,
                       quantity,
                       units,
                       pos_dims,
                       spec_dims,
                       main_dset_attrs=None,
                       h5_pos_inds=None,
                       h5_pos_vals=None,
                       h5_spec_inds=None,
                       h5_spec_vals=None,
                       aux_spec_prefix='Spectroscopic_',
                       aux_pos_prefix='Position_',
                       verbose=False,
                       slow_to_fast=False,
                       **kwargs):
    """
    Writes the provided data as a 'Main' dataset with all appropriate linking.
    By default, the instructions for generating the ancillary datasets should be specified using the pos_dims and
    spec_dims arguments as dictionary objects. Alternatively, if both the indices and values datasets are already
    available for either/or the positions / spectroscopic, they can be specified using the keyword arguments. In this
    case, fresh datasets will not be generated.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group`
        Parent group under which the datasets will be created
    main_data : numpy.ndarray, dask.array.core.Array, list or tuple
        2D matrix formatted as [position, spectral] or a list / tuple with the shape for an empty dataset.
        If creating an empty dataset - the dtype must be specified via a kwarg.
    main_data_name : String / Unicode
        Name to give to the main dataset. This cannot contain the '-' character.
    quantity : String / Unicode
        Name of the physical quantity stored in the dataset. Example - 'Current'
    units : String / Unicode
        Name of units for the quantity stored in the dataset. Example - 'A' for amperes
    pos_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
        Object specifying the instructions necessary for building the Position indices and values datasets
    spec_dims : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
        Object specifying the instructions necessary for building the Spectroscopic indices and values datasets
    main_dset_attrs : dictionary, Optional
        Dictionary of parameters that will be written to the main dataset. Do NOT include region references here.
    h5_pos_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Indices"
    h5_pos_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Position_Values"
    h5_spec_inds : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Indices"
    h5_spec_vals : h5py.Dataset, Optional
        Dataset that will be linked with the name "Spectroscopic_Values"
    aux_spec_prefix : str or unicode, Optional
        Default prefix for Spectroscopic datasets. Default = 'Spectroscopic_'
    aux_pos_prefix : str or unicode, Optional
        Default prefix for Position datasets. Default = 'Position_'
    verbose : bool, Optional, default=False
        If set to true - prints debugging logs
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to fastest varying.
        Set to False otherwise.
    kwargs will be passed onto the creation of the dataset. Please pass chunking, compression, dtype, and other
        arguments this way

    Returns
    -------
    h5_main : USIDataset
        Reference to the main dataset

    """
    def __check_anc_before_creation(aux_prefix, dim_type='pos'):
        aux_prefix = validate_single_string_arg(aux_prefix,
                                                'aux_' + dim_type + '_prefix')
        if not aux_prefix.endswith('_'):
            aux_prefix += '_'
        if '-' in aux_prefix:
            warn(
                'aux_' + dim_type +
                ' should not contain the "-" character. Reformatted name from: {} to '
                '{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
        aux_prefix = aux_prefix.replace('-', '_')
        for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
            if dset_name in h5_parent_group.keys():
                # TODO: What if the contained data was correct?
                raise KeyError(
                    'Dataset named: ' + dset_name +
                    ' already exists in group: '
                    '{}. Consider passing these datasets using kwargs (if they are correct) instead of providing the pos_dims and spec_dims arguments'
                    .format(h5_parent_group.name))
        return aux_prefix

    def __ensure_anc_in_correct_file(h5_inds, h5_vals, prefix):
        if h5_inds.file != h5_vals.file:
            raise ValueError('Provided ' + prefix +
                             ' datasets are present in different HDF5 files!')

        if h5_inds.file != h5_parent_group.file:
            # Need to copy over the anc datasets to the new group
            if verbose:
                print('Need to copy over ancillary datasets: {} and {} to '
                      'destination group: {} which is in a different HDF5 '
                      'file'.format(h5_inds, h5_vals, h5_parent_group))
            ret_vals = [
                copy_dataset(x, h5_parent_group, verbose=verbose)
                for x in [h5_inds, h5_vals]
            ]
        else:
            ret_vals = [h5_inds, h5_vals]
        return tuple(ret_vals)

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    quantity, units, main_data_name = validate_string_args(
        [quantity, units, main_data_name],
        ['quantity', 'units', 'main_data_name'])
    if verbose:
        print('quantity, units, main_data_name all OK')

    quantity = quantity.strip()
    units = units.strip()
    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from: {} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    if isinstance(main_data, (list, tuple)):
        if not contains_integers(main_data, min_val=1):
            raise ValueError(
                'main_data if specified as a shape should be a list / tuple of integers >= 1'
            )
        if len(main_data) != 2:
            raise ValueError(
                'main_data if specified as a shape should contain 2 numbers')
        if 'dtype' not in kwargs:
            raise ValueError(
                'dtype must be included as a kwarg when creating an empty dataset'
            )
        _ = validate_dtype(kwargs.get('dtype'))
        main_shape = main_data
        if verbose:
            print('Selected empty dataset creation. OK so far')
    elif isinstance(main_data, (np.ndarray, da.core.Array)):
        if main_data.ndim != 2:
            raise ValueError('main_data should be a 2D array')
        main_shape = main_data.shape
        if verbose:
            print('Provided numpy or Dask array for main_data OK so far')
    else:
        raise TypeError(
            'main_data should either be a numpy array or a tuple / list with the shape of the data'
        )

    if h5_pos_inds is not None and h5_pos_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_pos_inds,
                              h5_pos_vals,
                              main_shape,
                              is_spectroscopic=False)
        if verbose:
            print(
                'The shapes of the provided h5 position indices and values are OK'
            )
        h5_pos_inds, h5_pos_vals = __ensure_anc_in_correct_file(
            h5_pos_inds, h5_pos_vals, 'Position')
    else:
        aux_pos_prefix = __check_anc_before_creation(aux_pos_prefix,
                                                     dim_type='pos')
        pos_dims = validate_dimensions(pos_dims, dim_type='Position')
        validate_dims_against_main(main_shape,
                                   pos_dims,
                                   is_spectroscopic=False)
        if verbose:
            print('Passed all pre-tests for creating position datasets')
        h5_pos_inds, h5_pos_vals = write_ind_val_dsets(
            h5_parent_group,
            pos_dims,
            is_spectral=False,
            verbose=verbose,
            slow_to_fast=slow_to_fast,
            base_name=aux_pos_prefix)
        if verbose:
            print('Created position datasets!')

    if h5_spec_inds is not None and h5_spec_vals is not None:
        # The provided datasets override fresh building instructions.
        validate_anc_h5_dsets(h5_spec_inds,
                              h5_spec_vals,
                              main_shape,
                              is_spectroscopic=True)
        if verbose:
            print('The shapes of the provided h5 spectroscopic indices and '
                  'values are OK')
        h5_spec_inds, h5_spec_vals = __ensure_anc_in_correct_file(
            h5_spec_inds, h5_spec_vals, 'Spectroscopic')
    else:
        aux_spec_prefix = __check_anc_before_creation(aux_spec_prefix,
                                                      dim_type='spec')
        spec_dims = validate_dimensions(spec_dims, dim_type='Spectroscopic')
        validate_dims_against_main(main_shape,
                                   spec_dims,
                                   is_spectroscopic=True)
        if verbose:
            print('Passed all pre-tests for creating spectroscopic datasets')
        h5_spec_inds, h5_spec_vals = write_ind_val_dsets(
            h5_parent_group,
            spec_dims,
            is_spectral=True,
            verbose=verbose,
            slow_to_fast=slow_to_fast,
            base_name=aux_spec_prefix)
        if verbose:
            print('Created Spectroscopic datasets')

    if h5_parent_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened with the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if isinstance(main_data, np.ndarray):
        # Case 1 - simple small dataset
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 data=main_data,
                                                 **kwargs)
        if verbose:
            print('Created main dataset with provided data')
    elif isinstance(main_data, da.core.Array):
        # Case 2 - Dask dataset
        # step 0 - get rid of any automated dtype specification:
        _ = kwargs.pop('dtype', None)
        # step 1 - create the empty dataset:
        h5_main = h5_parent_group.create_dataset(main_data_name,
                                                 shape=main_data.shape,
                                                 dtype=main_data.dtype,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset: {} for writing Dask dataset: {}'.
                  format(h5_main, main_data))
            print(
                'Dask array will be written to HDF5 dataset: "{}" in file: "{}"'
                .format(h5_main.name, h5_main.file.filename))
        # Step 2 - now ask Dask to dump data to disk
        da.to_hdf5(h5_main.file.filename, {h5_main.name: main_data})
        # main_data.to_hdf5(h5_main.file.filename, h5_main.name)  # Does not work with python 2 for some reason
    else:
        # Case 3 - large empty dataset
        h5_main = h5_parent_group.create_dataset(main_data_name, main_data,
                                                 **kwargs)
        if verbose:
            print('Created empty dataset for Main')

    write_simple_attrs(h5_main, {'quantity': quantity, 'units': units})
    if verbose:
        print('Wrote quantity and units attributes to main dataset')

    if isinstance(main_dset_attrs, dict):
        write_simple_attrs(h5_main, main_dset_attrs)
        if verbose:
            print('Wrote provided attributes to main dataset')

    write_book_keeping_attrs(h5_main)

    # make it main
    link_as_main(h5_main, h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    from ..usi_data import USIDataset
    return USIDataset(h5_main)
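
A hedged usage sketch calling the USID writer defined above, assuming the classic Dimension(name, units, values) signature; all names and values are placeholders:

import h5py
import numpy as np
from pyUSID import Dimension

num_rows, num_cols, num_bias = 3, 5, 128
main_data = np.random.rand(num_rows * num_cols, num_bias)   # [position, spectral]

# Dimensions listed from fastest to slowest varying (slow_to_fast=False default)
pos_dims = [Dimension('X', 'um', np.arange(num_cols)),
            Dimension('Y', 'um', np.arange(num_rows))]
spec_dims = Dimension('Bias', 'V', np.linspace(-2, 2, num_bias))

with h5py.File('usid_main_example.h5', 'w') as h5_file:
    h5_group = h5_file.create_group('Measurement_000/Channel_000')
    # Call the write_main_dataset defined above
    h5_main = write_main_dataset(h5_group, main_data, 'Raw_Data',
                                 'Current', 'nA', pos_dims, spec_dims)
    print(h5_main)   # a USIDataset with linked Position / Spectroscopic datasets
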
Example #6
def write_nsid_dataset(dataset,
                       h5_group,
                       main_data_name='',
                       verbose=False,
                       **kwargs):
    """
        Writes the provided sid dataset as a 'Main' dataset with all appropriate linking.

        Parameters
        ----------
        dataset: main_data : sidpy Dataset
        h5_group : class:`h5py.Group`
            Parent group under which the datasets will be created
        main_data_name : String / Unicode
            Name to give to the main dataset. This cannot contain the '-' character.
        verbose: boolean
        kwargs: additional h5py parameters

        Return
        ------
        h5py dataset
    """
    if not isinstance(dataset, Dataset):
        raise TypeError('data to write should be a sidpy Dataset')

    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or h5py.Group object')
    if not is_editable_h5(h5_group):
        raise ValueError('The provided file is not editable')
    if verbose:
        print('h5 group and file OK')

    if main_data_name == '':
        if dataset.title.strip() == '':
            main_data_name = 'nDim_Data'
        else:
            main_data_name = dataset.title.split('/')[-1]

    main_data_name = main_data_name.strip()
    if '-' in main_data_name:
        warn(
            'main_data_name should not contain the "-" character. Reformatted name from: {} to '
            '{}'.format(main_data_name, main_data_name.replace('-', '_')))
    main_data_name = main_data_name.replace('-', '_')

    #####################
    # Write Main Dataset
    ####################
    if h5_group.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn(
                'This HDF5 file has been opened with the "mpio" communicator. '
                'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed'
            )

    if verbose:
        print(h5_group, main_data_name)
    if main_data_name in h5_group:
        raise ValueError(
            'h5 dataset of that name already exists, choose different name or delete first'
        )

    _ = kwargs.pop('dtype', None)
    # step 1 - create the empty dataset:
    h5_main = h5_group.create_dataset(main_data_name,
                                      shape=dataset.shape,
                                      dtype=dataset.dtype,
                                      **kwargs)
    if verbose:
        print('Created empty dataset: {} for writing Dask dataset: {}'.format(
            h5_main, dataset))
        print('Dask array will be written to HDF5 dataset: "{}" in file: "{}"'.
              format(h5_main.name, h5_main.file.filename))
    # Step 2 - now ask Dask to dump data to disk
    da.to_hdf5(h5_main.file.filename, {h5_main.name: dataset})

    if verbose:
        print('Created dataset for Main')

    #################
    # Add Dimensions
    #################
    dimensional_dict = {}

    for i, this_dim in dataset.axes.items():
        if not isinstance(this_dim, Dimension):
            raise ValueError('Dimension {} is not a sidpy Dimension'.format(i))

        this_dim_dset = h5_group.create_dataset(this_dim.name,
                                                data=this_dim.values)
        attrs_to_write = {
            'name': this_dim.name,
            'units': this_dim.units,
            'quantity': this_dim.quantity,
            'dimension_type': this_dim.dimension_type,
            'nsid_version': version
        }
        write_simple_attrs(this_dim_dset, attrs_to_write)
        dimensional_dict[i] = this_dim_dset

    attrs_to_write = {
        'quantity': dataset.quantity,
        'units': dataset.units,
        'nsid_version': version,
        'main_data_name': dataset.title,
        'data_type': dataset.data_type,
        'modality': dataset.modality,
        'source': dataset.source
    }
    write_simple_attrs(h5_main, attrs_to_write)
    # dset = write_main_dataset(h5_group, np.array(dataset), main_data_name,
    #                          dataset.quantity, dataset.units, dataset.data_type, dataset.modality,
    #                          dataset.source, dataset.axes, verbose=False)

    for key, item in dataset.attrs.items():
        if key not in attrs_to_write:
            # TODO: Check item to be simple
            h5_main.attrs[key] = item

    original_group = h5_group.create_group('original_metadata')
    for key, item in dataset.original_metadata.items():
        original_group.attrs[key] = item

    if hasattr(dataset, 'aberrations'):
        aberrations_group = h5_group.create_group('aberrations')
        for key, item in dataset.aberrations.items():
            aberrations_group.attrs[key] = item

    if hasattr(dataset, 'annotations'):
        annotations_group = h5_group.create_group('annotations')
        for key, item in dataset.annotations.items():
            annotations_group.attrs[key] = item

    # ToDo: check if we need  write_book_keeping_attrs(h5_main)
    # This will attach the dimensions
    nsid_data_main = link_as_main(h5_main, dimensional_dict)
    if verbose:
        print('Successfully linked datasets - dataset should be main now')

    return nsid_data_main  # NSIDataset(h5_main)
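
A hedged sketch calling this variant, under the same assumptions as the sketch after Example #2; here the sidpy.Dataset also carries flat original_metadata, which this variant copies into an 'original_metadata' sub-group:

import h5py
import numpy as np
import sidpy

data = sidpy.Dataset.from_array(np.random.rand(64, 64))
data.title = 'HAADF_image'
data.quantity = 'Intensity'
data.units = 'counts'
# Flat key-value metadata assumed to be supported by the installed sidpy release
data.original_metadata = {'acceleration_voltage_V': 200000,
                          'exposure_time_s': 0.1}

with h5py.File('nsid_variant_example.h5', 'w') as h5_file:
    h5_group = h5_file.create_group('Measurement_000/Channel_000')
    # Call the write_nsid_dataset variant defined above
    h5_main = write_nsid_dataset(data, h5_group, main_data_name='HAADF',
                                 verbose=False)
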