Code Example #1
File: hdf_utils.py    Project: ziatdinovmax/sidpy
def link_h5_obj_as_alias(h5_main, h5_ancillary, alias_name):
    """
    Creates Dataset attributes that contain references to other Dataset Objects.
    This function is useful when the reference attribute must have a reserved
    name, such as linking 'SHO_Indices' as 'Spectroscopic_Indices'.

    Parameters
    ------------
    h5_main : h5py.Dataset
        Reference to the object to which attributes will be added
    h5_ancillary : h5py.Dataset
        Object whose reference will be stored in ``h5_main.attrs``
    alias_name : String
        Alias / alternate name under which ``h5_ancillary`` will be linked

    """
    if not isinstance(h5_main, (h5py.Dataset, h5py.File, h5py.Group)):
        raise TypeError(
            'h5_main should either be a h5py Dataset, File, or Group')
    if not isinstance(h5_ancillary, (h5py.Dataset, h5py.Group)):
        raise TypeError(
            'h5_ancillary should be a h5py.Dataset or h5py.Group object')
    alias_name = validate_single_string_arg(alias_name, 'alias_name')

    __link_h5_obj(h5_main, h5_ancillary, alias=alias_name)
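
A minimal usage sketch, assuming link_h5_obj_as_alias (above) is in scope; the file name and dataset layout below are purely illustrative:

import h5py
import numpy as np

with h5py.File('alias_demo.h5', 'w') as h5_f:
    h5_main = h5_f.create_dataset('Raw_Data', data=np.random.rand(4, 8))
    h5_sho_inds = h5_f.create_dataset('SHO_Indices', data=np.arange(8))

    # Link 'SHO_Indices' under the reserved attribute name 'Spectroscopic_Indices'
    link_h5_obj_as_alias(h5_main, h5_sho_inds, 'Spectroscopic_Indices')

    # The alias attribute now holds an object reference to the ancillary dataset
    print(h5_f[h5_main.attrs['Spectroscopic_Indices']])
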
Code Example #2
    def __init__(self, file_path, *args, **kwargs):
        """
        Parameters
        -----------
        file_path : str
            Path to the file that needs to be read
            
        Attributes
        ----------
        self._input_file_path : str
            Path to the file that will be read

        Notes
        -----
        * This method will check to make sure that the provided file_path is
          indeed a string and a valid file path.
        * Consider calling ``can_read()`` within ``__init__()`` for validating
          the provided file

        Raises
        ------
        FileNotFoundError
        """
        file_path = validate_single_string_arg(file_path, 'file_path')
        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path + ' does not exist')
        self._input_file_path = file_path
Code Example #3
File: hdf_utils.py    Project: ziatdinovmax/sidpy
def find_dataset(h5_group, dset_name):
    """
    Uses visititems() to find all datasets with the desired name

    Parameters
    ----------
    h5_group : :class:`h5py.Group`
        Group to search within for the Dataset
    dset_name : str
        Name of the dataset to search for

    Returns
    -------
    datasets : list
        List of :class:`h5py.Dataset` objects whose names contain `dset_name`.

    """
    if not isinstance(h5_group, (h5py.File, h5py.Group)):
        raise TypeError('h5_group should be a h5py.File or h5py.Group object')
    dset_name = validate_single_string_arg(dset_name, 'dset_name')

    # print 'Finding all instances of', ds_name
    datasets = []

    def __find_name(name, obj):
        if dset_name in name.split('/')[-1] and isinstance(obj, h5py.Dataset):
            datasets.append(obj)
        return

    h5_group.visititems(__find_name)

    return datasets
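
A short usage sketch, assuming find_dataset (above) is in scope; the file and dataset names are hypothetical:

import h5py

with h5py.File('measurement.h5', 'r') as h5_f:
    # Recursively collect every dataset whose name contains 'Raw_Data'
    for dset in find_dataset(h5_f, 'Raw_Data'):
        print(dset.name, dset.shape)
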
Code Example #4
File: dict_utils.py    Project: saimani5/sidpy
def flatten_dict(nested_dict, separator='-'):
    """
    Flattens a nested dictionary

    Parameters
    ----------
    nested_dict : dict
        Nested dictionary
    separator : str, Optional. Default='-'
        Separator between the keys of different levels

    Returns
    -------
    dict
        Dictionary whose keys are flattened to a single level

    Notes
    -----
    Taken from https://stackoverflow.com/questions/6027558/flatten-nested-dictionaries-compressing-keys
    """
    if not isinstance(nested_dict, dict):
        raise TypeError('nested_dict should be a dict')
    separator = validate_single_string_arg(separator, 'separator')

    def __flatten_dict_int(nest_dict, sep, parent_key=''):
        items = []
        if sep == '_':
            repl = '-'
        else:
            repl = '_'
        for key, value in nest_dict.items():
            if not isinstance(key, str):
                key = str(key)
            if sep in key:
                key = key.replace(sep, repl)

            new_key = parent_key + sep + key if parent_key else key
            if isinstance(value, MutableMapping):
                items.extend(
                    __flatten_dict_int(value, sep, parent_key=new_key).items())
            # Nion files contain lists of dictionaries; handle each entry separately
            elif isinstance(value, list):
                for i in range(len(value)):
                    if isinstance(value[i], dict):
                        for kk in value[i]:
                            items.append(
                                ('dim-' + kk + '-' + str(i), value[i][kk]))
                    else:
                        if type(value) != bytes:
                            items.append((new_key, value))
            else:
                if type(value) != bytes:
                    items.append((new_key, value))
        return dict(items)

    return __flatten_dict_int(nested_dict, separator)
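
A quick sketch of the expected behavior, assuming flatten_dict (above) is in scope:

nested = {'instrument': {'name': 'AFM', 'stage': {'x': 1.5, 'y': -0.2}},
          'comment': 'test run'}

flat = flatten_dict(nested)
# Keys from nested levels are joined with the separator, roughly:
# {'instrument-name': 'AFM', 'instrument-stage-x': 1.5,
#  'instrument-stage-y': -0.2, 'comment': 'test run'}
print(flat)
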
Code Example #5
File: simple.py    Project: rajgiriUW/pyUSID
def create_results_group(h5_main, tool_name, h5_parent_group=None):
    """
    Creates a h5py.Group object autoindexed and named as 'DatasetName-ToolName_00x'

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the dataset based on which the process / analysis is being performed
    tool_name : string / unicode
        Name of the Process / Analysis applied to h5_main
    h5_parent_group : h5py.Group, optional. Default = None
        Parent group under which the results group will be created. Use this
        option to write results into a new HDF5 file. By default, results will
        be written into the same group containing `h5_main`

    Returns
    -------
    h5_group : :class:`h5py.Group`
        Results group which can now house the results datasets

    """
    if not isinstance(h5_main, h5py.Dataset):
        raise TypeError('h5_main should be a h5py.Dataset object')
    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_main.parent

    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if '-' in tool_name:
        warn('tool_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(tool_name, tool_name.replace('-', '_')))
    tool_name = tool_name.replace('-', '_')

    group_name = h5_main.name.split('/')[-1] + '-' + tool_name + '_'
    group_name = assign_group_index(h5_parent_group, group_name)

    h5_group = h5_parent_group.create_group(group_name)

    write_book_keeping_attrs(h5_group)

    # Also add some basic attributes like source and tool name. This will allow relaxation of nomenclature restrictions:
    # these are NOT being used right now but will be in subsequent versions of pyUSID
    write_simple_attrs(h5_group, {'tool': tool_name, 'num_source_dsets': 1})
    # in this case, there is only one source
    if h5_parent_group.file == h5_main.file:
        for dset_ind, dset in enumerate([h5_main]):
            h5_group.attrs['source_' + '{:03d}'.format(dset_ind)] = dset.ref

    return h5_group
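
A minimal sketch of the auto-indexed naming, assuming create_results_group (above) and the helpers it calls are importable; the file and dataset names are hypothetical:

import h5py

with h5py.File('results_demo.h5', 'a') as h5_f:
    h5_main = h5_f.require_dataset('Raw_Data', shape=(16, 32), dtype='float32')

    # First call creates 'Raw_Data-Fitting_000'; a second call with the same
    # tool name would create 'Raw_Data-Fitting_001', and so on
    h5_grp = create_results_group(h5_main, 'Fitting')
    print(h5_grp.name)
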
Code Example #6
File: hdf_utils.py    Project: saimani5/sidpy
def write_dict_to_h5_group(h5_group, metadata, group_name):
    """
    If the provided metadata parameter is a non-empty dictionary, this function
    will create an HDF5 group called group_name within the provided h5_group and
    write the contents of metadata into the newly created group

    Parameters
    ----------
    h5_group : h5py.Group
        Parent group to write metadata into
    metadata : dict
        Dictionary that needs to be written into the group
    group_name : str
        Name of the group to write attributes into


    Returns
    -------
    h5_metadata_grp : h5py.Group
        Handle to the newly created group containing the metadata

    Notes
    -----
    Nested dictionaries will be flattened until sidpy implements functions
    to write and read nested dictionaries to and from HDF5 files
    """
    if not isinstance(metadata, dict):
        raise TypeError('metadata is not a dict but of type: {}'
                        ''.format(type(metadata)))
    if len(metadata) < 1:
        return None
    if not isinstance(h5_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_group is neither a h5py.Group nor a h5py.File '
                        'object and is of type: {}'.format(type(h5_group)))

    group_name = validate_single_string_arg(group_name, 'group_name')
    group_name = group_name.replace(' ', '_')
    h5_md_group = h5_group.create_group(group_name)
    flat_dict = flatten_dict(metadata)
    write_simple_attrs(h5_md_group, flat_dict)
    return h5_md_group
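
A short usage sketch, assuming write_dict_to_h5_group and flatten_dict (above) are in scope; the metadata contents are hypothetical:

import h5py

metadata = {'microscope': {'vendor': 'Acme', 'voltage_kV': 200},
            'operator': 'A. N. Other'}

with h5py.File('metadata_demo.h5', 'a') as h5_f:
    # Nested metadata is flattened and written as attributes of a new group.
    # Spaces in the group name are replaced with underscores: 'Measurement_Metadata'
    h5_md = write_dict_to_h5_group(h5_f, metadata, 'Measurement Metadata')
    print(dict(h5_md.attrs))
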
Code Example #7
    def is_valid_file(file_path, *args, **kwargs):
        """
        Checks whether the provided file can be read by this translator.

        This basic function compares the file extension against the "extension"
        keyword argument. If the extension matches, this function returns the path to the file

        Parameters
        ----------
        file_path : str
            Path to raw data file

        Returns
        -------
        file_path : str
            Path to the file that needs to be provided to translate()
            if the provided file was indeed a valid file
            Else, None
        """
        file_path = validate_single_string_arg(file_path, 'file_name')

        if not os.path.exists(file_path):
            raise FileNotFoundError(file_path + ' does not exist')

        targ_ext = kwargs.get('extension', None)
        if not targ_ext:
            raise NotImplementedError('Either is_valid_file() has not been '
                                      'implemented by this translator or the '
                                      '"extension" keyword argument was '
                                      'missing')
        if isinstance(targ_ext, (str, unicode)):
            targ_ext = [targ_ext]
        targ_ext = validate_list_of_strings(targ_ext,
                                            parm_name='(keyword argument) '
                                            '"extension"')

        # Get rid of any '.' separators that may be in the list of extensions
        # Also turn to lower case for case insensitive comparisons
        targ_ext = [item.replace('.', '').lower() for item in targ_ext]

        file_path = os.path.abspath(file_path)
        extension = os.path.splitext(file_path)[1][1:]

        # Ensure extension is lower case just like targets above
        extension = extension.lower()

        if extension in targ_ext:
            return file_path
        else:
            return None
Code Example #8
    def dimension_type(self, value):
        if isinstance(value, DimensionTypes):
            self._dimension_type = value
        else:
            dimension_type = validate_single_string_arg(
                value, 'dimension_type')

            if dimension_type.upper() in DimensionTypes._member_names_:
                self._dimension_type = DimensionTypes[dimension_type.upper()]
            elif dimension_type.lower() in ['frame', 'time', 'stack']:
                self._dimension_type = DimensionTypes.TEMPORAL
            else:
                self._dimension_type = DimensionTypes.UNKNOWN
                print('Supported dimension_types for plotting are only: ',
                      DimensionTypes._member_names_)
                print('Setting DimensionTypes to UNKNOWN')
Code Example #9
File: prov_utils.py    Project: saimani5/sidpy
def assign_group_index(h5_parent_group, base_name, verbose=False):
    """
    Searches the parent h5 group to find the next available index for the group

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` object
        Parent group under which the new group object will be created
    base_name : str or unicode
        Base name of the new group without index
    verbose : bool, optional. Default=False
        Whether or not to print debugging statements

    Returns
    -------
    base_name : str or unicode
        Base name of the new group with the next available index as a suffix
    """
    if not isinstance(h5_parent_group, h5py.Group):
        raise TypeError('h5_parent_group should be a h5py.Group object')
    base_name = validate_single_string_arg(base_name, 'base_name')

    if len(base_name) == 0:
        raise ValueError('base_name should not be an empty string')

    if not base_name.endswith('_'):
        base_name += '_'

    temp = [key for key in h5_parent_group.keys()]
    if verbose:
        print(
            'Looking for group names starting with {} in parent containing items: '
            '{}'.format(base_name, temp))
    previous_indices = []
    for item_name in temp:
        if isinstance(h5_parent_group[item_name],
                      h5py.Group) and item_name.startswith(base_name):
            previous_indices.append(int(item_name.replace(base_name, '')))
    previous_indices = np.sort(previous_indices)
    if verbose:
        print('indices of existing groups with the same prefix: {}'.format(
            previous_indices))
    if len(previous_indices) == 0:
        index = 0
    else:
        index = previous_indices[-1] + 1
    return base_name + '{:03d}'.format(index)
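
A small sketch illustrating the auto-indexing, assuming assign_group_index (above) is in scope:

import h5py

with h5py.File('indexing_demo.h5', 'a') as h5_f:
    h5_f.create_group('Measurement_000')
    h5_f.create_group('Measurement_001')

    # Returns 'Measurement_002' since indices 000 and 001 are already taken
    print(assign_group_index(h5_f, 'Measurement'))
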
Code Example #10
File: write_utils.py    Project: nccreang/pyUSID
    def __init__(self, name, units, values, mode=DimType.DEFAULT):
        """
        Simple object that describes a dimension in a dataset by its name, units, and values

        Parameters
        ----------
        name : str or unicode
            Name of the dimension. For example 'Bias'
        units : str or unicode
            Units for this dimension. For example: 'V'
        values : array-like or int
            Values over which this dimension was varied. A linearly increasing set of values will be generated if an
            integer is provided instead of an array.
        mode : Enum, Optional. Default = DimType.DEFAULT
            How the parameter associated with the dimension was varied.
            DimType.DEFAULT - data was recorded for all combinations of values in this dimension against **all** other
            dimensions. This is typically the case.
            DimType.INCOMPLETE - Data not present for all combinations of values in this dimension and all other
                dimensions. Examples include spiral scans, sparse sampling, aborted measurements
            DimType.DEPENDENT - Values in this dimension were varied as a function of another (independent) dimension.
        """
        name = validate_single_string_arg(name, 'name')

        if not isinstance(units, (str, unicode)):
            raise TypeError('units should be a string')
        units = units.strip()

        if isinstance(values, int):
            if values < 1:
                raise ValueError(
                    'values, when specified as an integer, must be positive'
                )
            values = np.arange(values)
        if not isinstance(values, (np.ndarray, list, tuple)):
            raise TypeError('values should be array-like')

        if not isinstance(mode, DimType):
            raise TypeError(
                'mode must be of type pyUSID.DimType. Provided object was of type: {}'
                .format(type(mode)))

        self.name = name
        self.units = units
        self.values = values
        self.mode = mode
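
A brief construction sketch, assuming this is the __init__ of the Dimension class from pyUSID's write_utils (as the file name above suggests) and that Dimension and DimType are in scope:

import numpy as np

# Explicit values for a bias sweep
bias_dim = Dimension('Bias', 'V', np.linspace(-2, 2, 128))

# Passing an integer generates np.arange(5) as the values
cycle_dim = Dimension('Cycle', '', 5, mode=DimType.DEFAULT)

print(bias_dim.name, bias_dim.units, bias_dim.values.shape)
print(cycle_dim.values)  # [0 1 2 3 4]
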
Code Example #11
    def dimension_type(self, value):
        if isinstance(value, DimensionType):
            self._dimension_type = value
        else:
            dimension_type = validate_single_string_arg(
                value, 'dimension_type')

            if dimension_type.upper() in [
                    member.name for member in DimensionType
            ]:
                self._dimension_type = DimensionType[dimension_type.upper()]
            elif dimension_type.lower() in ['frame', 'time', 'stack']:
                self._dimension_type = DimensionType.TEMPORAL
            else:
                self._dimension_type = DimensionType.UNKNOWN
                warn('Supported dimension types for plotting are only: {}'
                     ''.format([member.name for member in DimensionType]))
                warn('Setting DimensionType to UNKNOWN')
Code Example #12
File: prov_utils.py    Project: saimani5/sidpy
def find_results_groups(h5_main, tool_name, h5_parent_group=None):
    """
    Finds a list of all groups containing results of the process of name
    ``tool_name`` being applied to the dataset

    Parameters
    ----------
    h5_main : h5 dataset reference
        Reference to the target dataset to which the tool was applied
    tool_name : String / unicode
        Name of the tool applied to the target dataset
    h5_parent_group : h5py.Group, optional. Default = None
        Parent group under which the results group will be searched for. Use
        this option when the results groups are contained in a different HDF5
        file from `h5_main`. By default, this function will search
        within the same group that contains `h5_main`

    Returns
    -------
    groups : list of references to :class:`h5py.Group` objects
        groups whose name contains the tool name and the dataset name
    """
    warn(
        'The behavior of find_results_groups is very likely to change soon '
        'and significantly. Use this function with caution', FutureWarning)

    if not isinstance(h5_main, h5py.Dataset):
        raise TypeError('h5_main should be a h5py.Dataset object')
    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_main.parent

    dset_name = h5_main.name.split('/')[-1]
    groups = []
    for key in h5_parent_group.keys():
        if dset_name in key and tool_name in key and isinstance(
                h5_parent_group[key], h5py.Group):
            groups.append(h5_parent_group[key])
    return groups
Code Example #13
File: hdf_utils.py    Project: saimani5/sidpy
def get_attr(h5_object, attr_name):
    """
    Returns the attribute from the h5py object

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group` or :class:`h5py.File`
        object whose attribute is desired
    attr_name : str
        Name of the attribute of interest

    Returns
    -------
    att_val : object
        value of attribute, in certain cases (byte strings or list of byte strings) reformatted to readily usable forms

    """
    if not isinstance(h5_object, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError(
            'h5_object should be a h5py.Dataset, h5py.Group or h5py.File object'
        )

    attr_name = validate_single_string_arg(attr_name, 'attr_name')

    if attr_name not in h5_object.attrs.keys():
        raise KeyError("'{}' is not an attribute in '{}'".format(
            attr_name, h5_object.name))

    h5py_major = int(h5py.__version__.split('.')[0])

    att_val = h5_object.attrs.get(attr_name)
    if isinstance(att_val, np.bytes_) or isinstance(att_val, bytes):
        att_val = att_val.decode('utf-8')

    elif isinstance(att_val, np.ndarray):
        if sys.version_info.major == 3:
            if att_val.dtype.type in [np.bytes_]:
                att_val = np.array([str(x, 'utf-8') for x in att_val])
            elif att_val.dtype.type in [np.object_] and h5py_major < 3:
                att_val = np.array([str(x, 'utf-8') for x in att_val])

    return att_val
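
A minimal usage sketch, assuming get_attr (above) is in scope; byte-string attributes come back as regular Python strings:

import h5py
import numpy as np

with h5py.File('attrs_demo.h5', 'a') as h5_f:
    h5_f.attrs['quantity'] = b'Current'
    h5_f.attrs['labels'] = np.array([b'X', b'Y'])

    print(get_attr(h5_f, 'quantity'))  # 'Current', decoded from bytes
    print(get_attr(h5_f, 'labels'))    # array of str rather than bytes
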
Code Example #14
 def __check_anc_before_creation(aux_prefix, dim_type='pos'):
     aux_prefix = validate_single_string_arg(aux_prefix,
                                             'aux_' + dim_type + '_prefix')
     if not aux_prefix.endswith('_'):
         aux_prefix += '_'
     if '-' in aux_prefix:
         warn(
             'aux_' + dim_type +
             ' should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(aux_prefix, aux_prefix.replace('-', '_')))
     aux_prefix = aux_prefix.replace('-', '_')
     for dset_name in [aux_prefix + 'Indices', aux_prefix + 'Values']:
         if dset_name in h5_parent_group.keys():
             # TODO: What if the contained data was correct?
             raise KeyError(
                 'Dataset named: ' + dset_name +
                 ' already exists in group: '
                 '{}. Consider passing these datasets using kwargs (if they are correct) instead of providing the pos_dims and spec_dims arguments'
                 .format(h5_parent_group.name))
     return aux_prefix
Code Example #15
File: prov_utils.py    Project: saimani5/sidpy
def create_indexed_group(h5_parent_group, base_name):
    """
    Creates a group with an indexed name (eg - 'Measurement_012') under
    ``h5_parent_group`` using the provided ``base_name`` as a prefix for the
    group's name

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        File or group within which the new group will be created
    base_name : str or unicode
        Prefix for the group name. This need not end with a '_'. It will be
        added automatically

    Returns
    -------
    h5_new_group : :class:`h5py.Group`
        Newly created group whose name is the prefix followed by the next
        available index
    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError(
            'h5_parent_group should be a h5py.File or Group object')
    base_name = validate_single_string_arg(base_name, 'base_name')

    group_name = assign_group_index(h5_parent_group, base_name)
    h5_new_group = h5_parent_group.create_group(group_name)
    write_book_keeping_attrs(h5_new_group)
    return h5_new_group
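
A short usage sketch, assuming create_indexed_group (above) and the helpers it calls are in scope:

import h5py

with h5py.File('indexed_groups_demo.h5', 'a') as h5_f:
    h5_meas_0 = create_indexed_group(h5_f, 'Measurement')
    h5_meas_1 = create_indexed_group(h5_f, 'Measurement')

    # Groups are auto-indexed: '/Measurement_000' and '/Measurement_001'
    print(h5_meas_0.name, h5_meas_1.name)
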
Code Example #16
    def __init__(self,
                 h5_main,
                 process_name,
                 parms_dict=None,
                 cores=None,
                 max_mem_mb=4 * 1024,
                 mem_multiplier=1.0,
                 lazy=False,
                 h5_target_group=None,
                 verbose=False):
        """
        Parameters
        ----------
        h5_main : :class:`~pyUSID.io.usi_data.USIDataset`
            The USID main HDF5 dataset over which the analysis will be performed.
        process_name : str
            Name of the process
        parms_dict : dict, optional. Default = None
            Dictionary of parameters that will be recorded alongside the
            results of the computation
        cores : uint, optional
            How many cores to use for the computation. Default: all available cores - 2 if operating outside MPI context
        max_mem_mb : uint, optional
            How much memory (in MB) to use for the computation. Default: 4096 MB
        mem_multiplier : float, optional. Default = 1
            mem_multiplier is the number that will be multiplied with the
            (byte) size of a single position in the source dataset in order to
            better estimate the number of positions that can be processed at
            any given time (how many pixels of the source and results datasets
            can be retained in memory). The default value of 1.0 only accounts
            for the source dataset. A value greater than 1 would account for
            the size of results datasets as well. For example, if the result
            dataset is the same size and precision as the source dataset,
            the multiplier will be 2 (1 for source, 1 for result)
        lazy : bool, optional. Default = False
            If True, read_data_chunk and write_results_chunk will operate on
            dask arrays. If False - everything will be in numpy.
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
        verbose : bool, Optional, default = False
            Whether or not to print debugging statements

        Attributes
        ----------
        self.h5_results_grp : :class:`h5py.Group`
            HDF5 group containing the HDF5 datasets that contain the results
            of the computation
        self.verbose : bool
            Whether or not to print debugging statements
        self.parms_dict : dict
            Dictionary of parameters for the computation
        self.duplicate_h5_groups : list
            List of :class:`h5py.Group` objects containing computational
            results that have been completely computed with the same
            set of parameters as those in self.parms_dict
        self.partial_h5_groups : list
            List of :class:`h5py.Group` objects containing computational
            results that have been partially computed with the same
            set of parameters as those in self.parms_dict
        self.process_name : str
            Name of the process. This is used for checking for existing
            completely and partially computed results as well as for naming
            the HDF5 group that will contain the results of the computation
        self._cores : uint
            Number of CPU cores to use for parallel computations.
            Ignored in the MPI context. Each rank gets 1 CPU core
        self._max_pos_per_read : uint
            Number of positions in the dataset to read per chunk
        self._status_dset_name : str
            Name of the HDF5 dataset that keeps track of the positions in the
            source dataset that have already been computed
        self._results : list
            List of objects returned as the result of computation performed by
            the self._map_function for each position in the current batch of
            positions that were processed
        self._h5_target_group : h5py.Group
            Location where existing / future results will be stored
        self.__resume_implemented : bool
            Whether or not this (child) class has implemented the
            self._get_existing_datasets() function
        self.__bytes_per_pos : uint
            Number of bytes used by one position of the source dataset
        self.mpi_comm : :class:`mpi4py.MPI.COMM_WORLD`
            MPI communicator. None if not running in an MPI context
        self.mpi_rank: uint
            MPI rank. Always 0 if not running in an MPI context
        self.mpi_size: uint
            Number of ranks in COMM_WORLD. 1 if not running in an MPI context
        self.__ranks_on_socket : uint
            Number of MPI ranks on a given CPU socket
        self.__socket_master_rank : uint
            Master MPI rank for a given CPU chip / socket
        self.__compute_jobs : array-like
            List of positions in the HDF5 dataset that need to be computed.
            This may not be a continuous list of numbers if multiple MPI
            workers had previously started computing and were interrupted.
        self.__start_pos : uint
            The index within self.__compute_jobs that a particular MPI rank /
            worker needs to start computing from.
        self.__rank_end_pos : uint
            The index within self.__compute_jobs up to which a particular MPI
            rank / worker needs to compute.
        self.__end_pos : uint
            The index within self.__compute_jobs up to which a particular MPI
            rank / worker needs to compute for the current batch of
            positions.
        self.__pixels_in_batch : array-like
            The positions being computed on by the current compute worker
        """

        if h5_main.file.mode != 'r+':
            raise TypeError(
                'Need to ensure that the file is in r+ mode to write results back to the file'
            )

        MPI = get_MPI()

        # Ensure that the file is opened in the correct comm or something
        if MPI is not None and h5_main.file.driver != 'mpio':
            warn('Code was called in MPI context but HDF5 file was not opened '
                 'with the "mpio" driver. JobLib will be used instead of MPI '
                 'for parallel computation')
            MPI = None

        if MPI is not None:
            # If we came here then, the user has intentionally asked for multi-node computation
            comm = MPI.COMM_WORLD
            self.mpi_comm = comm
            self.mpi_rank = comm.Get_rank()
            self.mpi_size = comm.Get_size()

            if verbose:
                print(
                    "Rank {} of {} on {} sees {} logical cores on the socket".
                    format(comm.Get_rank(), comm.Get_size(),
                           MPI.Get_processor_name(), cpu_count()))

            # First, ensure that cores=logical cores in node. No point being economical / considerate
            cores = psutil.cpu_count()

            # It is sufficient if just one rank checks all this.
            if self.mpi_rank == 0:
                print('Working on {} ranks via MPI'.format(self.mpi_size))

            if verbose and self.mpi_rank == 0:
                print('Finished getting all necessary MPI information')
            """
            # Not sure how to check for this correctly
            messg = None
            try:
                if h5_main.file.comm != comm:
                    messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD. Currently comm={}'
                            ''.format(h5_main.file.comm)
            except AttributeError:
                messg = 'The HDF5 file should have been opened with comm=MPI.COMM_WORLD'
            if messg is not None:
                raise TypeError(messg)
            """

        else:
            if verbose:
                print(
                    'No mpi4py found or script was not called via mpiexec / mpirun. '
                    'Assuming single node computation')
            self.mpi_comm = None
            self.mpi_size = 1
            self.mpi_rank = 0

        # Checking if dataset is "Main"
        if not check_if_main(h5_main, verbose=verbose and self.mpi_rank == 0):
            raise ValueError(
                'Provided dataset is not a "Main" dataset with necessary ancillary datasets'
            )

        if h5_target_group is not None:
            if not isinstance(h5_target_group, (h5py.Group, h5py.File)):
                raise TypeError(
                    "'h5_target_group' must be a h5py.Group object")
        else:
            h5_target_group = h5_main.parent
        self._h5_target_group = h5_target_group

        process_name = validate_single_string_arg(process_name, 'process_name')

        if parms_dict is None:
            parms_dict = {}
        else:
            if not isinstance(parms_dict, dict):
                raise TypeError("Expected 'parms_dict' of type: dict")

        if MPI is not None:
            MPI.COMM_WORLD.barrier()
        # Not sure if we need a barrier here.

        if verbose and self.mpi_rank == 0:
            print(
                'Rank {}: Upgrading from a regular h5py.Dataset to a USIDataset'
                .format(self.mpi_rank))

        # Generation of N-dimensional form would break things for some reason.
        self.h5_main = USIDataset(h5_main)

        if verbose and self.mpi_rank == 0:
            print('Rank {}: The HDF5 dataset is now a USIDataset'.format(
                self.mpi_rank))

        # Saving these as properties of the object:
        self.verbose = verbose
        self.__lazy = lazy
        self._cores = None
        self.__ranks_on_socket = 1
        self.__socket_master_rank = 0
        self._max_pos_per_read = None
        self.__bytes_per_pos = None

        # Now have to be careful here since the below properties are a function of the MPI rank
        self.__start_pos = None
        self.__rank_end_pos = None
        self.__end_pos = None
        self.__pixels_in_batch = None
        self.__compute_jobs = None

        # Determining the max size of the data that can be put into memory
        # All ranks go through this and they need to have this value anyway
        self._set_memory_and_cores(cores=cores,
                                   man_mem_limit=max_mem_mb,
                                   mem_multiplier=mem_multiplier)
        if verbose and self.mpi_rank == 0:
            print('Finished collecting info on memory and workers')
        self.duplicate_h5_groups = []
        self.partial_h5_groups = []
        self.process_name = process_name  # Reset this in the extended classes
        self.parms_dict = parms_dict
        """
        The name of the HDF5 dataset that should be present to signify which positions have already been computed
        This is NOT a fully private variable so that multiple processes can be run within a single group - Eg Fitter
        In the case of Fitter - this name can be changed from 'completed_guesses' to 'completed_fits'
        check_for_duplicates will be called by the Child class where they have the opportunity to change this
        variable before checking for duplicates
        """
        self._status_dset_name = 'completed_positions'

        self._results = None
        self.h5_results_grp = None

        # Check to see if the resuming feature has been implemented:
        self.__resume_implemented = False
        try:
            self._get_existing_datasets()
        except NotImplementedError:
            if verbose and self.mpi_rank == 0:
                print(
                    'It appears that this class may not be able to resume computations'
                )
        except:
            # NameError for variables that don't exist
            # AttributeError for self.var_name that don't exist
            # TypeError (NoneType) etc.
            self.__resume_implemented = True

        if self.mpi_rank == 0:
            print(
                'Consider calling test() to check results before calling compute() which computes on the entire'
                ' dataset and writes results to the HDF5 file')

        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )
Code Example #17
 def quantity(self, value):
     self._quantity = validate_single_string_arg(value, 'quantity')
Code Example #18
File: simple.py    Project: rajgiriUW/pyUSID
def write_ind_val_dsets(h5_parent_group, dimensions, is_spectral=True, verbose=False, base_name=None,
                        slow_to_fast=False):
    """
    Creates h5py.Datasets for the position OR spectroscopic indices and values of the data.
    Remember that the contents of the dataset can be changed if need be after the creation of the datasets.
    For example, if one of the spectroscopic dimensions (e.g. Bias) was sinusoidal and not linear, the specific
    dimension in the Spectroscopic_Values dataset can be manually overwritten.

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or :class:`h5py.File`
        Group under which the indices and values datasets will be created
    dimensions : Dimension or array-like of Dimension objects
        Sequence of Dimension objects that provides all necessary instructions for constructing the indices and values
        datasets
    is_spectral : bool, optional. default = True
        Spectroscopic (True) or Position (False)
    verbose : Boolean, optional
        Whether or not to print statements for debugging purposes
    base_name : str or unicode, optional
        Prefix for the datasets. Default: 'Position' when is_spectral is False, 'Spectroscopic' otherwise
    slow_to_fast : bool, Optional. Default=False
        Set to True if the dimensions are arranged from slowest varying to fastest varying.
        Set to False otherwise.

    Returns
    -------
    h5_spec_inds : h5py.Dataset
        Dataset containing the position or spectroscopic indices
    h5_spec_vals : h5py.Dataset
        Dataset containing the corresponding values

    Notes
    -----
    `steps`, `initial_values`, `labels`, and `units` must be the same length as
    `dimensions` when they are specified.

    Dimensions should be in the order from fastest varying to slowest.

    """
    if isinstance(dimensions, Dimension):
        dimensions = [dimensions]
    if not isinstance(dimensions, (list, np.ndarray, tuple)):
        raise TypeError('dimensions should be array-like ')
    if not np.all([isinstance(x, Dimension) for x in dimensions]):
        raise TypeError('dimensions should be a sequence of Dimension objects')

    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should be a h5py.File or Group object')
    if not is_editable_h5(h5_parent_group):
        raise ValueError('The provided h5 object is not valid / open')

    if base_name is not None:
        base_name = validate_single_string_arg(base_name, 'base_name')
        if not base_name.endswith('_'):
            base_name += '_'
    else:
        base_name = 'Position_'
        if is_spectral:
            base_name = 'Spectroscopic_'

    if not slow_to_fast:
        warn('In the future write_ind_val_dsets will default to requiring dimensions to be arranged from slowest to fastest varying')

    # check if the datasets already exist. If they do, there's no point in going any further
    for sub_name in ['Indices', 'Values']:
        if base_name + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: {}'.format(base_name + sub_name,
                                                                                     h5_parent_group.name))
    modes = [dim.mode for dim in dimensions]
    sing_mode = np.unique(modes)

    if sing_mode.size > 1:
        raise NotImplementedError('Cannot yet work on combinations of modes for Dimensions. Consider doing manually')

    sing_mode = sing_mode[0]

    if sing_mode == DimType.DEFAULT:
        if slow_to_fast:
            # Ensure that the dimensions are arranged from fast to slow instead
            dimensions = dimensions[::-1]
        indices, values = build_ind_val_matrices([dim.values for dim in dimensions],
                                                 is_spectral=is_spectral)

        # At this point, dimensions and unit values are arranged from fastest to slowest
        # We want dimensions to be arranged from slowest to fastest:
        rev_func = np.flipud if is_spectral else np.fliplr
        dimensions = dimensions[::-1]
        indices = rev_func(indices)
        values = rev_func(values)

    elif sing_mode == DimType.INCOMPLETE:
        lengths = np.unique([len(dim.values) for dim in dimensions])
        if len(lengths) > 1:
            raise ValueError('Values for dimensions not of same length')
        single_dim = np.arange(lengths[0], dtype=INDICES_DTYPE)
        indices = np.tile(single_dim, (2, 1)).T
        values = np.dstack(tuple([dim.values for dim in dimensions])).squeeze()

        if is_spectral:
            indices = indices.T
            values = values.T
    else:
        raise NotImplementedError('Cannot yet work on Dependent dimensions')

    if verbose:
        print('Indices:')
        print(indices)
        print('Values:')
        print(values)

    # Create the Datasets for both Indices and Values
    h5_indices = h5_parent_group.create_dataset(base_name + 'Indices', data=INDICES_DTYPE(indices), dtype=INDICES_DTYPE)
    h5_values = h5_parent_group.create_dataset(base_name + 'Values', data=VALUES_DTYPE(values), dtype=VALUES_DTYPE)

    for h5_dset in [h5_indices, h5_values]:
        write_simple_attrs(h5_dset, {'units': [x.units for x in dimensions], 'labels': [x.name for x in dimensions],
                                     'type': [dim.mode.value for dim in dimensions]})

    warn('pyUSID.io.hdf_utils.simple.write_ind_val_dsets no longer creates '
         'region references for each dimension. Please use '
         'pyUSID.io.reg_ref.write_region_references to manually create region '
         'references')

    return h5_indices, h5_values
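
A minimal sketch that builds spectroscopic ancillary datasets from two Dimension objects, assuming write_ind_val_dsets, the Dimension class from the examples above, and the helpers they call are all importable; the dimension sizes are illustrative:

import h5py
import numpy as np

with h5py.File('anc_dsets_demo.h5', 'a') as h5_f:
    dims = [Dimension('Bias', 'V', np.linspace(-2, 2, 16)),
            Dimension('Cycle', '', 3)]

    # Creates 'Spectroscopic_Indices' and 'Spectroscopic_Values' under h5_f:
    # 2 rows (one per dimension) by 16 * 3 = 48 columns each
    h5_inds, h5_vals = write_ind_val_dsets(h5_f, dims, is_spectral=True)
    print(h5_inds.shape, h5_vals.shape)
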
Code Example #19
 def name(self, value):
     self._name = validate_single_string_arg(value, 'name')
Code Example #20
 def units(self, value):
     self._units = validate_single_string_arg(value, 'units')
Code Example #21
File: simple.py    Project: rajgiriUW/pyUSID
def write_reduced_anc_dsets(h5_parent_group, h5_inds, h5_vals, dim_name, basename=None, is_spec=None,
                            verbose=False):
    """
    Creates new Ancillary Indices and Values datasets from the input datasets by dropping the specified dimensions

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or h5py.File
        Group under which the indices and values datasets will be created
    h5_inds : HDF5 Dataset
            Spectroscopic or Positions indices dataset
    h5_vals : HDF5 Dataset
            Spectroscopic or Positions values dataset
    dim_name : str or unicode or list of strings
            Names of the dimension(s) to remove
    basename : str or unicode, Optional
            String to which '_Indices' and '_Values' will be appended to get the names of the new datasets.
            Default = 'Position' or 'Spectroscopic'
    is_spec : bool, optional
            Whether or not the provided ancillary datasets are position or spectroscopic
            The user is recommended to supply this parameter whenever it is known or possible.
            By default, this function will attempt to recognize the answer based on the shape of the datasets.
    verbose : bool, optional. Default = False
            Whether or not to print debugging print statements

    Returns
    -------
    h5_inds_new : h5py.Dataset
            Reduced indices dataset
    h5_vals_new : h5py.Dataset
            Reduced values dataset

    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should either be a h5py.Group or h5py.File object')

    for param, param_name in zip([h5_inds, h5_vals], ['h5_inds', 'h5_vals']):
        if not isinstance(param, h5py.Dataset):
            raise TypeError(param_name + ' should be a h5py.Dataset object')
    if dim_name is not None:
        dim_name = validate_list_of_strings(dim_name, 'dim_name')

    all_dim_names = list(get_attr(h5_inds, 'labels'))
    for item in dim_name:
        if item not in all_dim_names:
            raise KeyError('Requested dimension: {} not in the list of labels: {}'.format(item, all_dim_names))

    ind_mat = h5_inds[()]
    val_mat = h5_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if ind_mat.shape[0] == ind_mat.shape[1]:
            raise ValueError('Unable to automatically guess whether the provided datasets are position or '
                             'spectroscopic. Please explicitly specify via the "is_spec" boolean kwarg')
        if ind_mat.shape[0] < ind_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object is of type: {}'.format(type(is_spec)))

    if basename is not None:
        basename = validate_single_string_arg(basename, 'basename')
        if basename.endswith('_'):
            basename = basename[:-1]
    else:
        if is_spec:
            basename = 'Spectroscopic'
        else:
            basename = 'Position'

    for sub_name in ['_Indices', '_Values']:
        if basename + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: {}'.format(basename + sub_name,
                                                                                     h5_parent_group.name))

    if set(dim_name) != set(all_dim_names):
        # At least one dimension will remain

        if verbose:
            print('All Dimensions: {}. Dimensions to be removed: {}'.format(all_dim_names, dim_name))

        if not is_spec:
            # Convert to spectral shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        # For all dimensions, find where the index = 0
        # basically, we are indexing all dimensions to 0
        first_indices = []
        keep_dim = np.ones(len(all_dim_names), dtype=bool)
        for cur_dim in dim_name:
            dim_ind = all_dim_names.index(cur_dim)
            keep_dim[dim_ind] = False
            # check equality against the minimum value instead of 0 to account for cases when a dimension does not start
            # from 0 (already been sliced) - think of multi-dimensional slicing!
            first_indices.append(ind_mat[dim_ind] == np.min(ind_mat[dim_ind]))
        first_indices = np.vstack(first_indices)

        if verbose:
            print('Raw first_indices:')
            print(first_indices)
            print('Dimensions to keep: {}'.format(keep_dim))

        step_starts = np.all(first_indices, axis=0)

        if verbose:
            print('Columns in dataset to keep:')
            print(step_starts)

        '''
        Extract all rows that we want to keep from input indices and values
        '''
        # TODO: handle TypeError: Indexing elements must be in increasing order
        ind_mat = ind_mat[keep_dim, :][:, step_starts]
        val_mat = val_mat[keep_dim, :][:, step_starts]

        if not is_spec:
            # Convert back to position shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        '''
        Create new Datasets to hold the data
        Name them based on basename
        '''
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=ind_mat, dtype=h5_inds.dtype)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=val_mat, dtype=h5_vals.dtype)
        # Extracting the labels from the original spectroscopic data sets
        labels = h5_inds.attrs['labels'][keep_dim]
        # Creating the dimension slices for the new spectroscopic data sets

        # Adding the labels and units to the new spectroscopic data sets
        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': labels, 'units': h5_inds.attrs['units'][keep_dim]})

    else:
        # Remove all dimensions:
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=np.array([[0]]), dtype=INDICES_DTYPE)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=np.array([[0]]), dtype=VALUES_DTYPE)

        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': ['Single_Step'], 'units': ['a. u.']})

    return h5_inds_new, h5_vals_new
Code Example #22
File: hdf_utils.py    Project: ziatdinovmax/sidpy
def copy_dataset(h5_orig_dset, h5_dest_grp, alias=None, verbose=False):
    """
    Copies the provided HDF5 dataset to the provided destination. This function
    is handy when needing to make copies of datasets to a different HDF5 file.

    Notes
    -----
    This function does NOT copy all linked objects such as ancillary
    datasets. Call `copy_linked_objects` to accomplish that goal.

    Parameters
    ----------
    h5_orig_dset : h5py.Dataset
        Dataset to be copied
    h5_dest_grp : h5py.Group or h5py.File
        Destination where the duplicate dataset will be created
    alias : str, optional. Default = name from `h5_orig_dset`
        Name to be assigned to the copied dataset
    verbose : bool, optional. Default = False
        Whether or not to print logs to assist in debugging

    Returns
    -------
    h5_new_dset : h5py.Dataset
        Reference to the newly copied dataset
    """
    if not isinstance(h5_orig_dset, h5py.Dataset):
        raise TypeError("'h5_orig_dset' should be a h5py.Dataset object")
    if not isinstance(h5_dest_grp, (h5py.File, h5py.Group)):
        raise TypeError("'h5_dest_grp' should either be a h5py.File or "
                        "h5py.Group object")
    if alias is not None:
        validate_single_string_arg(alias, 'alias')
    else:
        alias = h5_orig_dset.name.split('/')[-1]

    if alias in h5_dest_grp.keys():
        if verbose:
            warn('{} already contains an object with the same name: {}'
                 ''.format(h5_dest_grp, alias))
        h5_new_dset = h5_dest_grp[alias]
        if not isinstance(h5_new_dset, h5py.Dataset):
            raise TypeError(
                '{} already contains an object: {} with the desired'
                ' name which is not a dataset'.format(h5_dest_grp,
                                                      h5_new_dset))

        da_source = lazy_load_array(h5_orig_dset)
        da_dest = lazy_load_array(h5_new_dset)

        if da_source.shape != da_dest.shape:
            raise ValueError('Existing dataset: {} has a different shape '
                             'compared to the original dataset: {}'
                             ''.format(h5_new_dset, h5_orig_dset))
        if not da.allclose(da_source, da_dest):
            raise ValueError('Existing dataset: {} has different contents '
                             'compared to the original dataset: {}'
                             ''.format(h5_new_dset, h5_orig_dset))
    else:

        kwargs = {
            'shape': h5_orig_dset.shape,
            'dtype': h5_orig_dset.dtype,
            'compression': h5_orig_dset.compression,
            'chunks': h5_orig_dset.chunks
        }
        if h5_orig_dset.file.driver == 'mpio':
            if kwargs.pop('compression', None) is not None:
                warn('This HDF5 file has been opened with the '
                     '"mpio" communicator. mpi4py does not allow '
                     'creation of compressed datasets. Compression'
                     ' kwarg has been removed')
        if verbose:
            print('Creating new HDF5 dataset named: {} at: {} with'
                  ' kwargs: {}'.format(alias, h5_dest_grp, kwargs))
        h5_new_dset = h5_dest_grp.create_dataset(alias, **kwargs)
        if verbose:
            print('dask.array will copy data from source dataset '
                  'to new dataset')
        da.to_hdf5(h5_new_dset.file.filename,
                   {h5_new_dset.name: lazy_load_array(h5_orig_dset)})
    if verbose:
        print('Copying simple attributes of original dataset: {} to '
              'destination dataset: {}'.format(h5_orig_dset, h5_new_dset))

    copy_attributes(h5_orig_dset, h5_new_dset, skip_refs=True)
    # TODO: reinstate copy all region_refs()
    # copy_all_region_refs(h5_orig_dset, h5_new_dset)

    return h5_new_dset
Code Example #23
File: simple.py    Project: rajgiriUW/pyUSID
def check_for_old(h5_base, tool_name, new_parms=None, target_dset=None,
                  h5_parent_group=None, verbose=False):
    """
    Check to see if the results of a tool already exist and if they
    were performed with the same parameters.

    Parameters
    ----------
    h5_base : h5py.Dataset object
           Dataset to which the tool is being applied
    tool_name : str
           process or analysis name
    new_parms : dict, optional
           Parameters with which this tool will be performed.
    target_dset : str, optional, default = None
            Name of the dataset whose attributes will be compared against new_parms.
            Default - checking against the group
    h5_parent_group : h5py.Group, optional. Default = None
            The group to search under. Use this option when `h5_base` and
            the potential results groups (within `h5_parent_group`) are located
            in different HDF5 files. Default - search within h5_base.parent
    verbose : bool, optional, default = False
           Whether or not to print debugging statements

    Returns
    -------
    group : list
           List of all :class:`h5py.Group` objects with parameters matching those in `new_parms`
    """
    if not isinstance(h5_base, h5py.Dataset):
        raise TypeError('h5_base should be a h5py.Dataset object')
    tool_name = validate_single_string_arg(tool_name, 'tool_name')

    if h5_parent_group is not None:
        if not isinstance(h5_parent_group, (h5py.File, h5py.Group)):
            raise TypeError("'h5_parent_group' should either be a h5py.File "
                            "or h5py.Group object")
    else:
        h5_parent_group = h5_base.parent

    if new_parms is None:
        new_parms = dict()
    else:
        if not isinstance(new_parms, dict):
            raise TypeError('new_parms should be a dict')
    if target_dset is not None:
        target_dset = validate_single_string_arg(target_dset, 'target_dset')

    matching_groups = []
    groups = find_results_groups(h5_base, tool_name,
                                 h5_parent_group=h5_parent_group)

    for group in groups:
        if verbose:
            print('Looking at group - {}'.format(group.name.split('/')[-1]))

        h5_obj = group
        if target_dset is not None:
            if target_dset in group.keys():
                h5_obj = group[target_dset]
            else:
                if verbose:
                    print('{} did not contain the target dataset: {}'.format(group.name.split('/')[-1],
                                                                             target_dset))
                continue

        if check_for_matching_attrs(h5_obj, new_parms=new_parms, verbose=verbose):
            # return group
            matching_groups.append(group)

    return matching_groups
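
A usage sketch showing how this pairs with create_results_group, assuming check_for_old, create_results_group, write_simple_attrs and the helpers they call (including check_for_matching_attrs) are all in scope; the dataset and parameters are hypothetical:

import h5py

with h5py.File('fit_demo.h5', 'a') as h5_f:
    h5_main = h5_f.require_dataset('Raw_Data', shape=(16, 32), dtype='float32')
    parms = {'fit_method': 'least_squares', 'max_iterations': 100}

    # Reuse existing 'Fitting' results computed with identical parameters, if any
    duplicates = check_for_old(h5_main, 'Fitting', new_parms=parms)
    if duplicates:
        h5_grp = duplicates[-1]
    else:
        h5_grp = create_results_group(h5_main, 'Fitting')
        write_simple_attrs(h5_grp, parms)
    print(h5_grp.name)
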
Code Example #24
File: simple.py    Project: rajgiriUW/pyUSID
def create_empty_dataset(source_dset, dtype, dset_name, h5_group=None,
                         new_attrs=None, skip_refs=False):
    """
    Creates an empty dataset in the h5 file based on the provided dataset in
    the same or specified group

    Parameters
    ----------
    source_dset : h5py.Dataset object
        Source object that provides information on the group and shape of the dataset
    dtype : dtype
        Data type of the fit / guess datasets
    dset_name : String / Unicode
        Name of the dataset
    h5_group : :class:`h5py.Group`, optional. Default = None
        Group within which this dataset will be created
    new_attrs : dictionary (Optional)
        Any new attributes that need to be written to the dataset
    skip_refs : boolean, optional
        Should ObjectReferences be skipped when copying attributes from the
        `source_dset`

    Returns
    -------
    h5_new_dset : h5py.Dataset object
        Newly created dataset

    """
    if not isinstance(source_dset, h5py.Dataset):
        raise TypeError('source_dset should be a h5py.Dataset object')
    _ = validate_dtype(dtype)
    if new_attrs is not None:
        if not isinstance(new_attrs, dict):
            raise TypeError('new_attrs should be a dictionary')
    else:
        new_attrs = dict()

    if h5_group is None:
        h5_group = source_dset.parent
    else:
        if not isinstance(h5_group, (h5py.Group, h5py.File)):
            raise TypeError('h5_group should be a h5py.Group or h5py.File object')

        if source_dset.file != h5_group.file and not skip_refs:
            # Cannot carry over references
            warn('H5 object references will not be copied over since {} is in '
                 'a different HDF5 file as {}'.format(h5_group, source_dset))
            skip_refs = True

    dset_name = validate_single_string_arg(dset_name, 'dset_name')
    if '-' in dset_name:
        warn('dset_name should not contain the "-" character. Reformatted name from:{} to '
             '{}'.format(dset_name, dset_name.replace('-', '_')))
    dset_name = dset_name.replace('-', '_')

    kwargs = {'shape': source_dset.shape, 'dtype': dtype, 'compression': source_dset.compression,
              'chunks': source_dset.chunks}

    if source_dset.file.driver == 'mpio':
        if kwargs.pop('compression', None) is not None:
            warn('This HDF5 file has been opened with the "mpio" communicator. '
                 'mpi4py does not allow creation of compressed datasets. Compression kwarg has been removed')

    if dset_name in h5_group.keys():
        if isinstance(h5_group[dset_name], h5py.Dataset):
            warn('A dataset named: {} already exists in group: {}'.format(dset_name, h5_group.name))
            h5_new_dset = h5_group[dset_name]
            # Make sure it has the correct shape and dtype
            if any((source_dset.shape != h5_new_dset.shape, dtype != h5_new_dset.dtype)):
                warn('Either the shape (existing: {} desired: {}) or dtype (existing: {} desired: {}) of the dataset '
                     'did not match with expectations. Deleting and creating a new one.'.format(h5_new_dset.shape,
                                                                                                source_dset.shape,
                                                                                                h5_new_dset.dtype,
                                                                                                dtype))
                del h5_new_dset, h5_group[dset_name]
                h5_new_dset = h5_group.create_dataset(dset_name, **kwargs)
        else:
            raise KeyError('{} is already a {} in group: {}'.format(dset_name, type(h5_group[dset_name]),
                                                                    h5_group.name))

    else:
        h5_new_dset = h5_group.create_dataset(dset_name, **kwargs)

    # This should link the ancillary datasets correctly
    h5_new_dset = hut.copy_attributes(source_dset, h5_new_dset,
                                      skip_refs=skip_refs)
    if source_dset.file != h5_group.file:
        hut.copy_linked_objects(source_dset, h5_new_dset)
    h5_new_dset.attrs.update(new_attrs)

    if check_if_main(h5_new_dset):
        from ..usi_data import USIDataset

        h5_new_dset = USIDataset(h5_new_dset)
        # update book keeping attributes
        write_book_keeping_attrs(h5_new_dset)

    return h5_new_dset
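
A brief usage sketch, assuming create_empty_dataset (above) and the helper functions it calls (validate_dtype, copy_attributes, check_if_main, etc.) are importable; the names and dtype below are illustrative:

import h5py
import numpy as np

with h5py.File('empty_dset_demo.h5', 'a') as h5_f:
    h5_source = h5_f.require_dataset('Raw_Data', shape=(16, 32), dtype='float32')

    # New dataset mirrors the shape and chunking of the source but holds complex values
    h5_guess = create_empty_dataset(h5_source, np.complex64, 'Guess',
                                    new_attrs={'algorithm': 'demo'})
    print(h5_guess.shape, h5_guess.dtype)
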