Example #1
    def setUp(self):
        data_utils.make_beps_file()
        self.orig_labels_order = ['X', 'Y', 'Cycle', 'Bias']
        self.h5_file = h5py.File(data_utils.std_beps_path, mode='r')

        h5_grp = self.h5_file['/Raw_Measurement/']
        self.source_nd_s2f = h5_grp['n_dim_form'][()]
        self.source_nd_f2s = self.source_nd_s2f.transpose(1, 0, 3, 2)
        self.h5_source = USIDataset(h5_grp['source_main'])

        self.pos_dims = []
        self.spec_dims = []

        for dim_name, dim_units in zip(
                self.h5_source.pos_dim_labels,
                get_attr(self.h5_source.h5_pos_inds, 'units')):
            self.pos_dims.append(
                Dimension(dim_name, dim_units, h5_grp[dim_name][()]))

        for dim_name, dim_units in zip(
                self.h5_source.spec_dim_labels,
                get_attr(self.h5_source.h5_spec_inds, 'units')):
            self.spec_dims.append(
                Dimension(dim_name, dim_units, h5_grp[dim_name][()]))

        res_grp_0 = h5_grp['source_main-Fitter_000']
        self.results_0_nd_s2f = res_grp_0['n_dim_form'][()]
        self.results_0_nd_f2s = self.results_0_nd_s2f.transpose(1, 0, 3, 2)
        self.h5_compound = USIDataset(res_grp_0['results_main'])

        res_grp_1 = h5_grp['source_main-Fitter_001']
        self.results_1_nd_s2f = res_grp_1['n_dim_form'][()]
        self.results_1_nd_f2s = self.results_1_nd_s2f.transpose(1, 0, 3, 2)
        self.h5_complex = USIDataset(res_grp_1['results_main'])
Example #2
def validate_anc_dset_attrs(h5_inds, h5_vals, is_spec=True):
    """
    Validates the attributes of a pair of indices and values datasets.
    Throws ValueErrors if any rule is not satisfied

    Parameters
    ----------
    h5_inds : h5py.Dataset
        Indices dataset
    h5_vals : h5py.Dataset
        Values Dataset
    is_spec : bool, optional. Default = True
        Set to True for Spectroscopic datasets; False for Position datasets
    """
    def lists_match(left, right):
        if len(left) != len(right):
            return False
        return all([l_it == r_it for l_it, r_it in zip(left, right)])

    v_names = get_attr(h5_vals, 'labels')
    v_units = get_attr(h5_vals, 'units')
    i_names = get_attr(h5_inds, 'labels')
    i_units = get_attr(h5_inds, 'units')

    for names, units, dset_type in zip([v_names, i_names], [v_units, i_units],
                                       ['Values', 'Indices']):
        if len(names) != len(units):
            raise ValueError('Length of labels: {} and units: {} for the {} '
                             'dataset do not match'
                             ''.format(len(names), len(units), dset_type))
    for i_item, v_item, prop in zip([i_names, i_units], [v_names, v_units],
                                       ['labels', 'units']):
        if not lists_match(i_item, v_item):
            raise ValueError('The "{}" values of the Indices: {} and Values: '
                             '{} datasets do not match'.format(prop, i_item,
                                                               v_item))

    # Now check the rows / cols nums against size of any attr:
    if h5_inds.shape != h5_vals.shape:
        raise ValueError('Shape of Indices: {} and Values: {} datasets do '
                         'not match'.format(h5_inds.shape, h5_vals.shape))
    dim_ind = 1
    if is_spec:
        dim_ind = 0
    if h5_inds.shape[dim_ind] != len(v_names):
        raise ValueError('Length of mandatory attributes: {} did not match '
                         'dimension: {} of the ancillary dataset of shape: {}'
                         ''.format(len(v_names), dim_ind, h5_inds.shape))
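
A minimal usage sketch for the validator above, assuming a USID-formatted file named 'beps.h5' whose '/Raw_Measurement' group holds 'Spectroscopic_Indices' and 'Spectroscopic_Values' datasets (file name and paths are illustrative, not taken from the source):

import h5py

with h5py.File('beps.h5', mode='r') as h5_f:
    h5_inds = h5_f['/Raw_Measurement/Spectroscopic_Indices']
    h5_vals = h5_f['/Raw_Measurement/Spectroscopic_Values']
    # Raises a ValueError if the labels / units or shapes of the pair disagree
    validate_anc_dset_attrs(h5_inds, h5_vals, is_spec=True)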
Example #3
    def test_write_reg_ref_main_one_dim(self):
        file_path = 'test.h5'
        data_utils.delete_existing_file(file_path)
        data = np.random.rand(7)
        with h5py.File(file_path, mode='w') as h5_f:
            h5_dset = h5_f.create_dataset('Main', data=data)
            reg_refs = {
                'even_rows': (slice(0, None, 2)),
                'odd_rows': (slice(1, None, 2))
            }
            reg_ref.write_region_references(h5_dset,
                                            reg_refs,
                                            add_labels_attr=True)
            self.assertEqual(len(h5_dset.attrs), 1 + len(reg_refs))
            actual = get_attr(h5_dset, 'labels')
            self.assertTrue(
                np.all([
                    x == y for x, y in zip(actual, ['even_rows', 'odd_rows'])
                ]))

            expected_data = [data[0:None:2], data[1:None:2]]
            written_data = [
                h5_dset[h5_dset.attrs['even_rows']],
                h5_dset[h5_dset.attrs['odd_rows']]
            ]

            for exp, act in zip(expected_data, written_data):
                self.assertTrue(np.allclose(exp, act))

        os.remove(file_path)
Example #4
    def test_string_representation(self):
        usi_dset = self.h5_source
        h5_main = self.h5_file[usi_dset.name]
        actual = usi_dset.__repr__()
        actual = [line.strip() for line in actual.split("\n")]
        actual = [actual[line_ind] for line_ind in [0, 2, 4, 7, 8, 10, 11]]

        expected = list()
        expected.append(h5_main.__repr__())
        expected.append(h5_main.name)
        expected.append(
            get_attr(h5_main, "quantity") + " (" + get_attr(h5_main, "units") +
            ")")
        for h5_inds in [usi_dset.h5_pos_inds, usi_dset.h5_spec_inds]:
            for dim_name, dim_size in zip(get_attr(h5_inds, "labels"),
                                          get_dimensionality(h5_inds)):
                expected.append(dim_name + ' - size: ' + str(dim_size))
        self.assertTrue(np.all([x == y for x, y in zip(actual, expected)]))
Example #5
    def test_simple_region_ref_copy(self):
        # based on test_hdf_writer.test_write_legal_reg_ref_multi_dim_data()
        file_path = 'test.h5'
        data_utils.delete_existing_file(file_path)
        with h5py.File(file_path, mode='w') as h5_f:
            data = np.random.rand(5, 7)
            h5_orig_dset = h5_f.create_dataset('test', data=data)
            self.assertIsInstance(h5_orig_dset, h5py.Dataset)

            attrs = {
                'labels': {
                    'even_rows': (slice(0, None, 2), slice(None)),
                    'odd_rows': (slice(1, None, 2), slice(None))
                }
            }

            data_utils.write_main_reg_refs(h5_orig_dset, attrs['labels'])
            h5_f.flush()

            # Two attributes point to region references; one more is the 'labels' attribute
            self.assertEqual(len(h5_orig_dset.attrs), 1 + len(attrs['labels']))

            # check if the labels attribute was written:

            self.assertTrue(
                np.all([
                    x in list(attrs['labels'].keys())
                    for x in get_attr(h5_orig_dset, 'labels')
                ]))

            expected_data = [data[:None:2], data[1:None:2]]
            written_data = [
                h5_orig_dset[h5_orig_dset.attrs['even_rows']],
                h5_orig_dset[h5_orig_dset.attrs['odd_rows']]
            ]

            for exp, act in zip(expected_data, written_data):
                self.assertTrue(np.allclose(exp, act))

            # Now write a new dataset without the region reference:
            h5_new_dset = h5_f.create_dataset('other', data=data)
            self.assertIsInstance(h5_orig_dset, h5py.Dataset)
            h5_f.flush()

            for key in attrs['labels'].keys():
                reg_ref.simple_region_ref_copy(h5_orig_dset, h5_new_dset, key)

            # now check to make sure that this dataset also has the same region references:
            written_data = [
                h5_new_dset[h5_new_dset.attrs['even_rows']],
                h5_new_dset[h5_new_dset.attrs['odd_rows']]
            ]

            for exp, act in zip(expected_data, written_data):
                self.assertTrue(np.allclose(exp, act))

        os.remove(file_path)
Example #6
    def get_all_dimensions():
        pos_dims = []
        spec_dims = []
        with h5py.File(test_h5_file_path, mode='r') as h5_f:
            h5_raw_grp = h5_f['Raw_Measurement']
            usi_main = USIDataset(h5_raw_grp['source_main'])
            for dim_name, dim_units in zip(
                    usi_main.pos_dim_labels,
                    get_attr(usi_main.h5_pos_inds, 'units')):
                pos_dims.append(
                    Dimension(dim_name, dim_units, h5_raw_grp[dim_name][()]))

            for dim_name, dim_units in zip(
                    usi_main.spec_dim_labels,
                    get_attr(usi_main.h5_spec_inds, 'units')):
                spec_dims.append(
                    Dimension(dim_name, dim_units, h5_raw_grp[dim_name][()]))

        return pos_dims, spec_dims
Example #7
    def test_get_indices_for_region_ref_corners(self):
        with h5py.File(data_utils.std_beps_path, mode='r') as h5_f:
            h5_main = h5_f['/Raw_Measurement/source_main']
            ref_in = get_attr(h5_main, 'even_rows')
            ret_val = reg_ref.get_indices_for_region_ref(
                h5_main, ref_in, 'corners')
            expected_pos = np.repeat(np.arange(h5_main.shape[0])[::2], 2)
            expected_spec = np.tile(np.array([0, h5_main.shape[1] - 1]),
                                    expected_pos.size // 2)
            expected_corners = np.vstack((expected_pos, expected_spec)).T
            self.assertTrue(np.allclose(ret_val, expected_corners))
Example #8
    def test_get_indices_for_region_ref_slices(self):
        with h5py.File(data_utils.std_beps_path, mode='r') as h5_f:
            h5_main = h5_f['/Raw_Measurement/source_main']
            ref_in = get_attr(h5_main, 'even_rows')
            ret_val = reg_ref.get_indices_for_region_ref(
                h5_main, ref_in, 'slices')
            spec_slice = slice(0, h5_main.shape[1] - 1, None)
            expected_slices = np.array(
                [[slice(x, x, None), spec_slice]
                 for x in np.arange(h5_main.shape[0])[::2]])
            self.assertTrue(np.all(ret_val == expected_slices))
Example #9
    def _write_results_chunk(self):
        """
        Writes the provided SVD results to file

        """
        comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s))

        h5_svd_group = create_results_group(self.h5_main, self.process_name,
                                            h5_parent_group=self._h5_target_group)
        self.h5_results_grp = h5_svd_group
        self._write_source_dset_provenance()
        

        write_simple_attrs(h5_svd_group, self.parms_dict)
        write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'})

        h5_u = write_main_dataset(h5_svd_group, np.float32(self.__u), 'U', 'Abundance', 'a.u.', None, comp_dim,
                                  h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals,
                                  dtype=np.float32, chunks=calc_chunks(self.__u.shape, np.float32(0).itemsize))
        # print(get_attr(self.h5_main, 'quantity')[0])
        h5_v = write_main_dataset(h5_svd_group, self.__v, 'V', get_attr(self.h5_main, 'quantity')[0],
                                  'a.u.', comp_dim, None, h5_spec_inds=self.h5_main.h5_spec_inds,
                                  h5_spec_vals=self.h5_main.h5_spec_vals,
                                  chunks=calc_chunks(self.__v.shape, self.h5_main.dtype.itemsize))

        # No point making this 1D dataset a main dataset
        h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s))

        '''
        Check h5_main for plot group references.
        Copy them into V if they exist
        '''
        for key in self.h5_main.attrs.keys():
            if '_Plot_Group' not in key:
                continue

            ref_inds = get_indices_for_region_ref(self.h5_main, self.h5_main.attrs[key], return_method='corners')
            ref_inds = ref_inds.reshape([-1, 2, 2])
            ref_inds[:, 1, 0] = h5_v.shape[0] - 1

            svd_ref = create_region_reference(h5_v, ref_inds)

            h5_v.attrs[key] = svd_ref

        # Marking completion:
        self._status_dset_name = 'completed_positions'
        self._h5_status_dset = h5_svd_group.create_dataset(self._status_dset_name,
                                                           data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
        # keeping legacy option:
        h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]
Example #10
    def _create_guess_datasets(self):
        """
        Creates the h5 group, guess dataset, corresponding spectroscopic datasets and also
        links the guess dataset to the spectroscopic datasets.
        """
        self.h5_results_grp = create_results_group(
            self.h5_main,
            self.process_name,
            h5_parent_group=self._h5_target_group)
        write_simple_attrs(self.h5_results_grp, self.parms_dict)

        # If writing to a new HDF5 file:
        # Add back the data_type attribute - still being used in the visualizer
        if self.h5_results_grp.file != self.h5_main.file:
            write_simple_attrs(
                self.h5_results_grp.file,
                {'data_type': get_attr(self.h5_main.file, 'data_type')})

        ret_vals = write_reduced_anc_dsets(self.h5_results_grp,
                                           self.h5_main.h5_spec_inds,
                                           self.h5_main.h5_spec_vals,
                                           self._fit_dim_name,
                                           verbose=self.verbose)

        h5_sho_inds, h5_sho_vals = ret_vals

        self._h5_guess = write_main_dataset(
            self.h5_results_grp, (self.h5_main.shape[0], self.num_udvs_steps),
            'Guess',
            'SHO',
            'compound',
            None,
            None,
            h5_pos_inds=self.h5_main.h5_pos_inds,
            h5_pos_vals=self.h5_main.h5_pos_vals,
            h5_spec_inds=h5_sho_inds,
            h5_spec_vals=h5_sho_vals,
            chunks=(1, self.num_udvs_steps),
            dtype=sho32,
            main_dset_attrs=self.parms_dict,
            verbose=self.verbose)

        # Does not make sense to propagate region refs - nobody uses them
        # copy_region_refs(self.h5_main, self._h5_guess)

        self._h5_guess.file.flush()

        if self.verbose and self.mpi_rank == 0:
            print('Finished creating Guess dataset')
Example #11
def copy_main_attributes(h5_main, h5_new):
    """
    Copies the units and quantity name from one dataset to another

    Parameters
    ----------
    h5_main : h5py.Dataset
        Dataset containing the target attributes
    h5_new : h5py.Dataset
        Dataset to which the target attributes are to be copied

    """
    for param, param_name in zip([h5_main, h5_new], ['h5_main', 'h5_new']):
        if not isinstance(param, h5py.Dataset):
            raise TypeError(param_name + ' should be a h5py.Dataset object')

    for att_name in ['quantity', 'units']:
        if att_name not in h5_main.attrs:
            raise KeyError('Attribute: {} does not exist in {}'.format(att_name, h5_main))
        val = get_attr(h5_main, att_name)
        h5_new.attrs[att_name] = clean_string_att(val)
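
A short, hedged sketch of how copy_main_attributes could be called; the file name, dataset names and attribute values below are assumptions for illustration only:

import h5py
import numpy as np

with h5py.File('example.h5', mode='w') as h5_f:
    h5_src = h5_f.create_dataset('source', data=np.random.rand(4, 3))
    h5_src.attrs['quantity'] = 'Current'
    h5_src.attrs['units'] = 'nA'
    h5_dest = h5_f.create_dataset('destination', data=np.zeros((4, 3)))
    # Only the 'quantity' and 'units' attributes are copied over
    copy_main_attributes(h5_src, h5_dest)
    print(h5_dest.attrs['quantity'], h5_dest.attrs['units'])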
Example #12
def get_attr(h5_object, attr_name):
    """
    Returns the attribute from the h5py object

    Parameters
    ----------
    h5_object : :class:`h5py.Dataset`, :class:`h5py.Group` or :class:`h5py.File`
        object whose attribute is desired
    attr_name : str
        Name of the attribute of interest

    Returns
    -------
    att_val : object
        value of attribute, in certain cases (byte strings or list of byte strings) reformatted to readily usable forms

    """
    warn('pyUSID.io.hdf_utils.get_attr has been moved to '
         'sidpy.hdf.hdf_utils.get_attr. This copy in pyUSID will '
         'be removed in a future release. Please update your import statements')
    return hut.get_attr(h5_object, attr_name)
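
Since the wrapper above simply forwards to sidpy after warning about the move, callers can import get_attr from sidpy directly. This sketch assumes an HDF5 file with a 'units' attribute on a dataset named 'measurement' (both assumptions):

import h5py
from sidpy.hdf.hdf_utils import get_attr

with h5py.File('example.h5', mode='r') as h5_f:
    h5_dset = h5_f['measurement']
    # get_attr decodes byte strings into regular str objects
    units = get_attr(h5_dset, 'units')
    print(units)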
Example #13
    def __init__(
        self,
        h5_ref,
    ):
        """
        Parameters
        ----------
        h5_ref : :class:`h5py.Dataset`
            The dataset which is actually a USID Main dataset
            This dataset has dhdf5 dimensional scales

        Methods
        -------

        self.slice
        self.data_descriptor():
            returns the label of the dataset
        self.get_dimension_labels():
            returns the labels of the dimensions
        self.get_dimens_types()
            returns dictionary of dimension_types (keys) with the axis numbers as values
        self.visualize(slice):
            not tested
            basic visualization of dataset based on dimension_types and slice (optional)
            returns fig and axis


        Attributes
        ----------
        self.data_type: str
            The data_type (supported are:  'image', 'image_stack',  'spectrum', 'linescan' and 'spectrum_image' )
        self.quantity: str
            The physical quantity represented in the dataset
        self.units: str
            The units of the dataset
        self.axes_units: list of str
            The units for the dimensional axes.
        self.axes_quantities: list of str
            The quantities (physical property) for the dimensional axes.
        self.dimension_types: list of str
            The dimension_types (supported is 'spatial', 'spectral', 'reciprocal' and 'time') for the dimensional axes.
        self.axes_first_pixels: list of int
            A list of the sizes of first pixel of each  dimension.

    """

        super(NSIDataset, self).__init__(h5_ref.id)

        self.data_type = get_attr(self, 'data_type')
        self.quantity = self.attrs['quantity']
        self.units = self.attrs['units']

        #self.axes_names = [dim.label for dim in h5_ref.dims]
        units = []
        quantities = []
        dimension_types = []
        pixel_sizes = []

        for dim in h5_ref.dims:
            units.append(get_attr(dim[0], 'units'))
            quantities.append(get_attr(dim[0], 'quantity'))
            dimension_types.append(get_attr(dim[0], 'dimension_type'))
            pixel_sizes.append(abs(dim[0][1] - dim[0][0]))
        self.axes_units = units
        self.axes_quantities = quantities
        self.dimension_types = dimension_types
        self.axes_first_pixels = pixel_sizes

        self.data_descriptor = '{} ({})'.format(get_attr(self, 'quantity'),
                                                get_attr(self, 'units'))
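
A hedged sketch of instantiating the dataset class above, assuming an NSID-formatted file in which '/Measurement_000/Channel_000/nDim_Data' is a main dataset carrying 'quantity', 'units' and 'data_type' attributes plus dimension scales (the file name and path are assumptions, not from the source):

import h5py

with h5py.File('nsid_file.h5', mode='r') as h5_f:
    h5_main = h5_f['/Measurement_000/Channel_000/nDim_Data']
    dset = NSIDataset(h5_main)
    print(dset.data_descriptor)
    print(dset.dimension_types)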
Example #14
def reshape_to_n_dims(h5_main,
                      h5_pos=None,
                      h5_spec=None,
                      get_labels=False,
                      verbose=False,
                      sort_dims=False,
                      lazy=False):
    """
    Reshape the input 2D matrix to be N-dimensions based on the
    position and spectroscopic datasets.

    Parameters
    ----------
    h5_main : HDF5 Dataset
        2D data to be reshaped
    h5_pos : HDF5 Dataset, optional
        Position indices corresponding to rows in `h5_main`
    h5_spec : HDF5 Dataset, optional
        Spectroscopic indices corresponding to columns in `h5_main`
    get_labels : bool, optional
        Whether or not to return the dimension labels.  Default False
    verbose : bool, optional
        Whether or not to print debugging statements
    sort_dims : bool
        If True, the data is sorted so that the dimensions are in order from slowest to fastest
        If False, the data is kept in the original order
        If `get_labels` is also True, the labels are sorted as well.
    lazy : bool, optional. Default = False
        If False, ds_Nd will be a numpy.ndarray object - this is suitable if the HDF5 dataset fits into memory
        If True, ds_Nd will be a dask.array object - this is suitable if the HDF5 dataset is too large to fit into
        memory. Note that this will be a lazy computation, meaning that the returned object only contains the
        instructions. In order to get the actual values as numpy arrays, call ds_Nd.compute()

    Returns
    -------
    ds_Nd : N-D numpy array or dask.array object
        N dimensional array arranged as [positions slowest to fastest, spectroscopic slowest to fastest]
    success : boolean or string
        True if full reshape was successful

        "Positions" if it was only possible to reshape by
        the position dimensions

        False if no reshape was possible
    ds_labels : list of str
        List of the labels of each dimension of `ds_Nd`

    Notes
    -----
    If either `h5_pos` or `h5_spec` are not provided, the function will first
    attempt to find them as attributes of `h5_main`.  If that fails, it will
    generate dummy values for them.

    """
    # TODO: automatically switch on lazy if the data is larger than memory
    # TODO: sort_dims does not appear to do much. Functions as though it was always True

    if h5_pos is None and h5_spec is None:
        if not check_if_main(h5_main):
            raise ValueError(
                'if h5_main is a h5py.Dataset it should be a Main dataset')
    else:
        if not isinstance(h5_main, (h5py.Dataset, np.ndarray, da.core.Array)):
            raise TypeError(
                'h5_main should either be a h5py.Dataset or numpy array')

    if h5_pos is not None:
        if not isinstance(h5_pos, (h5py.Dataset, np.ndarray, da.core.Array)):
            raise TypeError(
                'h5_pos should either be a h5py.Dataset or numpy array')
        if h5_pos.shape[0] != h5_main.shape[0]:
            raise ValueError(
                'The size of h5_pos: {} does not match with h5_main: {}'.
                format(h5_pos.shape, h5_main.shape))

    if h5_spec is not None:
        if not isinstance(h5_spec, (h5py.Dataset, np.ndarray, da.core.Array)):
            raise TypeError(
                'h5_spec should either be a h5py.Dataset or numpy array')
        if h5_spec.shape[1] != h5_main.shape[1]:
            raise ValueError(
                'The size of h5_spec: {} does not match with h5_main: {}'.
                format(h5_spec.shape, h5_main.shape))

    pos_labs = np.array(['Positions'])
    spec_labs = np.array(['Spectral_Step'])
    if h5_pos is None:
        """
        Get the Position datasets from the references if possible
        """
        if isinstance(h5_main, h5py.Dataset):
            try:
                h5_pos = h5_main.file[h5_main.attrs['Position_Indices']]
                ds_pos = h5_pos[()]
                pos_labs = get_attr(h5_pos, 'labels')
            except KeyError:
                print('No position datasets found as attributes of {}'.format(
                    h5_main.name))
                if len(h5_main.shape) > 1:
                    ds_pos = np.arange(h5_main.shape[0],
                                       dtype=INDICES_DTYPE).reshape(-1, 1)
                    pos_labs = np.array([
                        'Position Dimension {}'.format(ipos)
                        for ipos in range(ds_pos.shape[1])
                    ])
                else:
                    ds_pos = np.array(0, dtype=INDICES_DTYPE).reshape(-1, 1)
        else:
            ds_pos = np.arange(h5_main.shape[0],
                               dtype=INDICES_DTYPE).reshape(-1, 1)
            pos_labs = np.array([
                'Position Dimension {}'.format(ipos)
                for ipos in range(ds_pos.shape[1])
            ])
    elif isinstance(h5_pos, h5py.Dataset):
        """
        Position Indices dataset was provided
        """
        ds_pos = h5_pos[()]
        pos_labs = get_attr(h5_pos, 'labels')
    elif isinstance(h5_pos, (np.ndarray, da.core.Array)):
        ds_pos = np.atleast_2d(h5_pos)
        pos_labs = np.array([
            'Position Dimension {}'.format(ipos)
            for ipos in range(ds_pos.shape[1])
        ])
    else:
        raise TypeError('Position Indices must be either h5py.Dataset or None')

    if h5_spec is None:
        """
        Get the Spectroscopic datasets from the references if possible
        """
        if isinstance(h5_main, h5py.Dataset):
            try:
                h5_spec = h5_main.file[h5_main.attrs['Spectroscopic_Indices']]
                ds_spec = h5_spec[()]
                spec_labs = get_attr(h5_spec, 'labels')
            except KeyError:
                print('No spectroscopic datasets found as attributes of {}'.
                      format(h5_main.name))
                if len(h5_main.shape) > 1:
                    ds_spec = np.arange(h5_main.shape[1],
                                        dtype=INDICES_DTYPE).reshape([1, -1])
                    spec_labs = np.array([
                        'Spectral Dimension {}'.format(ispec)
                        for ispec in range(ds_spec.shape[0])
                    ])
                else:
                    ds_spec = np.array(0, dtype=INDICES_DTYPE).reshape([1, 1])
        else:
            ds_spec = np.arange(h5_main.shape[1],
                                dtype=INDICES_DTYPE).reshape([1, -1])
            spec_labs = np.array([
                'Spectral Dimension {}'.format(ispec)
                for ispec in range(ds_spec.shape[0])
            ])

    elif isinstance(h5_spec, h5py.Dataset):
        """
        Spectroscopic Indices dataset was provided
        """
        ds_spec = h5_spec[()]
        spec_labs = get_attr(h5_spec, 'labels')
    elif isinstance(h5_spec, (np.ndarray, da.core.Array)):
        ds_spec = h5_spec
        spec_labs = np.array([
            'Spectral Dimension {}'.format(ispec)
            for ispec in range(ds_spec.shape[0])
        ])
    else:
        raise TypeError(
            'Spectroscopic Indices must be either h5py.Dataset or None')
    '''
    Sort the indices from fastest to slowest
    '''
    pos_sort = get_sort_order(np.transpose(ds_pos))
    spec_sort = get_sort_order(ds_spec)

    if verbose:
        print('Position dimensions:', pos_labs)
        print('Position sort order:', pos_sort)
        print('Spectroscopic Dimensions:', spec_labs)
        print('Spectroscopic sort order:', spec_sort)
    '''
    Get the size of each dimension in the sorted order
    '''
    pos_dims = get_dimensionality(np.transpose(ds_pos), pos_sort)
    spec_dims = get_dimensionality(ds_spec, spec_sort)

    if np.prod(pos_dims) != h5_main.shape[0]:
        mesg = 'Product of position dimension sizes: {} = {} not matching ' \
               'with size of first axis of main dataset: {}. One or more ' \
               'dimensions are dependent dimensions and not marked as such' \
               '.'.format(pos_dims, np.prod(pos_dims), h5_main.shape[0])
        raise ValueError(mesg)
    if np.prod(spec_dims) != h5_main.shape[1]:
        mesg = 'Product of spectroscopic dimension sizes: {} = {} not matching ' \
               'with size of second axis of main dataset: {}. One or more ' \
               'dimensions are dependent dimensions and not marked as such' \
               '.'.format(spec_dims, np.prod(spec_dims), h5_main.shape[1])
        raise ValueError(mesg)

    if verbose:
        print('\nPosition dimensions (sort applied):', pos_labs[pos_sort])
        print('Position dimensionality (sort applied):', pos_dims)
        print('Spectroscopic dimensions (sort applied):', spec_labs[spec_sort])
        print('Spectroscopic dimensionality (sort applied):', spec_dims)

    if lazy:
        ds_main = lazy_load_array(h5_main)
    else:
        ds_main = h5_main[()]
    """
    Now we reshape the dataset based on those dimensions
    numpy reshapes correctly when the dimensions are arranged from slowest to fastest. 
    Since the sort orders we have are from fastest to slowest, we need to reverse the orders
    for both the position and spectroscopic dimensions
    """
    if verbose:
        print('Will attempt to reshape main dataset from:\n{} to {}'.format(
            ds_main.shape, pos_dims[::-1] + spec_dims[::-1]))

    try:
        ds_Nd = ds_main.reshape(pos_dims[::-1] + spec_dims[::-1])

    except ValueError:
        warn(
            'Could not reshape dataset to full N-dimensional form.  Attempting reshape based on position only.'
        )
        try:
            ds_Nd = ds_main.reshape(pos_dims[::-1] + [-1])

        except ValueError:
            warn(
                'Reshape by position only also failed.  Will keep dataset in 2d form.'
            )
            if get_labels:
                return ds_main, False, ['Position', 'Spectral Step']
            else:
                return ds_main, False

        # No exception
        else:
            if get_labels:
                return ds_Nd, 'Positions', ['Position'] + spec_labs
            else:
                return ds_Nd, 'Positions'

    all_labels = np.hstack(
        (pos_labs[pos_sort][::-1], spec_labs[spec_sort][::-1]))

    if verbose:
        print('\nAfter reshaping, labels are', all_labels)
        print('Data shape is', ds_Nd.shape)
    """
    At this point, the data is arranged from slowest to fastest dimension in both pos and spec
    """
    if sort_dims:
        results = [ds_Nd, True]
        if get_labels:
            results.append(all_labels)
        return results

    if verbose:
        print(
            '\nGoing to put dimensions back in the same order as in the file:')

    swap_axes = list()
    # Compare the original order of the pos / spec labels with where these dimensions occur in the sorted labels
    for lab in pos_labs:
        swap_axes.append(np.argwhere(all_labels == lab).squeeze())
    for lab in spec_labs:
        swap_axes.append(np.argwhere(all_labels == lab).squeeze())

    swap_axes = np.array(swap_axes)

    if verbose:
        print('Axes will permuted in this order:', swap_axes)
        print('New labels ordering:', all_labels[swap_axes])

    ds_Nd = ds_Nd.transpose(tuple(swap_axes))
    results = [ds_Nd, True]

    if verbose:
        print('Dataset now of shape:', ds_Nd.shape)

    if get_labels:
        '''
        Get the labels in the proper order
        '''
        results.append(all_labels[swap_axes])

    return results
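
A minimal sketch of calling the reshape helper on a USID main dataset; it assumes pyUSID exposes reshape_to_n_dims from pyUSID.io.hdf_utils and that 'beps.h5' contains a main dataset at '/Raw_Measurement/source_main' (both are assumptions for illustration):

import h5py
from pyUSID.io.hdf_utils import reshape_to_n_dims

with h5py.File('beps.h5', mode='r') as h5_f:
    h5_main = h5_f['/Raw_Measurement/source_main']
    # ds_nd is ordered [positions slowest to fastest, spectroscopic slowest to fastest]
    ds_nd, success, labels = reshape_to_n_dims(h5_main, get_labels=True, sort_dims=True)
    print(success, labels, ds_nd.shape)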
Example #15
def rebuild_svd(h5_main, components=None, cores=None, max_RAM_mb=1024):
    """
    Rebuild the Image from the SVD results on the windows
    Optionally, only use components less than n_comp.

    :param h5_main: dataset which SVD was performed on
    :type h5_main: hdf5 Dataset
    
    :param components: 
        Defines which components to keep
        Default - None, all components kept

        Input Types
        integer : Components less than the input will be kept
        length 2 iterable of integers : Integers define start and stop of component slice to retain
        other iterable of integers or slice : Selection of component indices to retain
    :type components: {int, iterable of int, slice} optional

    :param cores: How many cores should be used to rebuild
        Default - None, all but 2 cores will be used, min 1
    :type cores: int, optional
    
    :param max_RAM_mb: Maximum amount of memory to use when rebuilding, in Mb.
        Default - 1024 Mb
    :type max_RAM_mb: int, optional

    :raise: KeyError if SVD results not found

    :returns: rebuilt dataset
    :rtype: HDF5 Dataset

    """

    if not isinstance(h5_main, USIDataset):
        h5_main = USIDataset(h5_main)

    comp_slice, num_comps = get_component_slice(
        components, total_components=h5_main.shape[1])
    if isinstance(comp_slice, np.ndarray):
        comp_slice = list(comp_slice)
    dset_name = h5_main.name.split('/')[-1]

    # Ensuring that at least one core is available for use / 2 cores are available for other use
    max_cores = max(1, cpu_count() - 2)
    #         print('max_cores',max_cores)
    if cores is not None:
        cores = min(round(abs(cores)), max_cores)
    else:
        cores = max_cores

    max_memory = min(max_RAM_mb * 1024**2, 0.75 * get_available_memory())
    if cores != 1:
        max_memory = int(max_memory / 2)
    '''
    Get the handles for the SVD results
    '''
    try:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]

        h5_S = h5_svd_group['S']
        h5_U = h5_svd_group['U']
        h5_V = h5_svd_group['V']

    except KeyError:
        raise KeyError(
            'SVD Results for {dset} were not found.'.format(dset=dset_name))
    except:
        raise

    func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_V)
    '''
    Calculate the size of a single batch that will fit in the available memory
    '''
    n_comps = h5_S[comp_slice].size
    mem_per_pix = (h5_U.dtype.itemsize +
                   h5_V.dtype.itemsize * h5_V.shape[1]) * n_comps
    fixed_mem = h5_main.size * h5_main.dtype.itemsize

    if cores is None:
        free_mem = max_memory - fixed_mem
    else:
        free_mem = max_memory * 2 - fixed_mem

    batch_size = int(round(float(free_mem) / mem_per_pix))

    if batch_size < 0:
        print('Batches listed were negative', batch_size)
        batch_size = 100

    batch_slices = gen_batches(h5_U.shape[0], batch_size)

    print('Reconstructing in batches of {} positions.'.format(batch_size))
    print('Batches should be {} Mb each.'.format(mem_per_pix * batch_size /
                                                 1024.0**2))
    '''
    Loop over all batches.
    '''
    ds_V = np.dot(np.diag(h5_S[comp_slice]), func(h5_V[comp_slice, :]))
    rebuild = np.zeros((h5_main.shape[0], ds_V.shape[1]))
    for ibatch, batch in enumerate(batch_slices):
        rebuild[batch, :] += np.dot(h5_U[batch, comp_slice], ds_V)

    rebuild = stack_real_to_target_dtype(rebuild, h5_V.dtype)

    print(
        'Completed reconstruction of data from SVD results.  Writing to file.')
    '''
    Create the Group and dataset to hold the rebuild data
    '''
    rebuilt_grp = create_indexed_group(h5_svd_group, 'Rebuilt_Data')
    h5_rebuilt = write_main_dataset(rebuilt_grp,
                                    rebuild,
                                    'Rebuilt_Data',
                                    get_attr(h5_main, 'quantity'),
                                    get_attr(h5_main, 'units'),
                                    None,
                                    None,
                                    h5_pos_inds=h5_main.h5_pos_inds,
                                    h5_pos_vals=h5_main.h5_pos_vals,
                                    h5_spec_inds=h5_main.h5_spec_inds,
                                    h5_spec_vals=h5_main.h5_spec_vals,
                                    chunks=h5_main.chunks,
                                    compression=h5_main.compression)

    if isinstance(comp_slice, slice):
        rebuilt_grp.attrs['components_used'] = '{}-{}'.format(
            comp_slice.start, comp_slice.stop)
    else:
        rebuilt_grp.attrs['components_used'] = components

    copy_attributes(h5_main, h5_rebuilt, skip_refs=False)

    h5_main.file.flush()

    print('Done writing reconstructed data to file.')

    return h5_rebuilt
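
A hedged sketch of invoking the rebuild above; it assumes 'image_windows.h5' already contains a main dataset at '/Measurement_000/Channel_000/Raw_Data' with a completed 'SVD' results group (the file name and path are assumptions):

import h5py

with h5py.File('image_windows.h5', mode='r+') as h5_f:
    h5_main = h5_f['/Measurement_000/Channel_000/Raw_Data']
    # Reconstruct the data using only the first 16 SVD components
    h5_rebuilt = rebuild_svd(h5_main, components=16)
    print(h5_rebuilt.name, h5_rebuilt.shape)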
Example #16
    def _write_results_chunk(self):
        """
        Writes the decomposition results (components and projections) to the h5 file

        Returns
        ---------
        h5_group : HDF5 Group reference
            Reference to the group that contains the decomposition results
        """

        self.h5_results_grp = create_results_group(
            self.h5_main,
            self.process_name,
            h5_parent_group=self._h5_target_group)
        self._write_source_dset_provenance()
        write_simple_attrs(self.h5_results_grp, self.parms_dict)
        write_simple_attrs(
            self.h5_results_grp, {
                'n_components': self.__components.shape[0],
                'n_samples': self.h5_main.shape[0]
            })

        decomp_desc = Dimension('Endmember', 'a. u.',
                                self.__components.shape[0])

        # equivalent to V - compound / complex
        h5_components = write_main_dataset(
            self.h5_results_grp,
            self.__components,
            'Components',
            get_attr(self.h5_main, 'quantity')[0],
            'a.u.',
            decomp_desc,
            None,
            h5_spec_inds=self.h5_main.h5_spec_inds,
            h5_spec_vals=self.h5_main.h5_spec_vals)

        # equivalent of U - real
        h5_projections = write_main_dataset(
            self.h5_results_grp,
            np.float32(self.__projection),
            'Projection',
            'abundance',
            'a.u.',
            None,
            decomp_desc,
            dtype=np.float32,
            h5_pos_inds=self.h5_main.h5_pos_inds,
            h5_pos_vals=self.h5_main.h5_pos_vals)

        # return the h5 group object
        self.h5_results_grp = self.h5_results_grp

        # Marking completion:
        self._status_dset_name = 'completed_positions'
        self._h5_status_dset = self.h5_results_grp.create_dataset(
            self._status_dset_name,
            data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
        # keeping legacy option:
        self.h5_results_grp.attrs['last_pixel'] = self.h5_main.shape[0]

        return self.h5_results_grp
Example #17
def get_unit_values(ds_inds,
                    ds_vals,
                    dim_names=None,
                    all_dim_names=None,
                    is_spec=None,
                    verbose=False):
    """
    Gets the unit arrays of values that describe the spectroscopic dimensions

    Parameters
    ----------
    ds_inds : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Indices dataset
    ds_vals : h5py.Dataset or numpy.ndarray
        Spectroscopic or Position Values dataset
    dim_names : str, or list of str, Optional
        Names of the dimensions of interest. Default = all
    all_dim_names : list of str, Optional
        Names of all the dimensions in these datasets. Use this if supplying numpy arrays instead of h5py.Dataset
        objects for h5_inds, h5_vals since there is no other way of getting the dimension names.
    is_spec : bool, optional
        Whether or not the provided ancillary datasets are position or spectroscopic
        The user is recommended to supply this parameter whenever it is known
        By default, this function will attempt to recognize the answer based on the shape of the datasets.
    verbose : bool, optional
        Whether or not to print debugging statements. Default - off

    Note - this function can be extended / modified for ancillary position dimensions as well

    Returns
    -------
    unit_values : dict
        Dictionary containing the unit array for each dimension. The name of the dimensions are the keys.

    """
    if all_dim_names is None:
        allowed_types = h5py.Dataset
    else:
        all_dim_names = validate_list_of_strings(all_dim_names,
                                                 'all_dim_names')
        all_dim_names = np.array(all_dim_names)
        allowed_types = (h5py.Dataset, np.ndarray)

    for dset, dset_name in zip([ds_inds, ds_vals], ['ds_inds', 'ds_vals']):
        if not isinstance(dset, allowed_types):
            raise TypeError(dset_name +
                            ' should be of type: {}'.format(allowed_types))

    # For now, we will throw an error if even a single dimension is listed as an incomplete dimension:
    if isinstance(ds_inds, h5py.Dataset):
        if np.any([
                'incomplete_dimensions' in dset.attrs.keys()
                for dset in [ds_inds, ds_vals]
        ]):
            try:
                incomp_dims_inds = get_attr(ds_inds, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_inds = None
            try:
                incomp_dims_vals = get_attr(ds_vals, 'incomplete_dimensions')
            except KeyError:
                incomp_dims_vals = None
            if incomp_dims_inds is None and incomp_dims_vals is not None:
                incomp_dims = incomp_dims_vals
            elif incomp_dims_inds is not None and incomp_dims_vals is None:
                incomp_dims = incomp_dims_inds
            else:
                # ensure that both attributes are the same
                if incomp_dims_vals != incomp_dims_inds:
                    raise ValueError(
                        'Provided indices ({}) and values ({}) datasets were marked with different values '
                        'for incomplete_dimensions.'.format(
                            incomp_dims_inds, incomp_dims_vals))
                incomp_dims = incomp_dims_vals

            all_dim_names = get_attr(ds_inds, 'labels')
            raise ValueError(
                'Among all dimensions: {}, These dimensions were marked as incomplete dimensions: {}'
                '. You are recommended to find unit values manually'.format(
                    all_dim_names, incomp_dims))

    # Do we need to check that the provided inds and vals correspond to the same main dataset?
    if ds_inds.shape != ds_vals.shape:
        raise ValueError(
            'h5_inds: {} and h5_vals: {} should have the same shapes'.format(
                ds_inds.shape, ds_vals.shape))

    if all_dim_names is None:
        all_dim_names = get_attr(ds_inds, 'labels')
    if verbose:
        print('All dimensions: {}'.format(all_dim_names))

    # First load to memory
    inds_mat = ds_inds[()]
    vals_mat = ds_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if inds_mat.shape[0] < inds_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError(
                'is_spec should be a boolean. Provided object is of type: {}'.
                format(type(is_spec)))

    if verbose:
        print(
            'Ancillary matrices of shape: {}, hence determined to be Spectroscopic:{}'
            .format(inds_mat.shape, is_spec))

    if not is_spec:
        # Convert to spectral shape
        inds_mat = np.transpose(inds_mat)
        vals_mat = np.transpose(vals_mat)

    if len(all_dim_names) != inds_mat.shape[0]:
        raise ValueError(
            'Length of dimension names list: {} not matching with shape of dataset: {}'
            '.'.format(len(all_dim_names), inds_mat.shape[0]))

    if dim_names is None:
        dim_names = all_dim_names
        if verbose:
            print('Going to return unit values for all dimensions: {}'.format(
                all_dim_names))
    else:
        dim_names = validate_list_of_strings(dim_names, 'dim_names')

        if verbose:
            print(
                'Checking to make sure that the target dimension names: {} exist in the datasets attributes: {}'
                '.'.format(dim_names, all_dim_names))

        # check to make sure that the dimension names exist in the datasets:
        for dim_name in dim_names:
            if dim_name not in all_dim_names:
                raise KeyError(
                    'Dimension {} does not exist in the provided ancillary datasets'
                    .format(dim_name))

    unit_values = dict()
    for dim_name in all_dim_names:
        # Find the row in the spectroscopic indices that corresponds to the dimensions we want to slice:
        if verbose:
            print('Looking for dimension: {} in {}'.format(
                dim_name, dim_names))
        desired_row_ind = np.where(all_dim_names == dim_name)[0][0]

        inds_for_dim = inds_mat[desired_row_ind]
        # Wherever this dimension goes to 0 - start of a new tile
        starts = np.where(inds_for_dim == np.min(inds_for_dim))[0]
        if starts[0] != 0:
            raise ValueError('Spectroscopic Indices for dimension: "{}" not '
                             'starting with 0. Please fix this and try again'
                             '.'.format(dim_name))

        # There may be repetitions in addition to tiling. Find how the positions increase.
        # 1 = repetition, > 1 = new tile
        step_sizes = np.hstack(([1], np.diff(starts)))
        # This array is of the same length as the full indices array

        # We should expect only two values of step sizes for a regular dimension (tiles of the same size):
        # 1 for same value repeating and a big jump in indices when the next tile starts
        # If the repeats / tiles are of different lengths, then this is not a regular dimension.
        # What does a Unit Values vector even mean in this case? Just raise an error for now
        if np.where(np.unique(step_sizes) - 1)[0].size > 1:
            raise ValueError('Non constant step sizes')

        # Finding Start of a new tile
        tile_starts = np.where(step_sizes > 1)[0]

        # converting these indices to correct indices that can be mapped straight to
        if len(tile_starts) < 1:
            # Dimension(s) with no tiling at all
            # Make it look as though the next tile starts at the end of the whole indices vector
            tile_starts = np.array([0, len(inds_for_dim)])
        else:
            # Dimension with some form of repetition
            tile_starts = np.hstack(([0], starts[tile_starts]))

            # Verify that each tile is identical here
            # Last tile will not be checked unless we add the length of the indices vector as the start of next tile
            tile_starts = np.hstack((tile_starts, [len(inds_for_dim)]))
            subsections = [
                inds_for_dim[tile_starts[ind]:tile_starts[ind + 1]]
                for ind in range(len(tile_starts) - 1)
            ]
            if np.max(np.diff(subsections, axis=0)) != 0:
                # Should get unit values for ALL dimensions regardless of expectations to catch such scenarios.
                raise ValueError(
                    'Values in each tile of dimension: {} are different'.
                    format(dim_name))

        # Now looking within the first tile:
        subsection = inds_for_dim[tile_starts[0]:tile_starts[1]]
        # remove all repetitions. ie - take indices only where jump == 1
        step_inds = np.hstack(
            ([0], np.where(np.hstack(([0], np.diff(subsection))))[0]))
        # Finally, use these indices to get the values
        if dim_name in dim_names:
            # Only add this dimension to the dictionary if requested.
            unit_values[dim_name] = vals_mat[desired_row_ind, step_inds]

    return unit_values
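
A short usage sketch for get_unit_values, assuming 'beps.h5' holds standard USID ancillary datasets under '/Raw_Measurement' and that 'Bias' is one of the spectroscopic dimensions (file name and paths are illustrative):

import h5py

with h5py.File('beps.h5', mode='r') as h5_f:
    h5_spec_inds = h5_f['/Raw_Measurement/Spectroscopic_Indices']
    h5_spec_vals = h5_f['/Raw_Measurement/Spectroscopic_Values']
    # Returns {dimension name: 1D array of unit values} for the requested dimensions
    unit_vals = get_unit_values(h5_spec_inds, h5_spec_vals, dim_names=['Bias'])
    print(unit_vals['Bias'])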
Example #18
def validate_aux_dset_pair(test_class,
                           h5_group,
                           h5_inds,
                           h5_vals,
                           dim_names,
                           dim_units,
                           inds_matrix,
                           vals_matrix=None,
                           base_name=None,
                           h5_main=None,
                           is_spectral=True,
                           slow_to_fast=False,
                           check_reg_refs=False):
    if vals_matrix is None:
        vals_matrix = inds_matrix
    if base_name is None:
        if is_spectral:
            base_name = 'Spectroscopic'
        else:
            base_name = 'Position'
    else:
        test_class.assertIsInstance(base_name, (str, unicode))

    if not slow_to_fast:
        # Data was sent in fast-to-slow order, but what comes out is slow-to-fast
        func = np.flipud if is_spectral else np.fliplr

        print(inds_matrix)

        vals_matrix = func(vals_matrix)
        inds_matrix = func(inds_matrix)

        dim_names = dim_names[::-1]
        dim_units = dim_units[::-1]

    for h5_dset, exp_dtype, exp_name, ref_data in zip(
        [h5_inds, h5_vals], [INDICES_DTYPE, VALUES_DTYPE],
        [base_name + '_Indices', base_name + '_Values'],
        [inds_matrix, vals_matrix]):
        if isinstance(h5_main, h5py.Dataset):
            test_class.assertEqual(h5_main.file[h5_main.attrs[exp_name]],
                                   h5_dset)
        test_class.assertIsInstance(h5_dset, h5py.Dataset)
        test_class.assertEqual(h5_dset.parent, h5_group)
        test_class.assertEqual(h5_dset.name.split('/')[-1], exp_name)
        test_class.assertTrue(np.allclose(ref_data, h5_dset[()]))
        test_class.assertEqual(h5_dset.dtype, exp_dtype)
        test_class.assertTrue(
            np.all([_ in h5_dset.attrs.keys() for _ in ['labels', 'units']]))
        test_class.assertTrue(
            np.all([
                x == y for x, y in zip(dim_names, get_attr(h5_dset, 'labels'))
            ]))
        test_class.assertTrue(
            np.all([
                x == y for x, y in zip(dim_units, get_attr(h5_dset, 'units'))
            ]))

        # assert region references even though these are not used anywhere:
        if check_reg_refs:
            for dim_ind, curr_name in enumerate(dim_names):
                if is_spectral:
                    expected = np.squeeze(ref_data[dim_ind])
                else:
                    expected = np.squeeze(ref_data[:, dim_ind])
                actual = np.squeeze(h5_dset[h5_dset.attrs[curr_name]])
                try:
                    match = np.allclose(expected, actual)
                except ValueError:
                    match = False
                if match:
                    test_class.assertTrue(match)
                else:
                    warn('Test for region reference: ' + curr_name + ' failed')
Example #19
def write_reduced_anc_dsets(h5_parent_group, h5_inds, h5_vals, dim_name, basename=None, is_spec=None,
                            verbose=False):
    """
    Creates new Ancillary Indices and Values datasets from the input datasets by dropping the specified dimensions

    Parameters
    ----------
    h5_parent_group : :class:`h5py.Group` or h5py.File
        Group under which the indices and values datasets will be created
    h5_inds : HDF5 Dataset
            Spectroscopic or Positions indices dataset
    h5_vals : HDF5 Dataset
            Spectroscopic or Positions values dataset
    dim_name : str or unicode or list of strings
            Names of the dimension(s) to remove
    basename : str or unicode, Optional
            String to which '_Indices' and '_Values' will be appended to get the names of the new datasets.
            Default = 'Position' or 'Spectroscopic'
    is_spec : bool, optional
            Whether or not the provided ancillary datasets are position or spectroscopic
            The user is recommended to supply this parameter whenever it is known or possible.
            By default, this function will attempt to recognize the answer based on the shape of the datasets.
    verbose : bool, optional. Default = False
            Whether or not to print debugging print statements

    Returns
    -------
    h5_inds_new : h5py.Dataset
            Reduced indices dataset
    h5_vals_new : h5py.Dataset
            Reduced values dataset

    """
    if not isinstance(h5_parent_group, (h5py.Group, h5py.File)):
        raise TypeError('h5_parent_group should either be a h5py.Group or h5py.File object')

    for param, param_name in zip([h5_inds, h5_vals], ['h5_inds', 'h5_vals']):
        if not isinstance(param, h5py.Dataset):
            raise TypeError(param_name + ' should be a h5py.Dataset object')
    if dim_name is not None:
        dim_name = validate_list_of_strings(dim_name, 'dim_name')

    all_dim_names = list(get_attr(h5_inds, 'labels'))
    for item in dim_name:
        if item not in all_dim_names:
            raise KeyError('Requested dimension: {} not in the list of labels: {}'.format(item, all_dim_names))

    ind_mat = h5_inds[()]
    val_mat = h5_vals[()]

    if is_spec is None:
        # Attempt to recognize the type automatically
        is_spec = False
        if ind_mat.shape[0] == ind_mat.shape[1]:
            raise ValueError('Unable to automatically guess whether the provided datasets are position or '
                             'spectroscopic. Please explicitly specify via the "is_spec" boolean kwarg')
        if ind_mat.shape[0] < ind_mat.shape[1]:
            is_spec = True
    else:
        if not isinstance(is_spec, bool):
            raise TypeError('is_spec should be a boolean. Provided object is of type: {}'.format(type(is_spec)))

    if basename is not None:
        basename = validate_single_string_arg(basename, 'basename')
        if basename.endswith('_'):
            basename = basename[:-1]
    else:
        if is_spec:
            basename = 'Spectroscopic'
        else:
            basename = 'Position'

    for sub_name in ['_Indices', '_Values']:
        if basename + sub_name in h5_parent_group.keys():
            raise KeyError('Dataset: {} already exists in provided group: {}'.format(basename + sub_name,
                                                                                     h5_parent_group.name))

    if set(dim_name) != set(all_dim_names):
        # At least one dimension will remain

        if verbose:
            print('All Dimensions: {}. Dimensions to be removed: {}'.format(all_dim_names, dim_name))

        if not is_spec:
            # Convert to spectral shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        # For all dimensions, find where the index = 0
        # basically, we are indexing all dimensions to 0
        first_indices = []
        keep_dim = np.ones(len(all_dim_names), dtype=bool)
        for cur_dim in dim_name:
            dim_ind = all_dim_names.index(cur_dim)
            keep_dim[dim_ind] = False
            # check equality against the minimum value instead of 0 to account for cases when a dimension does not start
            # from 0 (already been sliced) - think of multi-dimensional slicing!
            first_indices.append(ind_mat[dim_ind] == np.min(ind_mat[dim_ind]))
        first_indices = np.vstack(first_indices)

        if verbose:
            print('Raw first_indices:')
            print(first_indices)
            print('Dimensions to keep: {}'.format(keep_dim))

        step_starts = np.all(first_indices, axis=0)

        if verbose:
            print('Columns in dataset to keep:')
            print(step_starts)

        '''
        Extract all rows that we want to keep from input indices and values
        '''
        # TODO: handle TypeError: Indexing elements must be in increasing order
        ind_mat = ind_mat[keep_dim, :][:, step_starts]
        val_mat = val_mat[keep_dim, :][:, step_starts]

        if not is_spec:
            # Convert back to position shape
            ind_mat = np.transpose(ind_mat)
            val_mat = np.transpose(val_mat)

        '''
        Create new Datasets to hold the data
        Name them based on basename
        '''
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=ind_mat, dtype=h5_inds.dtype)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=val_mat, dtype=h5_vals.dtype)
        # Extracting the labels from the original spectroscopic data sets
        labels = h5_inds.attrs['labels'][keep_dim]
        # Creating the dimension slices for the new spectroscopic data sets

        # Adding the labels and units to the new spectroscopic data sets
        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': labels, 'units': h5_inds.attrs['units'][keep_dim]})

    else:
        # Remove all dimensions:
        h5_inds_new = h5_parent_group.create_dataset(basename + '_Indices', data=np.array([[0]]), dtype=INDICES_DTYPE)
        h5_vals_new = h5_parent_group.create_dataset(basename + '_Values', data=np.array([[0]]), dtype=VALUES_DTYPE)

        for dset in [h5_inds_new, h5_vals_new]:
            write_simple_attrs(dset, {'labels': ['Single_Step'], 'units': ['a. u.']})

    return h5_inds_new, h5_vals_new
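
A hedged sketch of dropping one dimension from a pair of spectroscopic ancillary datasets, assuming a writable USID file whose spectroscopic datasets contain a 'Bias' dimension (file name and group names are illustrative):

import h5py

with h5py.File('beps.h5', mode='r+') as h5_f:
    h5_grp = h5_f['/Raw_Measurement']
    h5_spec_inds = h5_grp['Spectroscopic_Indices']
    h5_spec_vals = h5_grp['Spectroscopic_Values']
    results_grp = h5_grp.create_group('Reduced_Ancillary')
    # Drop the 'Bias' dimension; the returned pair retains only the remaining dimensions
    h5_inds_new, h5_vals_new = write_reduced_anc_dsets(
        results_grp, h5_spec_inds, h5_spec_vals, 'Bias', is_spec=True)
    print(get_attr(h5_inds_new, 'labels'))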
Example #20
def check_if_main(h5_main, verbose=False):
    """
    Checks the input dataset to see if it has all the necessary
    features to be considered a Main dataset.  This means it is
    2D and has the following attributes:

    * Position_Indices
    * Position_Values
    * Spectroscopic_Indices
    * Spectroscopic_Values
    * quantity
    * units

    In addition, the shapes of the ancillary matrices should match with that of
    h5_main

    Parameters
    ----------
    h5_main : HDF5 Dataset
        Dataset of interest
    verbose : Boolean (Optional. Default = False)
        Whether or not to print statements

    Returns
    -------
    success : Boolean
        True if all tests pass

    """
    try:
        validate_main_dset(h5_main, True)
    except Exception as exep:
        if verbose:
            print(exep)
        return False

    h5_name = h5_main.name.split('/')[-1]

    success = True

    # Check for Datasets
    dset_names = ['Position_Indices', 'Position_Values',
                  'Spectroscopic_Indices', 'Spectroscopic_Values']
    for name in dset_names:
        try:
            h5_anc_dset = h5_main.file[h5_main.attrs[name]]
            success = np.all([success, isinstance(h5_anc_dset, h5py.Dataset)])
        except:
            if verbose:
                print('{} not found as an attribute of {}.'.format(name, h5_name))
            return False

    attr_success = np.all([att in h5_main.attrs for att in ['quantity', 'units']])
    if not attr_success:
        if verbose:
            print('{} does not have the mandatory "quantity" and "units" attributes'.format(h5_main.name))
        return False

    for attr_name in ['quantity', 'units']:
        val = get_attr(h5_main, attr_name)
        if not isinstance(val, (str, unicode)):
            if verbose:
                print('Attribute {} of {} found to be {}. Expected a string'.format(attr_name, h5_main.name, val))
            return False

    # Blindly linking four datasets is still not sufficient. The sizes need to match:
    anc_shape_match = list()
    h5_pos_inds = h5_main.file[h5_main.attrs['Position_Indices']]
    h5_pos_vals = h5_main.file[h5_main.attrs['Position_Values']]
    anc_shape_match.append(np.all(h5_pos_vals.shape == h5_pos_inds.shape))
    for anc_dset in [h5_pos_vals, h5_pos_inds]:
        anc_shape_match.append(np.all(h5_main.shape[0] == anc_dset.shape[0]))
    if not np.all(anc_shape_match):
        if verbose:
            print('The shapes of the Position indices:{}, values:{} datasets did not match with that of the main '
                  'dataset: {}'.format(h5_pos_inds.shape, h5_pos_vals.shape, h5_main.shape))
        return False

    anc_shape_match = list()
    h5_spec_inds = h5_main.file[h5_main.attrs['Spectroscopic_Indices']]
    h5_spec_vals = h5_main.file[h5_main.attrs['Spectroscopic_Values']]
    anc_shape_match.append(np.all(h5_spec_inds.shape == h5_spec_vals.shape))
    for anc_dset in [h5_spec_inds, h5_spec_vals]:
        anc_shape_match.append(np.all(h5_main.shape[1] == anc_dset.shape[1]))
    if not np.all(anc_shape_match):
        if verbose:
            print('The shapes of the Spectroscopic indices:{}, values:{} datasets did not match with that of the main '
                  'dataset: {}'.format(h5_spec_inds.shape, h5_spec_vals.shape, h5_main.shape))
        return False

    try:
        validate_anc_dset_attrs(h5_pos_inds, h5_pos_vals, is_spec=False)
    except ValueError:
        if verbose:
            print('Attributes of Position datasets did not match')
        return False
    try:
        validate_anc_dset_attrs(h5_spec_inds, h5_spec_vals, is_spec=True)
    except ValueError:
        if verbose:
            print('Attributes of Spectroscopic datasets did not match')
        return False

    return success
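
A minimal usage sketch for the function above, assuming check_if_main is already in scope and that a USID-formatted HDF5 file exists at 'beps.h5' with a candidate main dataset at '/Raw_Measurement/source_main' (both the file path and the dataset path are hypothetical):

import h5py

with h5py.File('beps.h5', mode='r') as h5_file:
    h5_dset = h5_file['/Raw_Measurement/source_main']
    # verbose=True prints the first failed test, if any
    if check_if_main(h5_dset, verbose=True):
        print('{} qualifies as a USID Main dataset'.format(h5_dset.name))
    else:
        print('{} is missing ancillary links or mandatory attributes'.format(h5_dset.name))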
Exemple #21
0
    def translate(self, h5_path, force_patch=False, **kwargs):
        """
        Add the needed references and attributes to the h5 file that are not created by the
        LabView data acquisition program.

        Parameters
        ----------
        h5_path : str
            path to the h5 file
        force_patch : bool, optional
            Should the check to see if the file has already been patched be ignored.
            Default False.

        Returns
        -------
    h5_path : str
        path to the patched file

        """
        # TODO: Need a way to choose which channels to apply the patcher to.
        # Fails for multi-channel files where not all channels can be main datasets.
        # Open the file and check if a patch is needed
        h5_file = h5py.File(os.path.abspath(h5_path), 'r+')
        if h5_file.attrs.get('translator') is not None and not force_patch:
            print('File is already Pycroscopy ready.')
            h5_file.close()
            return h5_path
        '''
        Get the list of all Raw_Data Datasets
        Loop over the list and update the needed attributes
        '''
        raw_list = find_dataset(h5_file, 'Raw_Data')
        for h5_raw in raw_list:
            if 'quantity' not in h5_raw.attrs:
                h5_raw.attrs['quantity'] = 'quantity'
            if 'units' not in h5_raw.attrs:
                h5_raw.attrs['units'] = 'a.u.'

            # Grab the channel and measurement group of the data to check some needed attributes
            h5_chan = h5_raw.parent
            try:
                c_type = get_attr(h5_chan, 'channel_type')

            except KeyError:
                warn_str = "'channel_type' was not found as an attribute of {}.\n".format(
                    h5_chan.name)
                warn_str += "If this is BEPS or BELine data from the LabView aquisition software, " + \
                            "please run the following piece of code.  Afterwards, run this function again.\n" + \
                            "CODE: " \
                            "hdf.file['{}'].attrs['channel_type'] = 'BE'".format(h5_chan.name)
                warn(warn_str)
                h5_file.close()
                return h5_path

            except:
                raise

            if c_type != 'BE':
                continue

            h5_meas = h5_chan.parent
            h5_meas.attrs['num_UDVS_steps'] = h5_meas.attrs['num_steps']

            # Get the object handles for the Indices and Values datasets
            h5_pos_inds = h5_chan['Position_Indices']
            h5_pos_vals = h5_chan['Position_Values']
            h5_spec_inds = h5_chan['Spectroscopic_Indices']
            h5_spec_vals = h5_chan['Spectroscopic_Values']

            # Make sure we have correct spectroscopic indices for the given values
            ds_spec_inds = create_spec_inds_from_vals(h5_spec_vals[()])
            if not np.allclose(ds_spec_inds, h5_spec_inds[()]):
                h5_spec_inds[:, :] = ds_spec_inds[:, :]
                h5_file.flush()

            # Get the labels and units for the Spectroscopic datasets
            h5_spec_labels = h5_spec_inds.attrs['labels']
            inds_and_vals = [
                h5_pos_inds, h5_pos_vals, h5_spec_inds, h5_spec_vals
            ]
            for dset in inds_and_vals:
                spec_labels = dset.attrs['labels']
                try:
                    spec_units = dset.attrs['units']

                    if len(spec_units) != len(spec_labels):
                        raise KeyError

                except KeyError:
                    dset.attrs['units'] = ['' for _ in spec_labels]
                except:
                    raise
            """"
            In early versions, too many spectroscopic dimension labels and 
            units were listed compared to the number of rows. Remove here:
            """
            remove_non_exist_spec_dim_labs(h5_spec_inds,
                                           h5_spec_vals,
                                           h5_meas,
                                           verbose=False)
            """
            Add back some standard metadata to be consistent with older
            BE data
            """
            missing_metadata = dict()
            if 'File_file_name' not in h5_meas.attrs.keys():
                missing_metadata['File_file_name'] = os.path.split(
                    h5_raw.file.filename)[-1].replace('.h5', '')
            if 'File_date_and_time' not in h5_meas.attrs.keys():
                try:
                    date_str = get_attr(h5_raw.file, 'date_string')
                    time_str = get_attr(h5_raw.file, 'time_string')
                    full_str = date_str.strip() + ' ' + time_str.strip()
                    """
                    convert:
                        date_string : 2018-12-05
                        time_string : 3:41:45 PM
                    to: 
                        File_date_and_time: 19-Jun-2009 18:44:56
                    """
                    try:
                        dt_obj = datetime.datetime.strptime(
                            full_str, "%Y-%m-%d %I:%M:%S %p")
                        missing_metadata[
                            'File_date_and_time'] = dt_obj.strftime(
                                '%d-%b-%Y %H:%M:%S')
                    except ValueError:
                        pass
                except KeyError:
                    pass
            # Now write to measurement group:
            if len(missing_metadata) > 0:
                write_simple_attrs(h5_meas, missing_metadata)

            # Link the references to the Indices and Values datasets to the Raw_Data
            print(h5_raw.shape, h5_pos_vals.shape, h5_spec_vals.shape)
            print(h5_spec_inds.shape, h5_pos_inds.shape)

            link_as_main(h5_raw, h5_pos_inds, h5_pos_vals, h5_spec_inds,
                         h5_spec_vals)

            # Also link the Bin_Frequencies and Bin_Wfm_Type datasets
            h5_freqs = h5_chan['Bin_Frequencies']
            aux_dset_names = ['Bin_Frequencies']
            aux_dset_refs = [h5_freqs.ref]
            check_and_link_ancillary(h5_raw,
                                     aux_dset_names,
                                     anc_refs=aux_dset_refs)
            '''
            Get all SHO_Fit groups for the Raw_Data and loop over them
            Get the Guess and Spectroscopic Datasets for each SHO_Fit group
            '''
            sho_list = find_results_groups(h5_raw, 'SHO_Fit')
            for h5_sho in sho_list:
                h5_sho_guess = h5_sho['Guess']
                h5_sho_spec_inds = h5_sho['Spectroscopic_Indices']
                h5_sho_spec_vals = h5_sho['Spectroscopic_Values']

                # Make sure we have correct spectroscopic indices for the given values
                ds_sho_spec_inds = create_spec_inds_from_vals(
                    h5_sho_spec_inds[()])
                if not np.allclose(ds_sho_spec_inds, h5_sho_spec_inds[()]):
                    h5_sho_spec_inds[:, :] = ds_sho_spec_inds[:, :]

                # Get the labels and units for the Spectroscopic datasets
                h5_sho_spec_labels = get_attr(h5_sho_spec_inds, 'labels')
                h5_sho_spec_units = get_attr(h5_sho_spec_vals, 'units')
                if h5_sho_spec_inds.shape[-1] != h5_sho_guess.shape[-1]:
                    print(
                        'Warning! Found incorrect spectral dimension for dataset {}. Attempting a fix.'
                        .format(h5_sho_guess))
                    try:
                        h5_sho_spec_inds = h5_sho_guess.parent.create_dataset(
                            "h5_sho_spec_inds_fixed",
                            shape=(1, 1),
                            dtype='uint32')
                        h5_sho_spec_inds.attrs['labels'] = 'labels'
                        h5_sho_spec_inds.attrs['units'] = 'units'
                    except RuntimeError:
                        print(
                            "It seems that the file has already been patched."
                            " Will use previously computed ancilliary datasets"
                        )
                        h5_sho_spec_inds = h5_sho_guess.parent[
                            'h5_sho_spec_inds_fixed']
                    try:
                        h5_sho_spec_vals = h5_sho_guess.parent.create_dataset(
                            "h5_sho_spec_vals_fixed",
                            shape=(1, 1),
                            dtype='uint32')
                        h5_sho_spec_vals[:] = 0
                        h5_sho_spec_vals.attrs['labels'] = 'labels'
                        h5_sho_spec_vals.attrs['units'] = 'units'
                    except RuntimeError:
                        print(
                            "It seems that the file has already been patched."
                            " Will use previously computed ancilliary datasets"
                        )

                        h5_sho_spec_vals = h5_sho_guess.parent[
                            'h5_sho_spec_vals_fixed']

                link_as_main(h5_sho_guess, h5_pos_inds, h5_pos_vals,
                             h5_sho_spec_inds, h5_sho_spec_vals)
                sho_inds_and_vals = [h5_sho_spec_inds, h5_sho_spec_vals]

                for dset in sho_inds_and_vals:
                    spec_labels = get_attr(dset, 'labels')
                    try:
                        spec_units = get_attr(dset, 'units')

                        if len(spec_units) != len(spec_labels):
                            raise KeyError

                    except KeyError:
                        spec_units = [''.encode('utf-8') for _ in spec_labels]
                        dset.attrs['units'] = spec_units

                    except:
                        raise

            h5_file.flush()

        h5_file.attrs['translator'] = 'V3patcher'.encode('utf-8')

        h5_file.close()

        return h5_path
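
A brief usage sketch for the method above. Only the translate() method is shown here, so the enclosing translator class is unknown; the instance name 'patcher' and the file path below are assumptions:

# 'patcher' is assumed to be an instance of the (unshown) translator class
# that defines the translate() method above; 'old_be_file.h5' is a hypothetical path
patched_path = patcher.translate('old_be_file.h5', force_patch=False)
print('Patched file available at: {}'.format(patched_path))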
Exemple #22
0
def check_for_matching_attrs(h5_obj, new_parms=None, verbose=False):
    """
    Compares attributes in the given H5 object against those in the provided dictionary and returns True if
    the parameters match, and False otherwise

    Parameters
    ----------
    h5_obj : h5py object (Dataset or :class:`h5py.Group`)
        Object whose attributes will be compared against new_parms
    new_parms : dict, optional. default = empty dictionary
        Parameters to compare against the attributes present in h5_obj
    verbose : bool, optional, default = False
       Whether or not to print debugging statements

    Returns
    -------
    tests : bool
        Whether or not all parameters in new_parms match those in h5_obj's attributes

    """
    if not isinstance(h5_obj, (h5py.Dataset, h5py.Group, h5py.File)):
        raise TypeError('h5_obj should be a h5py.Dataset, h5py.Group, or h5py.File object')
    if new_parms is None:
        new_parms = dict()
    else:
        if not isinstance(new_parms, dict):
            raise TypeError('new_parms should be a dictionary')

    tests = []
    for key in new_parms.keys():

        if verbose:
            print('Looking for new attribute named: {}'.format(key))

        # HDF5 cannot store None as an attribute anyway. ignore
        if new_parms[key] is None:
            continue

        try:
            old_value = get_attr(h5_obj, key)
        except KeyError:
            # if parameter was not found assume that something has changed
            if verbose:
                print('New parm: {} \t- new parm not in group *****'.format(key))
            tests.append(False)
            break

        if isinstance(old_value, np.ndarray):
            if not isinstance(new_parms[key], collections.abc.Iterable):
                if verbose:
                    print('New parm: {} \t- new parm not iterable unlike old parm *****'.format(key))
                tests.append(False)
                break
            new_array = np.array(new_parms[key])
            if old_value.size != new_array.size:
                if verbose:
                    print('New parm: {} \t- are of different sizes ****'.format(key))
                tests.append(False)
            else:
                try:
                    answer = np.allclose(old_value, new_array)
                except TypeError:
                    # comes here when comparing string arrays
                    # Not sure of a better way
                    answer = []
                    for old_val, new_val in zip(old_value, new_array):
                        answer.append(old_val == new_val)
                    answer = np.all(answer)
                if verbose:
                    print('New parm: {} \t- match: {}'.format(key, answer))
                tests.append(answer)
        else:
            """if isinstance(new_parms[key], collections.Iterable):
                if verbose:
                    print('New parm: {} \t- new parm is iterable unlike old parm *****'.format(key))
                tests.append(False)
                break"""
            answer = np.all(new_parms[key] == old_value)
            if verbose:
                print('New parm: {} \t- match: {}'.format(key, answer))
            tests.append(answer)
    if verbose:
        print('')

    return all(tests)
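
A short usage sketch, assuming check_for_matching_attrs is in scope; the file path, group path, and parameter values below are hypothetical:

import h5py
import numpy as np

# parameters for the computation we are about to run
new_parms = {'num_clusters': 4, 'bias_vector': np.linspace(-5, 5, 16)}

with h5py.File('results.h5', mode='r') as h5_file:
    h5_grp = h5_file['/Measurement_000/Raw_Data-Cluster_000']
    if check_for_matching_attrs(h5_grp, new_parms=new_parms, verbose=True):
        print('An identical computation already exists; its results can be reused')
    else:
        print('Parameters differ; a fresh computation is needed')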
Exemple #23
0
    def _write_results_chunk(self):
        """
        Writes the labels and mean response to the h5 file

        Returns
        ---------
        h5_group : HDF5 Group reference
            Reference to the group that contains the clustering results
        """
        print('Writing clustering results to file.')
        num_clusters = self.__mean_resp.shape[0]

        self.h5_results_grp = create_results_group(
            self.h5_main,
            self.process_name,
            h5_parent_group=self._h5_target_group)
        self._write_source_dset_provenance()

        write_simple_attrs(self.h5_results_grp, self.parms_dict)

        h5_labels = write_main_dataset(self.h5_results_grp,
                                       np.uint32(self.__labels.reshape([-1,
                                                                        1])),
                                       'Labels',
                                       'Cluster ID',
                                       'a. u.',
                                       None,
                                       Dimension('Cluster', 'ID', 1),
                                       h5_pos_inds=self.h5_main.h5_pos_inds,
                                       h5_pos_vals=self.h5_main.h5_pos_vals,
                                       aux_spec_prefix='Cluster_',
                                       dtype=np.uint32)

        if self.num_comps != self.h5_main.shape[1]:
            '''
            Setup the Spectroscopic Indices and Values for the Mean Response if we didn't use all components
            Note that a sliced spectroscopic matrix may not be contiguous. Let's just lose the spectroscopic data
            for now until a better method is figured out
            '''
            """
            if isinstance(self.data_slice[1], np.ndarray):
                centroid_vals_mat = h5_centroids.h5_spec_vals[self.data_slice[1].tolist()]

            else:
                centroid_vals_mat = h5_centroids.h5_spec_vals[self.data_slice[1]]

            ds_centroid_values.data[0, :] = centroid_vals_mat
            """
            if isinstance(self.data_slice[1], np.ndarray):
                vals_slice = self.data_slice[1].tolist()
            else:
                vals_slice = self.data_slice[1]
            vals = self.h5_main.h5_spec_vals[:, vals_slice].squeeze()
            new_spec = Dimension('Original_Spectral_Index', 'a.u.', vals)
            h5_inds, h5_vals = write_ind_val_dsets(self.h5_results_grp,
                                                   new_spec,
                                                   is_spectral=True)

        else:
            h5_inds = self.h5_main.h5_spec_inds
            h5_vals = self.h5_main.h5_spec_vals

        # For now, link centroids with default spectroscopic indices and values.
        h5_centroids = write_main_dataset(self.h5_results_grp,
                                          self.__mean_resp,
                                          'Mean_Response',
                                          get_attr(self.h5_main,
                                                   'quantity')[0],
                                          get_attr(self.h5_main, 'units')[0],
                                          Dimension('Cluster', 'a. u.',
                                                    np.arange(num_clusters)),
                                          None,
                                          h5_spec_inds=h5_inds,
                                          aux_pos_prefix='Mean_Resp_Pos_',
                                          h5_spec_vals=h5_vals)

        # Marking completion:
        self._status_dset_name = 'completed_positions'
        self._h5_status_dset = self.h5_results_grp.create_dataset(
            self._status_dset_name,
            data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
        # keeping legacy option:
        self.h5_results_grp.attrs['last_pixel'] = self.h5_main.shape[0]

        return self.h5_results_grp
Exemple #24
0
def reshape_from_lines_to_pixels(h5_main, pts_per_cycle, scan_step_x_m=None):
    """
    Breaks up the provided raw G-mode dataset into lines and pixels (from just lines)

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the main dataset that contains the raw data that is only broken up by lines
    pts_per_cycle : unsigned int
        Number of points in a single pixel
    scan_step_x_m : float, optional
        Size of each pixel step along X in meters. Default = 1

    Returns
    -------
    h5_resh : h5py.Dataset object
        Reference to the main dataset that contains the reshaped data
    """
    if not check_if_main(h5_main):
        raise TypeError('h5_main is not a Main dataset')
    h5_main = USIDataset(h5_main)
    if pts_per_cycle % 1 != 0 or pts_per_cycle < 1:
        raise TypeError('pts_per_cycle should be a positive integer')
    if scan_step_x_m is not None:
        if not isinstance(scan_step_x_m, Number):
            raise TypeError('scan_step_x_m should be a real number')
    else:
        scan_step_x_m = 1

    if h5_main.shape[1] % pts_per_cycle != 0:
        warn(
            'Error in reshaping the provided dataset to pixels. Check points per pixel'
        )
        raise ValueError('h5_main has {} spectroscopic steps, which is not divisible by '
                         'pts_per_cycle = {}'.format(h5_main.shape[1], pts_per_cycle))

    num_cols = int(h5_main.shape[1] / pts_per_cycle)

    # TODO: DO NOT assume simple 1 spectral dimension!
    single_ao = np.squeeze(h5_main.h5_spec_vals[:, :pts_per_cycle])

    spec_dims = Dimension(
        get_attr(h5_main.h5_spec_vals, 'labels')[0],
        get_attr(h5_main.h5_spec_vals, 'units')[0], single_ao)

    # TODO: DO NOT assume simple 1D in positions!
    pos_dims = [
        Dimension('X', 'm', np.linspace(0, scan_step_x_m, num_cols)),
        Dimension('Y', 'm',
                  np.linspace(0, h5_main.h5_pos_vals[1, 0], h5_main.shape[0]))
    ]

    h5_group = create_results_group(h5_main, 'Reshape')
    # TODO: Create empty datasets and then write for very large datasets
    h5_resh = write_main_dataset(h5_group,
                                 (num_cols * h5_main.shape[0], pts_per_cycle),
                                 'Reshaped_Data',
                                 get_attr(h5_main, 'quantity')[0],
                                 get_attr(h5_main, 'units')[0],
                                 pos_dims,
                                 spec_dims,
                                 chunks=(10, pts_per_cycle),
                                 dtype=h5_main.dtype,
                                 compression=h5_main.compression)

    # TODO: DON'T write in one shot assuming small datasets fit in memory!
    print('Starting to reshape G-mode line data. Please be patient')
    h5_resh[()] = np.reshape(h5_main[()], (-1, pts_per_cycle))

    print('Finished reshaping G-mode line data to rows and columns')

    return USIDataset(h5_resh)
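
A minimal usage sketch, assuming reshape_from_lines_to_pixels is in scope; the file path, dataset path, and the value of 500 points per pixel are hypothetical:

import h5py

with h5py.File('gmode_lines.h5', mode='r+') as h5_file:
    h5_main = h5_file['/Measurement_000/Channel_000/Raw_Data']
    # split every line of the raw data into pixels of 500 points, 1 um pixel step
    h5_resh = reshape_from_lines_to_pixels(h5_main, 500, scan_step_x_m=1e-6)
    print('Reshaped dataset written to: {}'.format(h5_resh.name))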
Exemple #25
0
def plot_cluster_h5_group(h5_group, labels_kwargs=None, centroids_kwargs=None):
    """
    Plots the cluster labels and mean response for each cluster

    Parameters
    ----------
    h5_group : h5py.Group object
        H5 group containing the labels and mean response
    labels_kwargs : dict, optional
        keyword arguments for the labels plot. NOT enabled yet.
    centroids_kwargs : dict, optional
        keyword arguments for the centroids plot. NOT enabled yet.

    Returns
    -------
    fig_labels : figure handle
        Figure containing the labels
    fig_centroids : figure handle
        Figure containing the centroids
    """
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group')
    h5_labels = USIDataset(h5_group['Labels'])
    h5_centroids = USIDataset(h5_group['Mean_Response'])

    labels_mat = np.squeeze(h5_labels.get_n_dim_form())
    if labels_mat.ndim > 3:
        print('Unable to visualize 4 or more dimensional labels!')
    if labels_mat.ndim == 1:
        fig_labs, axis_labs = plt.subplots(figsize=(5.5, 5))
        axis_labs.plot(h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
                       labels_mat)
        axis_labs.set_xlabel(h5_labels.pos_dim_descriptors[0])
        axis_labs.set_ylabel('Cluster index')
        axis_labs.set_title(
            get_attr(h5_group, 'cluster_algorithm') + ' Labels')
    elif labels_mat.ndim == 2:
        fig_labs, axis_labs = plot_cluster_labels(
            labels_mat,
            num_clusters=h5_centroids.shape[0],
            x_label=h5_labels.pos_dim_descriptors[0],
            y_label=h5_labels.pos_dim_descriptors[1],
            x_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
            y_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[1]),
            title=get_attr(h5_group, 'cluster_algorithm') + ' Labels')

    # TODO: probably not a great idea to load the entire dataset to memory
    centroids_mat = h5_centroids.get_n_dim_form()
    if len(h5_centroids.spec_dim_labels) == 1:
        legend_mode = 2
        if h5_centroids.shape[0] < 6:
            legend_mode = 1
        fig_cent, axis_cent = plot_cluster_centroids(
            centroids_mat,
            h5_centroids.get_spec_values(h5_centroids.spec_dim_labels[0]),
            legend_mode=legend_mode,
            x_label=h5_centroids.spec_dim_descriptors[0],
            y_label=h5_centroids.data_descriptor,
            overlayed=h5_centroids.shape[0] < 6,
            title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
            amp_units=get_attr(h5_centroids, 'units'))
    elif len(h5_centroids.spec_dim_labels) == 2:
        # stack of spectrograms
        if h5_centroids.dtype in [np.complex64, np.complex128]:
            fig_cent, axis_cent = plot_complex_spectra(
                centroids_mat,
                subtitle_prefix='Cluster',
                title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                x_label=h5_centroids.spec_dim_descriptors[0],
                y_label=h5_centroids.spec_dim_descriptors[1],
                amp_units=get_attr(h5_centroids, 'units'))
        else:
            fig_cent, axis_cent = plot_map_stack(
                centroids_mat,
                color_bar_mode='each',
                evenly_spaced=True,
                title='Cluster',
                heading=get_attr(h5_group, 'cluster_algorithm') + ' Centroid')
    return fig_labs, fig_cent
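
A usage sketch for the plotting helper above, assuming plot_cluster_h5_group is in scope; the file and group paths are hypothetical:

import h5py
import matplotlib.pyplot as plt

with h5py.File('clustering_results.h5', mode='r') as h5_file:
    # group written by a prior clustering step, containing 'Labels' and 'Mean_Response'
    h5_grp = h5_file['/Measurement_000/Raw_Data-Cluster_000']
    fig_labels, fig_centroids = plot_cluster_h5_group(h5_grp)
    fig_labels.savefig('cluster_labels.png')
    fig_centroids.savefig('cluster_centroids.png')
plt.show()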
Exemple #26
0
def check_if_main(h5_main, verbose=False):
    """
    Checks the input dataset to see if it has all the necessary
    features to be considered a Main dataset.  This means the dataset
    has dimension scales of the correct size and the following attributes:

    * quantity
    * units
    * main_data_name
    * data_type
    * modality
    * source

    In addition, the shapes of the attached dimension scales should match
    that of h5_main

    Parameters
    ----------
    h5_main : HDF5 Dataset
        Dataset of interest
    verbose : Boolean (Optional. Default = False)
        Whether or not to print statements

    Returns
    -------
    success : Boolean
        True if all tests pass
    """
    if not isinstance(h5_main, h5py.Dataset):
        if verbose:
            print('{} is not an HDF5 Dataset object.'.format(h5_main))
        return False

    number_of_dims = 0
    for dim in h5_main.dims:
        if np.array(dim.values()).size > 0:
            number_of_dims += 1

    if len(h5_main.shape) != number_of_dims:
        if verbose:
            print('Main data does not have a full set of dimension scales. '
                  'Provided object has shape: {} but only {} non-empty dimension '
                  'scales'.format(h5_main.shape, number_of_dims))
        return False

    # h5_name = h5_main.name.split('/')[-1]
    h5_group = h5_main.parent

    # success = True

    # Check for Datasets

    attrs_names = ['dimension_type', 'name', 'quantity', 'units']

    # Check for all required attributes in dataset
    main_attrs_names = [
        'quantity', 'units', 'main_data_name', 'pyNSID_version', 'data_type',
        'modality', 'source'
    ]
    main_attr_success = np.all(
        [att in h5_main.attrs for att in main_attrs_names])
    if verbose:
        print('All Attributes in dataset: ', main_attr_success)
    if not main_attr_success:
        if verbose:
            print('{} does not have the mandatory attributes'.format(
                h5_main.name))
        return False

    for attr_name in main_attrs_names:
        val = get_attr(h5_main, attr_name)
        if not isinstance(val, (str, unicode)):
            if verbose:
                print('Attribute {} of {} found to be {}. Expected a string'.
                      format(attr_name, h5_main.name, val))
            return False

    length_success = []
    dset_success = []
    attr_success = []
    # Check for Validity of Dimensional Scales
    for i, dimension in enumerate(h5_main.dims):
        # check for all required attributes
        h5_dim_dset = h5_group[dimension.label]

        attr_success.append(
            np.all([att in h5_dim_dset.attrs for att in attrs_names]))
        dset_success.append(
            np.all([attr_success[-1],
                    isinstance(h5_dim_dset, h5py.Dataset)]))
        # dimensional scale has to be 1D
        if len(h5_dim_dset.shape) == 1:
            # and of the same length as the shape of the dataset
            length_success.append(h5_main.shape[i] == h5_dim_dset.shape[0])
        else:
            length_success.append(False)
    # Now that the per-dimension checks are complete, report which dimension scale (if any) failed.
    if np.all(
        [np.all(attr_success),
         np.all(length_success),
         np.all(dset_success)]):
        if verbose:
            print('Dimensions: All Attributes: ', np.all(attr_success))
            print('Dimensions: All Correct Length: ', np.all(length_success))
            print('Dimensions: All h5 Datasets: ', np.all(dset_success))
    else:
        if False in length_success:
            print('length of dimension scale {} is wrong'.format(
                length_success.index(False)))
        if False in attr_success:
            print('attributes in dimension scale {} are wrong'.format(
                attr_success.index(False)))
        if False in dset_success:
            print('dimension scale {} is not a dataset'.format(
                dset_success.index(False)))
        return False

    return main_attr_success
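
A usage sketch for this dimension-scale (NSID-style) variant, assuming check_if_main is in scope; the file and dataset paths are hypothetical:

import h5py

with h5py.File('nsid_file.h5', mode='r') as h5_file:
    h5_dset = h5_file['/Measurement_000/Channel_000/nDim_Data']
    if check_if_main(h5_dset, verbose=True):
        print('Dataset carries all mandatory attributes and valid dimension scales')
    else:
        print('Dataset does not qualify as a Main dataset')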