Example #1
    def test(self, override=False):
        """
        Decomposes the hdf5 dataset to calculate the components and projection. This function does NOT write results
        to the hdf5 file. Call :meth:`~pycroscopy.processing.Decomposition.compute()` to write to the file. Handles
        complex and compound datasets such that the components are of the same data-type as the input matrix.

        Parameters
        ----------
        override : bool, optional. default = False
            Set to True to recompute results even if prior results are available. Otherwise, existing results are returned

        Returns
        -------
        components : :class:`numpy.ndarray`
            Components
        projections : :class:`numpy.ndarray`
            Projections
        """
        if not override:
            if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0:
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return USIDataset(self.h5_results_grp['Components']).get_n_dim_form(), \
                       USIDataset(self.h5_results_grp['Projection']).get_n_dim_form()

        self.h5_results_grp = None

        print('Performing Decomposition on {}.'.format(self.h5_main.name))

        t0 = time.time()
        self._fit()
        self._transform()
        print('Took {} to compute {}'.format(format_time(time.time() - t0), self.method_name))

        self.__components = stack_real_to_target_dtype(self.estimator.components_, self.h5_main.dtype)
        projection_mat, success = reshape_to_n_dims(self.__projection, h5_pos=self.h5_main.h5_pos_inds,
                                                    h5_spec=np.expand_dims(np.arange(self.__projection.shape[1]),
                                                                           axis=0))
        if not success:
            raise ValueError('Could not reshape projections to an N-dimensional dataset!')

        components_mat, success = reshape_to_n_dims(self.__components, h5_spec=self.h5_main.h5_spec_inds,
                                                    h5_pos=np.expand_dims(np.arange(self.__components.shape[0]),
                                                                          axis=1))

        if not success:
            raise ValueError('Could not reshape components to an N-dimensional dataset!')

        return components_mat, projection_mat
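A minimal usage sketch for the test/compute workflow above. The file path and the internal dataset path are hypothetical, and PCA is just one of the allowed sklearn.decomposition estimators:

# Hypothetical file and dataset paths; any estimator from the allowed list would work here
import h5py
from sklearn.decomposition import PCA
from pycroscopy.processing import Decomposition

with h5py.File('my_data.h5', mode='r+') as h5_file:
    h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']
    decomposer = Decomposition(h5_main, PCA(n_components=5))
    components, projections = decomposer.test()   # in-memory preview; nothing is written yet
    h5_results_grp = decomposer.compute()         # writes Components and Projection to the file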
Example #2
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to the HDF5 file.
        
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
            
        :param h5_main: USID Main HDF5 dataset that will be decomposed
        :type h5_main: :class:`pyUSID.USIDataset` object
        
        :param num_components: Number of components to decompose h5_main into.  Default None.
        :type num_components: int, optional
        
        :param kwargs: Arguments to be sent to Process
        :type kwargs: dict, optional
            
        """
        super(SVD, self).__init__(h5_main, 'SVD', **kwargs)
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
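To make the clamping of num_components above concrete, a tiny worked illustration with made-up sizes:

# Purely illustrative numbers for a hypothetical 1024 x 500 (positions x spectral points) dataset
n_samples, n_features = 1024, 500
print(min(n_samples, n_features))         # 500 -> default when num_components is None
print(min(n_samples, n_features, 64))     # 64  -> a request below the limit is honored
print(min(n_samples, n_features, 2000))   # 500 -> an oversized request is silently clamped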
Example #3
    def __init__(self, h5_main, num_components=None):

        super(SVD, self).__init__(h5_main)
        self.process_name = 'SVD'
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)
        self.num_components = num_components
        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
Example #4
    def test_check_for_old_guess_incomplete(self):
        self.fitter._fitter_name = 'Fitter'
        # Set last_pixel to less than number of positions
        write_simple_attrs(self.h5_guess, {'last_pixel': np.random.randint(self.h5_guess.shape[0]-1)})

        partial, completed = self.fitter._check_for_old_guess()

        self.assertEqual(USIDataset(partial[0]), self.h5_guess)
        self.assertEqual(completed, [])
Example #5
    def test_check_for_old_guess_complete(self):
        self.fitter._fitter_name = 'Fitter'
        # Set last_pixel to number of positions
        write_simple_attrs(self.h5_guess, {'last_pixel': self.h5_guess.shape[0]})

        partial, completed = self.fitter._check_for_old_guess()

        self.assertEqual(partial, [])
        self.assertEqual(USIDataset(completed[0]), self.h5_guess)
Example #6
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to the HDF5 file.

        Parameters
        ----------
        h5_main : USIDataset
            Dataset to be decomposed.
        num_components : int, optional
            Number of components to decompose h5_main into.  Default None.
        kwargs
            Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, **kwargs)
        self.process_name = 'SVD'
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
Example #7
    def __init__(self, h5_main, estimator, **kwargs):
        """
        Constructs the Decomposition object. Call the :meth:`~pycroscopy.processing.Decomposition.test()` and
        :meth:`~pycroscopy.processing.Decomposition.compute()` methods to run the decomposition
        
        Parameters
        ------------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset with embedded ancillary spectroscopic, position indices and values datasets
        estimator : :mod:`sklearn.decomposition` object
            configured decomposition object to apply to the data
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
        """
        
        allowed_methods = [dec.factor_analysis.FactorAnalysis,
                           dec.fastica_.FastICA,
                           dec.incremental_pca.IncrementalPCA,
                           dec.sparse_pca.MiniBatchSparsePCA,
                           dec.nmf.NMF,
                           dec.pca.PCA,
                           dec.sparse_pca.SparsePCA,
                           dec.truncated_svd.TruncatedSVD]
        
        # Store the decomposition object
        self.estimator = estimator
        
        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise NotImplementedError('Cannot work with {} yet'.format(self.method_name))
            
        # Done with decomposition-related checks, now call super init
        super(Decomposition, self).__init__(h5_main, 'Decomposition', **kwargs)
        
        # set up parameters
        self.parms_dict = {'decomposition_algorithm':self.method_name}
        self.parms_dict.update(self.estimator.get_params())
        
        # check for existing datagroups with same results
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.h5_partial_groups = self._check_for_duplicates()

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)
        
        self.__components = None
        self.__projection = None
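A quick sketch of the estimator validation performed above; h5_main is assumed to be an open USID Main dataset (see the sketch after Example #1), and KernelPCA is simply an example of an estimator outside the allowed list:

# h5_main: assumed to be an open USID Main dataset
from sklearn.decomposition import PCA, KernelPCA
from pycroscopy.processing import Decomposition

Decomposition(h5_main, PCA(n_components=4))          # accepted: PCA is in allowed_methods
try:
    Decomposition(h5_main, KernelPCA(n_components=4))
except NotImplementedError as err:
    print(err)                                       # "Cannot work with KernelPCA yet"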
Example #8
    def __init__(self,
                 h5_main,
                 variables=['Frequency'],
                 parallel=True,
                 verbose=False):
        """
        For now, we assume that the guess dataset has not been generated for this dataset but we will relax this
        requirement after testing the basic components.

        Parameters
        ----------
        h5_main : h5py.Dataset instance
            The dataset over which the analysis will be performed. This dataset should be linked to the spectroscopic
            indices and values, and position indices and values datasets.
        variables : list(string), Default ['Frequency']
            List of attributes that h5_main should possess so that it may be analyzed by Model.
        parallel : bool, optional
            Should the parallel implementation of the fitting be used.  Default True
        verbose : bool, optional. default = False
            Whether or not to print statements that aid in debugging

        """

        if not isinstance(h5_main, USIDataset):
            h5_main = USIDataset(h5_main)

        # Checking if dataset has the proper dimensions for the model to run.
        if self._is_legal(h5_main, variables):
            self.h5_main = h5_main

        else:
            raise ValueError(
                'Provided dataset is not a "Main" dataset with necessary ancillary datasets'
            )

        # Checking if parallel processing will be used
        self._parallel = parallel
        self._verbose = verbose

        # Determining the max size of the data that can be put into memory
        self._set_memory_and_cores()

        self._start_pos = 0
        self._end_pos = self.h5_main.shape[0]
        self.h5_guess = None
        self.h5_fit = None
        self.h5_results_grp = None

        # TODO: do NOT expose a lot of innards. Turn it into private with _var_name
        self.data = None
        self.guess = None
        self.fit = None

        self._fitter_name = None  # Reset this in the extended classes
        self._parms_dict = dict()
Example #9
    def __init__(self, h5_main, estimator):
        """
        Uses the provided (preconfigured) sklearn decomposition estimator to
        decompose the provided dataset
        
        Parameters
        ------------
        h5_main : HDF5 dataset object
            Main dataset with ancillary spectroscopic, position indices and values datasets
        estimator : sklearn.decomposition estimator object
            configured decomposition object to apply to the data
        """
        
        allowed_methods = [dec.factor_analysis.FactorAnalysis,
                           dec.fastica_.FastICA,
                           dec.incremental_pca.IncrementalPCA,
                           dec.sparse_pca.MiniBatchSparsePCA,
                           dec.nmf.NMF,
                           dec.pca.PCA,
                           dec.sparse_pca.SparsePCA,
                           dec.truncated_svd.TruncatedSVD]
        
        # Store the decomposition object
        self.estimator = estimator
        
        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise NotImplementedError('Cannot work with {} yet'.format(self.method_name))
            
        # Done with decomposition-related checks, now call super init
        super(Decomposition, self).__init__(h5_main)
        
        # set up parameters
        self.parms_dict = {'decomposition_algorithm':self.method_name}
        self.parms_dict.update(self.estimator.get_params())
        
        # check for existing datagroups with same results 
        self.process_name = 'Decomposition'
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.h5_partial_groups = self._check_for_duplicates()

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)
        
        self.__components = None
        self.__projection = None
Example #10
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to the HDF5 file.

        Parameters
        ----------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset that will be decomposed
        num_components : int, optional
            Number of components to decompose h5_main into.  Default None.
        kwargs
            Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, **kwargs)
        self.process_name = 'SVD'

        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
Example #11
    def _create_fit_datasets(self):
        """
        Creates the HDF5 fit dataset. pycroscopy requires that the h5 group, guess dataset,
        corresponding spectroscopic and position datasets be created and populated at this point.
        This function will create the HDF5 dataset for the fit and link it to the same ancillary datasets as the guess.
        The fit dataset will NOT be populated here but will instead be populated using the __setData function
        """

        if self.h5_guess is None:
            warn('Need to guess before fitting!')
            return

        if self.step_start_inds is None:
            h5_spec_inds = self.h5_main.h5_spec_inds
            self.step_start_inds = np.where(h5_spec_inds[0] == 0)[0]

        if self.num_udvs_steps is None:
            self.num_udvs_steps = len(self.step_start_inds)

        if self.freq_vec is None:
            self._get_frequency_vector()

        h5_sho_grp = self.h5_guess.parent
        write_simple_attrs(h5_sho_grp, {'SHO_fit_method': "pycroscopy BESHO"})

        # Create the fit dataset as an empty dataset of the same size and dtype as the guess.
        # Also automatically links in the ancillary datasets.
        self.h5_fit = USIDataset(
            create_empty_dataset(self.h5_guess, dtype=sho32, dset_name='Fit'))

        # This is necessary for comparing against new runs to avoid re-computation and to resume partial computations
        write_simple_attrs(self.h5_fit, self._parms_dict)
        write_simple_attrs(self.h5_fit, {
            'SHO_fit_method': "pycroscopy BESHO",
            'last_pixel': 0
        })

        self.h5_fit.file.flush()
Example #12
    def __init__(self,
                 h5_main,
                 ex_freq,
                 gain,
                 num_x_steps=250,
                 r_extra=110,
                 **kwargs):
        """
        Applies Bayesian Inference to General Mode IV (G-IV) data to extract the true current

        Parameters
        ----------
        h5_main : h5py.Dataset object
            Dataset to process
        ex_freq : float
            Frequency of the excitation waveform
        gain : uint
            Gain setting on current amplifier (typically 7-9)
        num_x_steps : uint (Optional, default = 250)
            Number of steps for the inferred results. Note: this may end up being slightly different from the specified value.
        r_extra : float (Optional, default = 110 [Ohms])
            Extra resistance in the RC circuit that will provide correct current and resistance values
        kwargs : dict
            Other parameters specific to the Process class and nuanced bayesian_inference parameters
        """
        super(GIVBayesian, self).__init__(h5_main, **kwargs)
        self.gain = gain
        self.ex_freq = ex_freq
        self.r_extra = r_extra
        self.num_x_steps = int(num_x_steps)
        if self.num_x_steps % 4 == 0:
            self.num_x_steps = ((self.num_x_steps // 2) + 1) * 2
        if self.verbose and self.mpi_rank == 0:
            print('Ensuring that the number of half steps is odd; num_x_steps '
                  'is now', self.num_x_steps)

        self.h5_main = USIDataset(self.h5_main)

        # take these from kwargs
        bayesian_parms = {
            'gam': 0.03,
            'e': 10.0,
            'sigma': 10.0,
            'sigmaC': 1.0,
            'num_samples': 2E3
        }

        self.parms_dict = {
            'freq': self.ex_freq,
            'num_x_steps': self.num_x_steps,
            'r_extra': self.r_extra
        }
        self.parms_dict.update(bayesian_parms)

        self.process_name = 'Bayesian_Inference'
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )

        # Should not be extracting excitation this way!
        h5_spec_vals = self.h5_main.h5_spec_vals[0]
        self.single_ao = np.squeeze(h5_spec_vals[()])

        roll_cyc_fract = -0.25
        self.roll_pts = int(self.single_ao.size * roll_cyc_fract)
        self.rolled_bias = np.roll(self.single_ao, self.roll_pts)

        dt = 1 / (ex_freq * self.single_ao.size)
        self.dvdt = np.diff(self.single_ao) / dt
        self.dvdt = np.append(self.dvdt, self.dvdt[-1])

        self.reverse_results = None
        self.forward_results = None
        self._bayes_parms = None

        self.__first_batch = True
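A hedged usage sketch for the class above; the file path, dataset path, excitation frequency, gain, and import path are all assumptions and may differ between pycroscopy versions:

# Hypothetical usage: paths, excitation frequency and amplifier gain are placeholders
import h5py
from pycroscopy.processing import GIVBayesian  # import path is an assumption

with h5py.File('giv_measurement.h5', mode='r+') as h5_file:
    h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']
    bayes = GIVBayesian(h5_main, ex_freq=200, gain=8, num_x_steps=250, r_extra=110)
    h5_bayes_grp = bayes.compute()   # Process subclasses run and write results via compute()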
Example #13
def plot_cluster_h5_group(h5_group, labels_kwargs=None, centroids_kwargs=None):
    """
    Plots the cluster labels and mean response for each cluster

    Parameters
    ----------
    h5_group : h5py.Group object
        H5 group containing the labels and mean response
    labels_kwargs : dict, optional
        keyword arguments for the labels plot. NOT enabled yet.
    centroids_kwargs : dict, optional
        keyword arguments for the centroids plot. NOT enabled yet.

    Returns
    -------
    fig_labels : figure handle
        Figure containing the labels
    fig_centroids : figure handle
        Figure containing the centroids
    """
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group')
    h5_labels = USIDataset(h5_group['Labels'])
    h5_centroids = USIDataset(h5_group['Mean_Response'])

    labels_mat = np.squeeze(h5_labels.get_n_dim_form())
    if labels_mat.ndim > 3:
        print('Unable to visualize 4 or more dimensional labels!')
    if labels_mat.ndim == 1:
        fig_labs, axis_labs = plt.subplots(figsize=(5.5, 5))
        axis_labs.plot(h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
                       labels_mat)
        axis_labs.set_xlabel(h5_labels.pos_dim_descriptors[0])
        axis_labs.set_ylabel('Cluster index')
        axis_labs.set_title(
            get_attr(h5_group, 'cluster_algorithm') + ' Labels')
    elif labels_mat.ndim == 2:
        fig_labs, axis_labs = plot_cluster_labels(
            labels_mat,
            num_clusters=h5_centroids.shape[0],
            x_label=h5_labels.pos_dim_descriptors[0],
            y_label=h5_labels.pos_dim_descriptors[1],
            x_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
            y_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[1]),
            title=get_attr(h5_group, 'cluster_algorithm') + ' Labels')

    # TODO: probably not a great idea to load the entire dataset to memory
    centroids_mat = h5_centroids.get_n_dim_form()
    if len(h5_centroids.spec_dim_labels) == 1:
        legend_mode = 2
        if h5_centroids.shape[0] < 6:
            legend_mode = 1
        fig_cent, axis_cent = plot_cluster_centroids(
            centroids_mat,
            h5_centroids.get_spec_values(h5_centroids.spec_dim_labels[0]),
            legend_mode=legend_mode,
            x_label=h5_centroids.spec_dim_descriptors[0],
            y_label=h5_centroids.data_descriptor,
            overlayed=h5_centroids.shape[0] < 6,
            title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
            amp_units=get_attr(h5_centroids, 'units'))
    elif len(h5_centroids.spec_dim_labels) == 2:
        # stack of spectrograms
        if h5_centroids.dtype in [np.complex64, np.complex128, complex]:
            fig_cent, axis_cent = plot_complex_spectra(
                centroids_mat,
                subtitle_prefix='Cluster',
                title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                x_label=h5_centroids.spec_dim_descriptors[0],
                y_label=h5_centroids.spec_dim_descriptors[1],
                amp_units=get_attr(h5_centroids, 'units'))
        else:
            fig_cent, axis_cent = plot_map_stack(
                centroids_mat,
                color_bar_mode='each',
                evenly_spaced=True,
                title='Cluster',
                heading=get_attr(h5_group, 'cluster_algorithm') + ' Centroid')
    return fig_labs, fig_cent
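A short usage sketch with hypothetical file and group paths, assuming plot_cluster_h5_group above is in scope and the results group holds the 'Labels' and 'Mean_Response' datasets it expects:

# Hypothetical paths to a cluster results group written by pycroscopy
import h5py

with h5py.File('clustered_data.h5', mode='r') as h5_file:
    h5_clust_grp = h5_file['Measurement_000/Channel_000/Raw_Data-Cluster_000']
    fig_labels, fig_centroids = plot_cluster_h5_group(h5_clust_grp)
    fig_labels.savefig('cluster_labels.png')
    fig_centroids.savefig('cluster_centroids.png')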
Example #14
    def __init__(self, h5_main, estimator, num_comps=None, **kwargs):
        """
        Constructs the Cluster object. Call the :meth:`~pycroscopy.processing.Cluster.test()` and
        :meth:`~pycroscopy.processing.Cluster.compute()` methods to run the clustering

        Parameters
        ----------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset
        estimator : :class:`sklearn.cluster` estimator
            configured clustering algorithm to be applied to the data
        num_comps : int (unsigned), optional. Default = None / all
            Number of features / spectroscopic indices to be used to cluster the data
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
        """

        allowed_methods = [
            cls.AgglomerativeClustering, cls.Birch, cls.KMeans,
            cls.MiniBatchKMeans, cls.SpectralClustering
        ]

        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise TypeError('Cannot work with {} just yet'.format(
                self.method_name))

        # Done with decomposition-related checks, now call super init
        super(Cluster, self).__init__(h5_main, 'Cluster', **kwargs)

        # Store the decomposition object
        self.estimator = estimator

        if num_comps is None:
            comp_attr = 'all'
        else:
            # initialize so the array / subset branch below can safely overwrite it
            comp_attr = None

        comp_slice, num_comps = get_component_slice(
            num_comps, total_components=self.h5_main.shape[1])

        self.num_comps = num_comps
        self.data_slice = (slice(None), comp_slice)

        if isinstance(comp_slice, slice):
            # cannot store slice as an attribute in hdf5
            # convert to list of integers!
            inds = comp_slice.indices(self.h5_main.shape[1])
            # much like range, inds are arranged as (start, stop, step)
            if inds[0] == 0 and inds[2] == 1:
                # starting from 0 with step of 1 = upto N components
                if inds[1] >= self.h5_main.shape[1] - 1:
                    comp_attr = 'all'
                else:
                    comp_attr = inds[1]
            else:
                comp_attr = range(*inds)
        elif comp_attr == 'all':
            pass
        else:
            # subset of spectral components specified as an array
            comp_attr = comp_slice

        # set up parameters
        self.parms_dict = {
            'cluster_algorithm': self.method_name,
            'spectral_components': comp_attr
        }
        self.parms_dict.update(self.estimator.get_params())

        # update n_jobs according to the cores argument
        # print('cores reset to', self._cores)
        # different number of cores should not* be a reason for different results
        # so we update this flag only after checking for duplicates
        estimator.n_jobs = self._cores
        self.parms_dict.update({'n_jobs': self._cores})

        # check for existing datagroups with same results
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__labels = None
        self.__mean_resp = None
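A minimal sketch of constructing and running the Cluster process above; h5_main is assumed to be an open USID Main dataset, the cluster count is illustrative, and the test()/compute() pattern follows the Decomposition and SVD examples in this section:

# Illustrative usage; h5_main is assumed to be a USID Main dataset opened elsewhere
from sklearn.cluster import KMeans
from pycroscopy.processing import Cluster

clusterer = Cluster(h5_main, KMeans(n_clusters=4))
labels, centroids = clusterer.test()    # in-memory preview; nothing is written yet
h5_cluster_grp = clusterer.compute()    # writes the Labels and Mean_Response datasets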
Example #15
def reshape_from_lines_to_pixels(h5_main, pts_per_cycle, scan_step_x_m=None):
    """
    Breaks up the provided raw G-mode dataset into lines and pixels (from just lines)

    Parameters
    ----------
    h5_main : h5py.Dataset object
        Reference to the main dataset that contains the raw data that is only broken up by lines
    pts_per_cycle : unsigned int
        Number of points in a single pixel
    scan_step_x_m : float
        Step in meters for pixels

    Returns
    -------
    h5_resh : h5py.Dataset object
        Reference to the main dataset that contains the reshaped data
    """
    if not check_if_main(h5_main):
        raise TypeError('h5_main is not a Main dataset')
    h5_main = USIDataset(h5_main)
    if pts_per_cycle % 1 != 0 or pts_per_cycle < 1:
        raise TypeError('pts_per_cycle should be a positive integer')
    if scan_step_x_m is not None:
        if not isinstance(scan_step_x_m, Number):
            raise TypeError('scan_step_x_m should be a real number')
    else:
        scan_step_x_m = 1

    if h5_main.shape[1] % pts_per_cycle != 0:
        raise ValueError('Error in reshaping the provided dataset to pixels. '
                         'Check points per pixel')

    num_cols = int(h5_main.shape[1] / pts_per_cycle)

    # TODO: DO NOT assume simple 1 spectral dimension!
    single_ao = np.squeeze(h5_main.h5_spec_vals[:, :pts_per_cycle])

    spec_dims = Dimension(
        get_attr(h5_main.h5_spec_vals, 'labels')[0],
        get_attr(h5_main.h5_spec_vals, 'units')[0], single_ao)

    # TODO: DO NOT assume simple 1D in positions!
    pos_dims = [
        Dimension('X', 'm', np.linspace(0, scan_step_x_m, num_cols)),
        Dimension('Y', 'm',
                  np.linspace(0, h5_main.h5_pos_vals[1, 0], h5_main.shape[0]))
    ]

    h5_group = create_results_group(h5_main, 'Reshape')
    # TODO: Create empty datasets and then write for very large datasets
    h5_resh = write_main_dataset(h5_group,
                                 (num_cols * h5_main.shape[0], pts_per_cycle),
                                 'Reshaped_Data',
                                 get_attr(h5_main, 'quantity')[0],
                                 get_attr(h5_main, 'units')[0],
                                 pos_dims,
                                 spec_dims,
                                 chunks=(10, pts_per_cycle),
                                 dtype=h5_main.dtype,
                                 compression=h5_main.compression)

    # TODO: DON'T write in one shot assuming small datasets fit in memory!
    print('Starting to reshape G-mode line data. Please be patient')
    h5_resh[()] = np.reshape(h5_main[()], (-1, pts_per_cycle))

    print('Finished reshaping G-mode line data to rows and columns')

    return USIDataset(h5_resh)
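A brief usage sketch with illustrative numbers; h5_main is assumed to be a USID Main dataset holding raw G-mode line data and reshape_from_lines_to_pixels above is assumed to be in scope:

# Illustrative: 2048 points per pixel and a 1 micron step along X
h5_resh = reshape_from_lines_to_pixels(h5_main, pts_per_cycle=2048, scan_step_x_m=1e-6)
print(h5_resh.shape)   # (num_lines * num_cols, 2048) after reshaping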
Example #16
def plot_svd(h5_main, savefig=False, num_plots=16, **kwargs):
    '''
    Replots the SVD results showing the scree plot, abundance maps, and eigenvectors.
    If h5_main is a Dataset, it will default to the most recent SVD group from that
    Dataset.
    If h5_main is the results group, then it will plot the values for that group.

    :param h5_main: Dataset that was decomposed via SVD, or the SVD results group itself
    :type h5_main: USIDataset or h5py Dataset or h5py Group

    :param savefig: Saves the figures to disk with some default names
    :type savefig: bool, optional

    :param num_plots: Default number of eigenvectors and abundance plots to show
    :type num_plots: int

    :param kwargs: keyword arguments for svd filtering
    :type kwargs: dict, optional

    '''

    if isinstance(h5_main, h5py.Group):

        _U = find_dataset(h5_main, 'U')[-1]
        _V = find_dataset(h5_main, 'V')[-1]
        units = 'arbitrary (a.u.)'
        h5_spec_vals = np.arange(_V.shape[1])
        h5_svd_group = _U.parent

    else:

        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]
        units = h5_main.attrs['quantity']
        h5_spec_vals = h5_main.get_spec_values('Time')

    h5_U = h5_svd_group['U']
    h5_V = h5_svd_group['V']
    h5_S = h5_svd_group['S']

    _U = USIDataset(h5_U)
    [num_rows, num_cols] = _U.pos_dim_sizes

    # Slice by num_plots (rather than a hard-coded 16) so the requested number of components is shown
    abun_maps = np.reshape(h5_U[:, :num_plots], (num_rows, num_cols, -1))
    eigen_vecs = h5_V[:num_plots, :]

    skree_sum = np.zeros(h5_S.shape)
    for i in range(h5_S.shape[0]):
        skree_sum[i] = np.sum(h5_S[:i + 1]) / np.sum(h5_S)

    plt.figure()
    plt.plot(skree_sum, 'bo')
    plt.title('Cumulative Variance')
    plt.xlabel('Total Components')
    plt.ylabel('Total variance ratio (a.u.)')

    if savefig:
        plt.savefig('Cumulative_variance_plot.png')

    fig_skree, axes = plot_utils.plot_scree(h5_S, title='Scree plot')
    fig_skree.tight_layout()

    if savefig:
        plt.savefig('Scree_plot.png')

    fig_abun, axes = plot_utils.plot_map_stack(abun_maps,
                                               num_comps=num_plots,
                                               title='SVD Abundance Maps',
                                               color_bar_mode='single',
                                               cmap='inferno',
                                               reverse_dims=True,
                                               fig_mult=(3.5, 3.5),
                                               facecolor='white',
                                               **kwargs)
    fig_abun.tight_layout()
    if savefig:
        plt.savefig('Abundance_maps.png')

    fig_eigvec, axes = plot_utils.plot_curves(h5_spec_vals * 1e3,
                                              eigen_vecs,
                                              use_rainbow_plots=False,
                                              x_label='Time (ms)',
                                              y_label=units,
                                              num_plots=num_plots,
                                              subtitle_prefix='Component',
                                              title='SVD Eigenvectors',
                                              evenly_spaced=False,
                                              **kwargs)
    fig_eigvec.tight_layout()
    if savefig:
        plt.savefig('Eigenvectors.png')

    return
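A usage sketch for plot_svd with a hypothetical file path, assuming the function above is in scope; either the decomposed Main dataset or an existing SVD results group may be passed, as documented above:

# Hypothetical path; h5_main may also be the 'SVD' results group itself
import h5py

with h5py.File('svd_results.h5', mode='r') as h5_file:
    h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']
    plot_svd(h5_main, savefig=True, num_plots=9)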
Example #17
    def _setup_h5(self, data_gen_parms):
        """
        Sets up the hdf5 file structure before doing the actual generation

        Parameters
        ----------
        data_gen_parms : dict
            Dictionary containing the parameters to write to the Measurement Group as attributes

        Returns
        -------

        """
        '''
        Build the group structure down to the channel group
        '''
        # Set up the basic group structure
        root_grp = VirtualGroup('')
        root_parms = generate_dummy_main_parms()
        root_parms['translator'] = 'FAKEBEPS'
        root_parms['data_type'] = data_gen_parms['data_type']
        root_grp.attrs = root_parms

        meas_grp = VirtualGroup('Measurement_')
        chan_grp = VirtualGroup('Channel_')

        meas_grp.attrs.update(data_gen_parms)

        # Create the Position and Spectroscopic datasets for the Raw Data
        ds_pos_inds, ds_pos_vals, ds_spec_inds, ds_spec_vals = self._build_ancillary_datasets(
        )

        raw_chunking = calc_chunks([self.n_pixels, self.n_spec_bins],
                                   np.complex64(0).itemsize,
                                   unit_chunks=[1, self.n_bins])

        ds_raw_data = VirtualDataset(
            'Raw_Data',
            data=None,
            maxshape=[self.n_pixels, self.n_spec_bins],
            dtype=np.complex64,
            compression='gzip',
            chunking=raw_chunking,
            parent=meas_grp)

        chan_grp.add_children([
            ds_pos_inds, ds_pos_vals, ds_spec_inds, ds_spec_vals, ds_raw_data
        ])
        meas_grp.add_children([chan_grp])
        root_grp.add_children([meas_grp])

        hdf = HDFwriter(self.h5_path)
        hdf.delete()
        h5_refs = hdf.write(root_grp)

        # Delete the MicroDatasets to save memory
        del ds_raw_data, ds_spec_inds, ds_spec_vals, ds_pos_inds, ds_pos_vals

        # Get the file and Raw_Data objects
        h5_raw = get_h5_obj_refs(['Raw_Data'], h5_refs)[0]
        h5_chan_grp = h5_raw.parent

        # Get the Position and Spectroscopic dataset objects
        h5_pos_inds = get_h5_obj_refs(['Position_Indices'], h5_refs)[0]
        h5_pos_vals = get_h5_obj_refs(['Position_Values'], h5_refs)[0]
        h5_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'], h5_refs)[0]
        h5_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'], h5_refs)[0]

        # Link the Position and Spectroscopic datasets as attributes of Raw_Data
        link_as_main(h5_raw, h5_pos_inds, h5_pos_vals, h5_spec_inds,
                     h5_spec_vals)
        '''
        Build the SHO Group
        '''
        sho_grp = VirtualGroup('Raw_Data-SHO_Fit_', parent=h5_chan_grp.name)

        # Build the Spectroscopic datasets for the SHO Guess and Fit
        sho_spec_starts = np.where(
            h5_spec_inds[h5_spec_inds.attrs['Frequency']].squeeze() == 0)[0]
        sho_spec_labs = get_attr(h5_spec_inds, 'labels')
        ds_sho_spec_inds, ds_sho_spec_vals = build_reduced_spec_dsets(
            h5_spec_inds,
            h5_spec_vals,
            keep_dim=sho_spec_labs != 'Frequency',
            step_starts=sho_spec_starts)

        sho_chunking = calc_chunks([self.n_pixels, self.n_sho_bins],
                                   sho32.itemsize,
                                   unit_chunks=[1, 1])
        ds_sho_fit = VirtualDataset('Fit',
                                    data=None,
                                    maxshape=[self.n_pixels, self.n_sho_bins],
                                    dtype=sho32,
                                    compression='gzip',
                                    chunking=sho_chunking,
                                    parent=sho_grp)
        ds_sho_guess = VirtualDataset(
            'Guess',
            data=None,
            maxshape=[self.n_pixels, self.n_sho_bins],
            dtype=sho32,
            compression='gzip',
            chunking=sho_chunking,
            parent=sho_grp)

        sho_grp.add_children(
            [ds_sho_fit, ds_sho_guess, ds_sho_spec_inds, ds_sho_spec_vals])

        # Write the SHO group and datasets to the file and delete the MicroDataset objects
        h5_sho_refs = hdf.write(sho_grp)
        del ds_sho_fit, ds_sho_guess, ds_sho_spec_inds, ds_sho_spec_vals

        # Get the dataset handles for the fit and guess
        h5_sho_fit = get_h5_obj_refs(['Fit'], h5_sho_refs)[0]
        h5_sho_guess = get_h5_obj_refs(['Guess'], h5_sho_refs)[0]

        # Get the dataset handles for the SHO Spectroscopic datasets
        h5_sho_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'],
                                           h5_sho_refs)[0]
        h5_sho_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'],
                                           h5_sho_refs)[0]

        # Link the Position and Spectroscopic datasets as attributes of the SHO Fit and Guess
        link_as_main(h5_sho_fit, h5_pos_inds, h5_pos_vals, h5_sho_spec_inds,
                     h5_sho_spec_vals)
        link_as_main(h5_sho_guess, h5_pos_inds, h5_pos_vals, h5_sho_spec_inds,
                     h5_sho_spec_vals)
        '''
        Build the loop group
        '''
        loop_grp = VirtualGroup('Fit-Loop_Fit_', parent=h5_sho_fit.parent.name)

        # Build the Spectroscopic datasets for the loops
        loop_spec_starts = np.where(h5_sho_spec_inds[
            h5_sho_spec_inds.attrs['DC_Offset']].squeeze() == 0)[0]
        loop_spec_labs = get_attr(h5_sho_spec_inds, 'labels')
        ds_loop_spec_inds, ds_loop_spec_vals = build_reduced_spec_dsets(
            h5_sho_spec_inds,
            h5_sho_spec_vals,
            keep_dim=loop_spec_labs != 'DC_Offset',
            step_starts=loop_spec_starts)

        # Create the loop fit and guess MicroDatasets
        loop_chunking = calc_chunks([self.n_pixels, self.n_loops],
                                    loop_fit32.itemsize,
                                    unit_chunks=[1, 1])
        ds_loop_fit = VirtualDataset('Fit',
                                     data=None,
                                     maxshape=[self.n_pixels, self.n_loops],
                                     dtype=loop_fit32,
                                     compression='gzip',
                                     chunking=loop_chunking,
                                     parent=loop_grp)

        ds_loop_guess = VirtualDataset('Guess',
                                       data=None,
                                       maxshape=[self.n_pixels, self.n_loops],
                                       dtype=loop_fit32,
                                       compression='gzip',
                                       chunking=loop_chunking,
                                       parent=loop_grp)

        # Add the datasets to the loop group then write it to the file
        loop_grp.add_children(
            [ds_loop_fit, ds_loop_guess, ds_loop_spec_inds, ds_loop_spec_vals])
        h5_loop_refs = hdf.write(loop_grp)

        # Delete the MicroDatasets
        del ds_loop_spec_vals, ds_loop_spec_inds, ds_loop_guess, ds_loop_fit

        # Get the handles to the datasets
        h5_loop_fit = get_h5_obj_refs(['Fit'], h5_loop_refs)[0]
        h5_loop_guess = get_h5_obj_refs(['Guess'], h5_loop_refs)[0]
        h5_loop_spec_inds = get_h5_obj_refs(['Spectroscopic_Indices'],
                                            h5_loop_refs)[0]
        h5_loop_spec_vals = get_h5_obj_refs(['Spectroscopic_Values'],
                                            h5_loop_refs)[0]

        # Link the Position and Spectroscopic datasets to the Loop Guess and Fit
        link_as_main(h5_loop_fit, h5_pos_inds, h5_pos_vals, h5_loop_spec_inds,
                     h5_loop_spec_vals)
        link_as_main(h5_loop_guess, h5_pos_inds, h5_pos_vals,
                     h5_loop_spec_inds, h5_loop_spec_vals)

        self.h5_raw = USIDataset(h5_raw)
        self.h5_sho_guess = USIDataset(h5_sho_guess)
        self.h5_sho_fit = USIDataset(h5_sho_fit)
        self.h5_loop_guess = USIDataset(h5_loop_guess)
        self.h5_loop_fit = USIDataset(h5_loop_fit)
        self.h5_spec_vals = h5_spec_vals
        self.h5_spec_inds = h5_spec_inds
        self.h5_sho_spec_inds = h5_sho_spec_inds
        self.h5_sho_spec_vals = h5_sho_spec_vals
        self.h5_loop_spec_inds = h5_loop_spec_inds
        self.h5_loop_spec_vals = h5_loop_spec_vals
        self.h5_file = h5_raw.file

        return
Example #18
class SVD(Process):
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to the HDF5 file.

        Parameters
        ----------
        h5_main : USIDataset
            Dataset to be decomposed.
        num_components : int, optional
            Number of components to decompose h5_main into.  Default None.
        kwargs
            Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, **kwargs)
        self.process_name = 'SVD'
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None

    def test(self, override=False):
        """
        Applies randomized SVD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to
        write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the
        input matrix.

        Parameters
        ----------
        override : bool, optional. default = False
            Set to True to recompute results even if prior results are available. Otherwise, existing results are returned

        Returns
        -------
        U : numpy.ndarray
            Abundance matrix
        S : numpy.ndarray
            variance vector
        V : numpy.ndarray
            eigenvector matrix
        """
        '''
        Check if a number of components has been set and ensure that the number is less than
        the minimum axis length of the data.  If both conditions are met, use fsvd.  If not
        use the regular svd.

        C.Smith -- We might need to put a lower limit on num_comps in the future.  I don't
                   know enough about svd to be sure.
        '''
        if not override:
            if isinstance(self.duplicate_h5_groups,
                          list) and len(self.duplicate_h5_groups) > 0:
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(
                    self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \
                       reshape_to_n_dims(self.h5_results_grp['V'])[0]

        self.h5_results_grp = None

        t1 = time.time()

        self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(
            self.h5_main),
                                                      self.num_components,
                                                      n_iter=3)
        self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype)

        print('Took {} to compute randomized SVD'.format(
            format_time(time.time() - t1)))

        u_mat, success = reshape_to_n_dims(self.__u,
                                           h5_pos=self.h5_main.h5_pos_inds,
                                           h5_spec=np.expand_dims(np.arange(
                                               self.__u.shape[1]),
                                                                  axis=0))
        if not success:
            raise ValueError(
                'Could not reshape U to N-Dimensional dataset! Error:' +
                success)

        v_mat, success = reshape_to_n_dims(self.__v,
                                           h5_pos=np.expand_dims(np.arange(
                                               self.__u.shape[1]),
                                                                 axis=1),
                                           h5_spec=self.h5_main.h5_spec_inds)
        if not success:
            raise ValueError(
                'Could not reshape V to N-Dimensional dataset! Error:' +
                success)

        return u_mat, self.__s, v_mat

    def compute(self, override=False):
        """
        Computes SVD (by calling test() if it has not already been called) and writes results to file.
        Consider calling test() to check results before writing to file. Results are deleted from memory
        upon writing to the HDF5 file

        Parameters
        ----------
        override : bool, optional. default = False
            Set to True to recompute results even if prior results are available. Otherwise, existing results are returned

        Returns
        -------
        h5_results_grp : h5py.Group object
            Datagroup containing all the results
        """
        if self.__u is None and self.__v is None and self.__s is None:
            self.test(override=override)

        if self.h5_results_grp is None:
            self._write_results_chunk()
            self.delete_results()

        h5_group = self.h5_results_grp

        return h5_group

    def delete_results(self):
        """
        Deletes results from memory.
        """
        del self.__u, self.__s, self.__v
        self.__u = None
        self.__v = None
        self.__s = None

    def _write_results_chunk(self):
        """
        Writes the provided SVD results to file

        Parameters
        ----------
        """
        comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s))

        h5_svd_group = create_results_group(self.h5_main, self.process_name)
        self.h5_results_grp = h5_svd_group

        write_simple_attrs(h5_svd_group, self.parms_dict)
        write_simple_attrs(
            h5_svd_group, {
                'svd_method': 'sklearn-randomized',
                'last_pixel': self.h5_main.shape[0]
            })

        h5_u = write_main_dataset(h5_svd_group,
                                  np.float32(self.__u),
                                  'U',
                                  'Abundance',
                                  'a.u.',
                                  None,
                                  comp_dim,
                                  h5_pos_inds=self.h5_main.h5_pos_inds,
                                  h5_pos_vals=self.h5_main.h5_pos_vals,
                                  dtype=np.float32,
                                  chunks=calc_chunks(self.__u.shape,
                                                     np.float32(0).itemsize))
        # print(get_attr(self.h5_main, 'quantity')[0])
        h5_v = write_main_dataset(h5_svd_group,
                                  self.__v,
                                  'V',
                                  get_attr(self.h5_main, 'quantity')[0],
                                  'a.u.',
                                  comp_dim,
                                  None,
                                  h5_spec_inds=self.h5_main.h5_spec_inds,
                                  h5_spec_vals=self.h5_main.h5_spec_vals,
                                  chunks=calc_chunks(
                                      self.__v.shape,
                                      self.h5_main.dtype.itemsize))

        # No point making this 1D dataset a main dataset
        h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s))
        '''
        Check h5_main for plot group references.
        Copy them into V if they exist
        '''
        for key in self.h5_main.attrs.keys():
            if '_Plot_Group' not in key:
                continue

            ref_inds = get_indices_for_region_ref(self.h5_main,
                                                  self.h5_main.attrs[key],
                                                  return_method='corners')
            ref_inds = ref_inds.reshape([-1, 2, 2])
            ref_inds[:, 1, 0] = h5_v.shape[0] - 1

            svd_ref = create_region_reference(h5_v, ref_inds)

            h5_v.attrs[key] = svd_ref

    def _check_available_mem(self):
        """
        Check that there is enough memory to perform the SVD decomposition.

        Returns
        -------
        sufficient_mem : bool
            True if enough memory is available, False otherwise.

        """
        if self.verbose:
            print('Checking memory availability.')
        n_samples, n_features = self.h5_main.shape
        s_mem_per_comp = np.float32(0).itemsize
        u_mem_per_comp = np.float32(0).itemsize * n_samples
        v_mem_per_comp = self.h5_main.dtype.itemsize * n_features

        mem_per_comp = s_mem_per_comp + u_mem_per_comp + v_mem_per_comp
        avail_mem = 0.75 * self._max_mem_mb * 1024**2
        free_mem = avail_mem - self.h5_main.__sizeof__()

        if free_mem <= 0:
            error_message = 'Cannot load main dataset into memory.\n' + \
                            'Available memory is {}.  Dataset needs {}.'.format(avail_mem,
                                                                                self.h5_main.__sizeof__())
            raise MemoryError(error_message)

        if self.verbose:
            print('Memory available for SVD is {}.'.format(free_mem))
            print('Memory needed per component is {}.'.format(mem_per_comp))

        cant_svd = (free_mem - self.num_components * mem_per_comp) <= 0

        if cant_svd:
            max_comps = int(np.floor(free_mem / mem_per_comp))
            error_message = 'Not enough free memory for performing SVD with requested number of parameters.\n' + \
                            'Maximum possible parameters is {}.'.format(max_comps)
            raise MemoryError(error_message)
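Putting the class above together, a hedged end-to-end sketch with hypothetical file and dataset paths (the import path is an assumption):

# Hypothetical end-to-end usage of the SVD Process defined above
import h5py
from pycroscopy.processing import SVD  # import path is an assumption

with h5py.File('measurement.h5', mode='r+') as h5_file:
    h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']
    svd_proc = SVD(h5_main, num_components=64)
    u_mat, s_vec, v_mat = svd_proc.test()   # in-memory preview; nothing is written yet
    h5_svd_grp = svd_proc.compute()         # writes U, S and V and returns the results group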
Example #19
def plot_cluster_h5_group(h5_group, labels_kwargs=None, centroids_kwargs=None):
    """
    Plots the cluster labels and mean response for each cluster

    Parameters
    ----------
    h5_group : h5py.Group object
        H5 group containing the labels and mean response
    labels_kwargs : dict, optional
        keyword arguments for the labels plot. NOT enabled yet.
    centroids_kwargs : dict, optional
        keyword arguments for the centroids plot. NOT enabled yet.

    Returns
    -------
    fig_labels : figure handle
        Figure containing the labels
    fig_centroids : figure handle
        Figure containing the centroids
    """
    if not isinstance(h5_group, h5py.Group):
        raise TypeError('h5_group should be a h5py.Group')
    h5_labels = USIDataset(h5_group['Labels'])
    h5_centroids = USIDataset(h5_group['Mean_Response'])

    labels_mat = np.squeeze(h5_labels.get_n_dim_form())
    if labels_mat.ndim > 3:
        print('Unable to visualize 4 or more dimensional labels!')
    if labels_mat.ndim == 1:
        fig_labs, axis_labs = plt.subplots(figsize=(5.5, 5))
        axis_labs.plot(h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]), labels_mat)
        axis_labs.set_xlabel(h5_labels.pos_dim_descriptors[0])
        axis_labs.set_ylabel('Cluster index')
        axis_labs.set_title(get_attr(h5_group, 'cluster_algorithm') + ' Labels')
    elif labels_mat.ndim == 2:
        fig_labs, axis_labs = plot_cluster_labels(labels_mat, num_clusters=h5_centroids.shape[0],
                                                  x_label=h5_labels.pos_dim_descriptors[0],
                                                  y_label=h5_labels.pos_dim_descriptors[1],
                                                  x_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[0]),
                                                  y_vec=h5_labels.get_pos_values(h5_labels.pos_dim_labels[1]),
                                                  title=get_attr(h5_group, 'cluster_algorithm') + ' Labels')

    # TODO: probably not a great idea to load the entire dataset to memory
    centroids_mat = h5_centroids.get_n_dim_form()
    if len(h5_centroids.spec_dim_labels) == 1:
        legend_mode = 2
        if h5_centroids.shape[0] < 6:
            legend_mode = 1
        fig_cent, axis_cent = plot_cluster_centroids(centroids_mat,
                                                     h5_centroids.get_spec_values(h5_centroids.spec_dim_labels[0]),
                                                     legend_mode=legend_mode,
                                                     x_label=h5_centroids.spec_dim_descriptors[0],
                                                     y_label=h5_centroids.data_descriptor,
                                                     overlayed=h5_centroids.shape[0] < 6,
                                                     title=get_attr(h5_group,
                                                                    'cluster_algorithm') + ' Centroid',
                                                     amp_units=get_attr(h5_centroids, 'units'))
    elif len(h5_centroids.spec_dim_labels) == 2:
        # stack of spectrograms
        if h5_centroids.dtype in [np.complex64, np.complex128, complex]:
            fig_cent, axis_cent = plot_complex_spectra(centroids_mat, subtitle_prefix='Cluster',
                                                       title=get_attr(h5_group, 'cluster_algorithm') + ' Centroid',
                                                       x_label=h5_centroids.spec_dim_descriptors[0],
                                                       y_label=h5_centroids.spec_dim_descriptors[1],
                                                       amp_units=get_attr(h5_centroids, 'units'))
        else:
            fig_cent, axis_cent = plot_map_stack(centroids_mat, color_bar_mode='each', evenly_spaced=True,
                                                 title='Cluster',
                                                 heading=get_attr(h5_group,
                                                                  'cluster_algorithm') + ' Centroid')
    return fig_labs, fig_cent
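A minimal usage sketch for plot_cluster_h5_group as defined above, assuming a results group written by a prior clustering run; the file name and group path are hypothetical.

import h5py

# Hypothetical file and results group produced by a prior Cluster (e.g. KMeans) computation
with h5py.File('clustered_data.h5', mode='r') as h5_f:
    h5_cluster_grp = h5_f['Measurement_000/Channel_000/Raw_Data-Cluster_000']
    fig_labels, fig_centroids = plot_cluster_h5_group(h5_cluster_grp)
    if fig_labels is not None:
        fig_labels.savefig('cluster_labels.png')
        fig_centroids.savefig('cluster_centroids.png')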
Example #20
0
def create_empty_dataset(source_dset,
                         dtype,
                         dset_name,
                         h5_group=None,
                         new_attrs=None,
                         skip_refs=False):
    """
    Creates an empty dataset in the h5 file based on the provided dataset in the same or specified group

    Parameters
    ----------
    source_dset : h5py.Dataset object
        Source object that provides information on the group and shape of the dataset
    dtype : dtype
        Data type of the fit / guess datasets
    dset_name : String / Unicode
        Name of the dataset
    h5_group : h5py.Group object, optional. Default = None
        Group within which this dataset will be created
    new_attrs : dictionary (Optional)
        Any new attributes that need to be written to the dataset
    skip_refs : boolean, optional
        Should ObjectReferences and RegionReferences be skipped when copying attributes from the
        `source_dset`

    Returns
    -------
    h5_new_dset : h5py.Dataset object
        Newly created dataset
    """
    import h5py
    from warnings import warn
    from pyUSID.io.dtype_utils import validate_dtype
    from pyUSID.io.hdf_utils import copy_attributes, check_if_main, write_book_keeping_attrs
    from pyUSID import USIDataset
    import sys
    if sys.version_info.major == 3:
        unicode = str

    if not isinstance(source_dset, h5py.Dataset):
        raise TypeError('source_dset should be a h5py.Dataset object')
    _ = validate_dtype(dtype)
    if new_attrs is not None:
        if not isinstance(new_attrs, dict):
            raise TypeError('new_attrs should be a dictionary')
    else:
        new_attrs = dict()

    if h5_group is None:
        h5_group = source_dset.parent
    else:
        if not isinstance(h5_group, (h5py.Group, h5py.File)):
            raise TypeError(
                'h5_group should be a h5py.Group or h5py.File object')

    if not isinstance(dset_name, (str, unicode)):
        raise TypeError('dset_name should be a string')
    dset_name = dset_name.strip()
    if len(dset_name) == 0:
        raise ValueError('dset_name cannot be empty!')
    if '-' in dset_name:
        warn(
            'dset_name should not contain the "-" character. Reformatted name from:{} to '
            '{}'.format(dset_name, dset_name.replace('-', '_')))
    dset_name = dset_name.replace('-', '_')

    if dset_name in h5_group.keys():
        if isinstance(h5_group[dset_name], h5py.Dataset):
            warn('A dataset named: {} already exists in group: {}'.format(
                dset_name, h5_group.name))
            h5_new_dset = h5_group[dset_name]
            # Make sure it has the correct shape and dtype
            if any((source_dset.shape != h5_new_dset.shape,
                    dtype != h5_new_dset.dtype)):
                warn(
                    'Either the shape (existing: {} desired: {}) or dtype (existing: {} desired: {}) of the dataset '
                    'did not match with expectations. Deleting and creating a new one.'
                    .format(h5_new_dset.shape, source_dset.shape,
                            h5_new_dset.dtype, dtype))
                del h5_new_dset, h5_group[dset_name]
                h5_new_dset = h5_group.create_dataset(
                    dset_name,
                    shape=source_dset.shape,
                    dtype=dtype,
                    chunks=source_dset.chunks)
        else:
            raise KeyError('{} is already a {} in group: {}'.format(
                dset_name, type(h5_group[dset_name]), h5_group.name))

    else:
        h5_new_dset = h5_group.create_dataset(dset_name,
                                              shape=source_dset.shape,
                                              dtype=dtype,
                                              chunks=source_dset.chunks)

    # This should link the ancillary datasets correctly
    h5_new_dset = copy_attributes(source_dset,
                                  h5_new_dset,
                                  skip_refs=skip_refs)
    h5_new_dset.attrs.update(new_attrs)

    if check_if_main(h5_new_dset):
        h5_new_dset = USIDataset(h5_new_dset)
        # update book keeping attributes
        write_book_keeping_attrs(h5_new_dset)

    return h5_new_dset
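A short usage sketch for create_empty_dataset, assuming an HDF5 file that already holds a source dataset; the file name and internal path are hypothetical.

import h5py
import numpy as np

# Hypothetical file containing a source dataset to mirror
with h5py.File('experiment.h5', mode='r+') as h5_f:
    h5_source = h5_f['Measurement_000/Channel_000/Raw_Data']

    # Pre-allocate an empty float32 dataset of the same shape next to the source
    h5_guess = create_empty_dataset(h5_source, np.float32, 'Guess',
                                    new_attrs={'note': 'allocated by create_empty_dataset example'})
    print(h5_guess.shape, h5_guess.dtype)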
Example #21
0
    def do_fit(self,
               processors=None,
               solver_type='least_squares',
               solver_options=None,
               obj_func=None,
               h5_partial_fit=None,
               h5_guess=None,
               override=False):
        """
        Generates the fit for the given dataset and writes back to file

        Parameters
        ----------
        processors : int
            Number of cpu cores the user wishes to run on.  The minimum of this and self._maxCpus is used.
        solver_type : str
            The name of the solver in scipy.optimize to use for the fit
        solver_options : dict
            Dictionary of parameters to pass to the solver specified by `solver_type`
        obj_func : dict
            Dictionary defining the class and method containing the function to be fit as well as any 
            additional function parameters.
        h5_partial_fit : h5py.Dataset, optional. Default = None
            Dataset containing partially computed fit results. do_fit will resume computation if provided.
        h5_guess : h5py.Dataset, optional. Default = None
            Dataset containing completed guess results. do_fit will use this if provided.
        override : bool, optional. default = False
            By default, will simply return duplicate results to avoid recomputing or resume computation on a
            group with partial results. Set to True to force fresh computation.

        Returns
        -------
        h5_results : h5py.Dataset object
            Dataset with the fit parameters
        """

        # ################## PREPARE THE SOLVER #######################################

        legit_solver = solver_type in scipy.optimize.__dict__.keys()

        if not legit_solver:
            raise KeyError(
                'Error: Solver "%s" does not exist! For additional info see scipy.optimize'
                % solver_type)

        obj_func_name = obj_func['obj_func']
        legit_obj_func = obj_func_name in Fit_Methods().methods

        if not legit_obj_func:
            raise KeyError(
                'Error: Objective function "%s" is not implemented in pycroscopy.analysis.Fit_Methods'
                % obj_func_name)

        # ################## CHECK FOR DUPLICATES AND RESUME PARTIAL #######################################

        def _get_group_to_resume(legal_groups, provided_partial_fit):
            for h5_group in legal_groups:
                if h5_group['Fit'] == provided_partial_fit:
                    return h5_group
            return None

        def _resume_fit(fitter, h5_group):
            fitter.h5_guess = h5_group['Guess']
            fitter.h5_fit = h5_group['Fit']
            fitter._start_pos = fitter.h5_fit.attrs['last_pixel']

        def _start_fresh_fit(fitter, h5_guess_legal):
            fitter.h5_guess = h5_guess_legal
            fitter._create_fit_datasets()
            fitter._start_pos = 0

        # Prepare the parms dict that will be used for comparison:
        self._parms_dict = solver_options.copy()
        self._parms_dict.update({'solver_type': solver_type})
        self._parms_dict.update(obj_func)

        completed_guess, partial_fit_groups, completed_fits = self._check_for_old_fit(
        )

        override = override or (h5_partial_fit is not None
                                or h5_guess is not None)

        if not override:
            # First try to simply return completed results
            if len(completed_fits) > 0:
                print('Returned previously computed results at ' +
                      completed_fits[-1].name)
                self.h5_fit = USIDataset(completed_fits[-1])
                return

            # Next, attempt to resume automatically:
            elif len(partial_fit_groups) > 0:
                print(
                    'Will resume fitting in {}. '
                    'You can supply a dataset using the h5_partial_fit argument'
                    .format(partial_fit_groups[-1].name))
                _resume_fit(self, partial_fit_groups[-1])

            # Finally, attempt to do fresh fitting using completed Guess:
            elif len(completed_guess) > 0:
                print('Will use {} for generating new Fit. '
                      'You can supply a dataset using the h5_guess argument'.
                      format(completed_guess[-1].name))
                _start_fresh_fit(self, completed_guess[-1])

            else:
                raise ValueError(
                    'Could not find a compatible Guess to use for Fit. Call do_guess() before do_fit()'
                )

        else:
            if h5_partial_fit is not None:
                h5_group = _get_group_to_resume(partial_fit_groups,
                                                h5_partial_fit)
                if h5_group is None:
                    raise ValueError(
                        'Provided dataset with partial Fit was not found to be compatible'
                    )
                _resume_fit(self, h5_group)

            elif h5_guess is not None:
                if h5_guess not in completed_guess:
                    raise ValueError(
                        'Provided dataset with completed Guess was not found to be compatible'
                    )
                _start_fresh_fit(self, h5_guess)

            else:
                raise ValueError(
                    'Please provide a completed guess or partially completed Fit to resume'
                )

        # ################## BEGIN THE ACTUAL FITTING #######################################

        print("Using solver %s and objective function %s to fit your data\n" %
              (solver_type, obj_func['obj_func']))

        if processors is None:
            processors = self._maxCpus
        else:
            processors = min(processors, self._maxCpus)
        processors = recommend_cpu_cores(self._max_pos_per_read,
                                         processors,
                                         verbose=self._verbose)

        time_per_pix = 0
        num_pos = self.h5_main.shape[0] - self._start_pos
        orig_start_pos = self._start_pos

        print(
            'You can abort this computation at any time and resume at a later time!\n'
            '\tIf you are operating in a python console, press Ctrl+C or Cmd+C to abort\n'
            '\tIf you are in a Jupyter notebook, click on "Kernel">>"Interrupt"\n'
        )

        self._get_guess_chunk()
        self._get_data_chunk()

        while self.data is not None:

            t_start = tm.time()

            opt = Optimize(data=self.data,
                           guess=self.guess,
                           parallel=self._parallel)
            temp = opt.computeFit(processors=processors,
                                  solver_type=solver_type,
                                  solver_options=solver_options,
                                  obj_func=obj_func.copy())

            # TODO: need a different .reformatResults to process fitting results
            # reorder to get one numpy array out
            temp = self._reformat_results(temp, obj_func_name)
            self.fit = np.hstack(tuple(temp))

            # Write to file
            self._set_results(is_guess=False)

            # basic timing logs
            tot_time = np.round(tm.time() - t_start, decimals=2)  # in seconds
            if self._verbose:
                print('Done parallel computing in {} or {} per pixel'.format(
                    format_time(tot_time),
                    format_time(tot_time / self.data.shape[0])))
            if self._start_pos == orig_start_pos:
                time_per_pix = tot_time / self._end_pos  # in seconds
            else:
                time_remaining = (num_pos -
                                  self._end_pos) * time_per_pix  # in seconds
                print('Time remaining: ' + format_time(time_remaining))

            # get next batch of data
            self._get_guess_chunk()
            self._get_data_chunk()

        print('Completed computing fit. Writing to file.')

        return USIDataset(self.h5_fit)
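To make the expected argument structure concrete, here is a small sketch of the solver_type, solver_options and obj_func inputs that do_fit validates above; the objective-function name and the extra keys in obj_func are hypothetical and must correspond to an entry in Fit_Methods().methods.

import numpy as np
import scipy.optimize

solver_type = 'least_squares'                # must be a callable defined in scipy.optimize
solver_options = {'jac': '2-point'}          # forwarded verbatim to the chosen scipy solver
obj_func = {'class': 'Fit_Methods',          # class assumed to hold the objective function
            'obj_func': 'SHO',               # hypothetical method name; must be in Fit_Methods().methods
            'xvals': np.arange(128)}         # any extra parameters the objective function needs

# do_fit performs the same sanity check on the solver before starting the computation
assert solver_type in scipy.optimize.__dict__, 'solver must exist in scipy.optimize'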
Example #22
0
    def do_guess(self,
                 processors=None,
                 strategy=None,
                 options=dict(),
                 h5_partial_guess=None,
                 override=False):
        """
        Parameters
        ----------
        strategy: string (optional)
            Default is 'Wavelet_Peaks'.
            Can be one of ['wavelet_peaks', 'relative_maximum', 'gaussian_processes'].
            For updated list, run GuessMethods.methods
        processors : int (optional)
            Number of cores to use for computing. Default = all available - 2 cores
        options: dict
            Default, options for wavelet_peaks {"peaks_widths": np.array([10,200]), "peak_step":20}.
            Dictionary of options passed to strategy. For more info see GuessMethods documentation.
        h5_partial_guess : h5py.Dataset, optional. Default = None
            Dataset containing partially computed guess results. do_guess will resume computation if provided.
        override : bool, optional. default = False
            By default, will simply return duplicate results to avoid recomputing or resume computation on a
            group with partial results. Set to True to force fresh computation.

        Returns
        -------
        h5_guess : h5py.Dataset
            Dataset containing guesses that can be passed on to do_fit()
        """
        gm = GuessMethods()
        if strategy not in gm.methods:
            raise KeyError(
                'Error: %s is not implemented in pycroscopy.analysis.GuessMethods to find guesses'
                % strategy)

        # ################## CHECK FOR DUPLICATES AND RESUME PARTIAL #######################################

        # Prepare the parms dict that will be used for comparison:
        self._parms_dict = options.copy()
        self._parms_dict.update({'strategy': strategy})

        # check for old:
        partial_dsets, completed_dsets = self._check_for_old_guess()

        if len(completed_dsets) == 0 and len(partial_dsets) == 0:
            print('No existing datasets found')
            override = True

        if not override:
            # First try to simply return any completed computation
            if len(completed_dsets) > 0:
                print('Returned previously computed results at ' +
                      completed_dsets[-1].name)
                self.h5_guess = USIDataset(completed_dsets[-1])
                return

            # Next attempt to resume automatically if nothing is provided
            if len(partial_dsets) > 0:
                # attempt to use whatever the user provided (if legal)
                target_partial_dset = partial_dsets[-1]
                if h5_partial_guess is not None:
                    if not isinstance(h5_partial_guess, h5py.Dataset):
                        raise ValueError(
                            'Provided parameter is not an h5py.Dataset object')
                    if h5_partial_guess not in partial_dsets:
                        raise ValueError(
                            'Provided dataset for partial Guesses is not compatible'
                        )
                    if self._verbose:
                        print('Provided partial Guess dataset was acceptable')
                    target_partial_dset = h5_partial_guess

                # Finally resume from this dataset
                print('Resuming computation in group: ' +
                      target_partial_dset.name)
                self.h5_guess = target_partial_dset
                self._start_pos = target_partial_dset.attrs['last_pixel']

        # No completed / partials available or forced via override:
        if self.h5_guess is None:
            if self._verbose:
                print('Starting a fresh computation!')
            self._start_pos = 0
            self._create_guess_datasets()

        # ################## BEGIN THE ACTUAL COMPUTING #######################################

        if processors is None:
            processors = self._maxCpus
        else:
            processors = min(int(processors), self._maxCpus)
        processors = recommend_cpu_cores(self._max_pos_per_read,
                                         processors,
                                         verbose=self._verbose)

        print("Using %s to find guesses...\n" % strategy)

        time_per_pix = 0
        num_pos = self.h5_main.shape[0] - self._start_pos
        orig_start_pos = self._start_pos

        print(
            'You can abort this computation at any time and resume at a later time!\n'
            '\tIf you are operating in a python console, press Ctrl+C or Cmd+C to abort\n'
            '\tIf you are in a Jupyter notebook, click on "Kernel">>"Interrupt"\n'
        )

        self._get_data_chunk()
        while self.data is not None:

            t_start = tm.time()

            opt = Optimize(data=self.data, parallel=self._parallel)
            temp = opt.computeGuess(processors=processors,
                                    strategy=strategy,
                                    options=options)

            # reorder to get one numpy array out
            temp = self._reformat_results(temp, strategy)
            self.guess = np.hstack(tuple(temp))

            # Write to file
            self._set_results(is_guess=True)

            # basic timing logs
            tot_time = np.round(tm.time() - t_start, decimals=2)  # in seconds
            if self._verbose:
                print('Done parallel computing in {} or {} per pixel'.format(
                    format_time(tot_time),
                    format_time(tot_time / self.data.shape[0])))
            if self._start_pos == orig_start_pos:
                time_per_pix = tot_time / self._end_pos  # in seconds
            else:
                time_remaining = (num_pos -
                                  self._end_pos) * time_per_pix  # in seconds
                print('Time remaining: ' + format_time(time_remaining))

            # get next batch of data
            self._get_data_chunk()

        print('Completed computing guess')
        print()
        return USIDataset(self.h5_guess)
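A workflow sketch tying do_guess and do_fit together; it assumes `fitter` is an instance of the class that defines these methods (its construction is intentionally omitted), and the objective-function entries are hypothetical placeholders.

import numpy as np


def run_guess_and_fit(fitter, num_procs=2):
    """Assumed workflow: run do_guess() first, then do_fit() on the same fitter instance."""
    h5_guess = fitter.do_guess(strategy='wavelet_peaks',
                               options={'peaks_widths': np.array([10, 200]), 'peak_step': 20},
                               processors=num_procs)
    h5_fit = fitter.do_fit(processors=num_procs,
                           solver_type='least_squares',
                           solver_options={},
                           obj_func={'class': 'Fit_Methods', 'obj_func': 'SHO'})
    return h5_guess, h5_fit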
Example #23
0
    def write_images(self):
        if bool(self.img_desc):
            for img_f, descriptors in self.img_desc.items():
                #check for existing spectrogram or image and link position/spec inds/vals
                #at most two channels worth of data need to be checked
                try:
                    str_main = str(
                        usid.hdf_utils.get_all_main(
                            self.h5_f['Measurement_000/Channel_000']))
                    i_beg = str_main.find('located at: \n\t') + 14
                    i_end = str_main.find('\nData contains') - 1
                    data_loc = str_main[i_beg:i_end]
                    channel_data = USIDataset(self.h5_f[data_loc])
                    h5_pos_inds = channel_data.h5_pos_inds
                    h5_pos_vals = channel_data.h5_pos_vals
                    pos_dims = None
                    write_pos_vals = False
                    if channel_data.spec_dim_sizes[0] == 1:
                        h5_spec_inds = channel_data.h5_spec_inds
                        h5_spec_vals = channel_data.h5_spec_vals
                        spec_dims = None
                    #if channel 000 is spectrogram, check next dataset
                    elif channel_data.spec_dim_sizes[0] != 1:
                        str_main = str(
                            usid.hdf_utils.get_all_main(
                                self.h5_f['Measurement_000/Channel_001']))
                        i_beg = str_main.find('located at: \n\t') + 14
                        i_end = str_main.find('\nData contains') - 1
                        data_loc = str_main[i_beg:i_end]
                        channel_data = USIDataset(self.h5_f[data_loc])
                        #channel data is an image, & we link their spec inds/vals
                        if channel_data.spec_dim_sizes[0] == 1:
                            h5_spec_inds = channel_data.h5_spec_inds
                            h5_spec_vals = channel_data.h5_spec_vals
                            spec_dims = None

                #in case where channel does not exist, we make new spec/pos inds/vals
                except KeyError:
                    #pos dims
                    h5_pos_inds = None
                    h5_pos_vals = None
                    pos_dims = self.pos_dims
                    write_pos_vals = True
                    #spec dims
                    h5_spec_inds = None
                    h5_spec_vals = None
                    spec_dims = usid.write_utils.Dimension('arb', 'a.u', 1)

                channel_i = usid.hdf_utils.create_indexed_group(
                    self.h5_meas_grp, 'Channel_')
                h5_raw = usid.hdf_utils.write_main_dataset(
                    channel_i,  #parent HDF5 group
                    (self.x_len * self.y_len, 1),  # shape of Main dataset
                    'Raw_' + descriptors[0].replace('-', '_'),
                    # Name of main dataset
                    descriptors[0],
                    # Physical quantity contained in Main dataset
                    descriptors[2],  # Units for the physical quantity
                    h5_pos_inds=h5_pos_inds,
                    h5_pos_vals=h5_pos_vals,
                    # Position dimensions
                    pos_dims=pos_dims,
                    # Spectroscopic dimensions
                    h5_spec_inds=h5_spec_inds,
                    h5_spec_vals=h5_spec_vals,
                    spec_dims=spec_dims,
                    dtype=np.float32,  # data type / precision
                    main_dset_attrs={
                        'Caption': descriptors[0],
                        'Scale': descriptors[1],
                        'Physical_Units': descriptors[2],
                        'Offset': descriptors[3]
                    })
                h5_raw[:, :] = self.imgs[img_f].reshape(h5_raw.shape)
                if write_pos_vals:
                    h5_raw.h5_pos_vals[:, :] = self.pos_val
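For readability, a hypothetical example of the self.img_desc mapping this method iterates over, inferred from how descriptors[0] through descriptors[3] are used above; the file names and values are made up.

# Hypothetical structure of self.img_desc: file name -> (caption, scale, physical units, offset),
# matching the descriptors[0..3] indexing used in write_images() above
img_desc = {
    'Topography.tif': ('Topography', 1.0e-9, 'm', 0.0),
    'Deflection.tif': ('Deflection', 1.0, 'V', 0.0),
}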
Example #24
0
def rebuild_svd(h5_main, components=None, cores=None, max_RAM_mb=1024):
    """
    Rebuild the Image from the SVD results on the windows
    Optionally, only use components less than n_comp.

    :param h5_main: dataset which SVD was performed on
    :type h5_main: hdf5 Dataset
    
    :param components: 
        Defines which components to keep
        Default - None, all components kept

        Input Types
        integer : Components less than the input will be kept
        length 2 iterable of integers : Integers define start and stop of component slice to retain
        other iterable of integers or slice : Selection of component indices to retain
    :type components: {int, iterable of int, slice} optional

    :param cores: How many cores should be used to rebuild
        Default - None, all but 2 cores will be used, min 1
    :type cores: int, optional
    
    :param max_RAM_mb: Maximum amount of memory to use when rebuilding, in Mb.
        Default - 1024 Mb
    :type max_RAM_mb: int, optional
    
    :raise: KeyError if SVD results not found 

    :returns: rebuilt dataset
    :rtype: HDF5 Dataset

    """

    if not isinstance(h5_main, USIDataset):
        h5_main = USIDataset(h5_main)

    comp_slice, num_comps = get_component_slice(
        components, total_components=h5_main.shape[1])
    if isinstance(comp_slice, np.ndarray):
        comp_slice = list(comp_slice)
    dset_name = h5_main.name.split('/')[-1]

    # Ensuring that at least one core is available for use / 2 cores are available for other use
    max_cores = max(1, cpu_count() - 2)
    if cores is not None:
        cores = min(round(abs(cores)), max_cores)
    else:
        cores = max_cores

    max_memory = min(max_RAM_mb * 1024**2, 0.75 * get_available_memory())
    if cores != 1:
        max_memory = int(max_memory / 2)
    '''
    Get the handles for the SVD results
    '''
    try:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]

        h5_S = h5_svd_group['S']
        h5_U = h5_svd_group['U']
        h5_V = h5_svd_group['V']

    except KeyError:
        raise KeyError(
            'SVD Results for {dset} were not found.'.format(dset=dset_name))
    except:
        raise

    func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_V)
    '''
    Calculate the size of a single batch that will fit in the available memory
    '''
    n_comps = h5_S[comp_slice].size
    mem_per_pix = (h5_U.dtype.itemsize +
                   h5_V.dtype.itemsize * h5_V.shape[1]) * n_comps
    fixed_mem = h5_main.size * h5_main.dtype.itemsize

    if cores is None:
        free_mem = max_memory - fixed_mem
    else:
        free_mem = max_memory * 2 - fixed_mem

    batch_size = int(round(float(free_mem) / mem_per_pix))

    if batch_size < 1:
        print('Computed batch size was not positive ({}). Falling back to 100 positions per batch.'.format(batch_size))
        batch_size = 100

    batch_slices = gen_batches(h5_U.shape[0], batch_size)

    print('Reconstructing in batches of {} positions.'.format(batch_size))
    print('Batches should be {} Mb each.'.format(mem_per_pix * batch_size /
                                                 1024.0**2))
    '''
    Loop over all batches.
    '''
    ds_V = np.dot(np.diag(h5_S[comp_slice]), func(h5_V[comp_slice, :]))
    rebuild = np.zeros((h5_main.shape[0], ds_V.shape[1]))
    for ibatch, batch in enumerate(batch_slices):
        rebuild[batch, :] += np.dot(h5_U[batch, comp_slice], ds_V)

    rebuild = stack_real_to_target_dtype(rebuild, h5_V.dtype)

    print(
        'Completed reconstruction of data from SVD results.  Writing to file.')
    '''
    Create the Group and dataset to hold the rebuild data
    '''
    rebuilt_grp = create_indexed_group(h5_svd_group, 'Rebuilt_Data')
    h5_rebuilt = write_main_dataset(rebuilt_grp,
                                    rebuild,
                                    'Rebuilt_Data',
                                    get_attr(h5_main, 'quantity'),
                                    get_attr(h5_main, 'units'),
                                    None,
                                    None,
                                    h5_pos_inds=h5_main.h5_pos_inds,
                                    h5_pos_vals=h5_main.h5_pos_vals,
                                    h5_spec_inds=h5_main.h5_spec_inds,
                                    h5_spec_vals=h5_main.h5_spec_vals,
                                    chunks=h5_main.chunks,
                                    compression=h5_main.compression)

    if isinstance(comp_slice, slice):
        rebuilt_grp.attrs['components_used'] = '{}-{}'.format(
            comp_slice.start, comp_slice.stop)
    else:
        rebuilt_grp.attrs['components_used'] = components

    copy_attributes(h5_main, h5_rebuilt, skip_refs=False)

    h5_main.file.flush()

    print('Done writing reconstructed data to file.')

    return h5_rebuilt
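A minimal usage sketch for rebuild_svd as defined above, assuming the file already contains an SVD results group for the chosen main dataset; the file name and dataset path are hypothetical.

import h5py
from pyUSID import USIDataset

# Hypothetical file that already contains SVD results for Raw_Data
with h5py.File('experiment.h5', mode='r+') as h5_f:
    h5_main = USIDataset(h5_f['Measurement_000/Channel_000/Raw_Data'])

    # Reconstruct the data from the first 16 components only
    h5_rebuilt = rebuild_svd(h5_main, components=16, cores=2, max_RAM_mb=512)
    print(h5_rebuilt.name)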
Example #25
0
class SVD(Process):
    """
    This class provides a file-wrapper around the :meth:`sklearn.utils.extmath.randomized_svd` function.
    In other words, it extracts and then reformats the data present in the provided :class:`pyUSID.USIDataset` object,
    performs the randomized SVD operation and writes the results back to the USID HDF5 file after
    formatting the results in an USID compliant manner.
    """
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to h5 file.
        
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
            
        :param h5_main: USID Main HDF5 dataset that will be decomposed
        :type h5_main: :class:`pyUSID.USIDataset` object
        
        :param num_components: Number of components to decompose h5_main into.  Default None.
        :type num_components: int, optional
        
        :param kwargs: Arguments to be sent to Process
        :type kwargs:
            
        """
        super(SVD, self).__init__(h5_main, 'SVD', **kwargs)
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates(
        )

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None

    def test(self, override=False):
        """
        Applies randomised SVD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to
        write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the
        input matrix.

        :param override: Set to true to recompute results if prior results are available. Else, returns existing results
        :type override: bool, optional. default = False
            
        :returns: tuple (u_mat, self.__s, v_mat)
            WHERE
            numpy.ndarray u_mat is abundance matrix
            numpy.ndarray self.__s is variance vector
            numpy.ndarray v_mat is eigenvector matrix
        """
        '''
        Check if a number of components has been set and ensure that the number is less than
        the minimum axis length of the data.  If both conditions are met, use fsvd.  If not
        use the regular svd.

        C.Smith -- We might need to put a lower limit on num_comps in the future.  I don't
                   know enough about svd to be sure.
        '''
        if not override:
            if isinstance(self.duplicate_h5_groups,
                          list) and len(self.duplicate_h5_groups) > 0:
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(
                    self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \
                       reshape_to_n_dims(self.h5_results_grp['V'])[0]

        self.h5_results_grp = None

        t1 = time.time()

        self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(
            self.h5_main),
                                                      self.num_components,
                                                      n_iter=3)
        self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype)

        print('Took {} to compute randomized SVD'.format(
            format_time(time.time() - t1)))

        u_mat, success = reshape_to_n_dims(self.__u,
                                           h5_pos=self.h5_main.h5_pos_inds,
                                           h5_spec=np.expand_dims(np.arange(
                                               self.__u.shape[1]),
                                                                  axis=0))
        if not success:
            raise ValueError(
                'Could not reshape U to N-Dimensional dataset! Error:' +
                success)

        # When the source dataset has a singular valued spectroscopic dimension
        # stack_real_to_target causes V to lose all its dimensions
        if self.__v.ndim == 0:
            # However, we want V to be 2D:
            self.__v = np.atleast_2d(self.__v)

        v_mat, success = reshape_to_n_dims(self.__v,
                                           h5_pos=np.expand_dims(np.arange(
                                               self.__u.shape[1]),
                                                                 axis=1),
                                           h5_spec=self.h5_main.h5_spec_inds)
        if not success:
            raise ValueError(
                'Could not reshape V to N-Dimensional dataset! Error:' +
                success)

        return u_mat, self.__s, v_mat

    def compute(self, override=False):
        """
        Computes SVD (by calling test() if it has not already been called) and writes results to file.
        Consider calling test() to check results before writing to file. Results are deleted from memory
        upon writing to the HDF5 file

        :param override: Set to true to recompute results if prior results are available. Else, returns existing results
        :type override : bool, optional. default = False
            
        :returns: HDF5 Group containing all the results
        :rtype: h5py.Group object
            
        """
        if self.__u is None and self.__v is None and self.__s is None:
            self.test(override=override)

        if self.h5_results_grp is None:
            self._write_results_chunk()
            self.delete_results()

        h5_group = self.h5_results_grp

        return h5_group

    def delete_results(self):
        """
        Deletes results from memory.
        """
        del self.__u, self.__s, self.__v
        self.__u = None
        self.__v = None
        self.__s = None

    def _write_results_chunk(self):
        """
        Writes the provided SVD results to file

        Parameters
        ----------
        """
        comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s))

        h5_svd_group = create_results_group(
            self.h5_main,
            self.process_name,
            h5_parent_group=self._h5_target_group)
        self.h5_results_grp = h5_svd_group
        self._write_source_dset_provenance()

        write_simple_attrs(h5_svd_group, self.parms_dict)
        write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'})

        h5_u = write_main_dataset(h5_svd_group,
                                  np.float32(self.__u),
                                  'U',
                                  'Abundance',
                                  'a.u.',
                                  None,
                                  comp_dim,
                                  h5_pos_inds=self.h5_main.h5_pos_inds,
                                  h5_pos_vals=self.h5_main.h5_pos_vals,
                                  dtype=np.float32,
                                  chunks=calc_chunks(self.__u.shape,
                                                     np.float32(0).itemsize))
        # print(get_attr(self.h5_main, 'quantity')[0])
        h5_v = write_main_dataset(h5_svd_group,
                                  self.__v,
                                  'V',
                                  get_attr(self.h5_main, 'quantity')[0],
                                  'a.u.',
                                  comp_dim,
                                  None,
                                  h5_spec_inds=self.h5_main.h5_spec_inds,
                                  h5_spec_vals=self.h5_main.h5_spec_vals,
                                  chunks=calc_chunks(
                                      self.__v.shape,
                                      self.h5_main.dtype.itemsize))

        # No point making this 1D dataset a main dataset
        h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s))
        '''
        Check h5_main for plot group references.
        Copy them into V if they exist
        '''
        for key in self.h5_main.attrs.keys():
            if '_Plot_Group' not in key:
                continue

            ref_inds = get_indices_for_region_ref(self.h5_main,
                                                  self.h5_main.attrs[key],
                                                  return_method='corners')
            ref_inds = ref_inds.reshape([-1, 2, 2])
            ref_inds[:, 1, 0] = h5_v.shape[0] - 1

            svd_ref = create_region_reference(h5_v, ref_inds)

            h5_v.attrs[key] = svd_ref

        # Marking completion:
        self._status_dset_name = 'completed_positions'
        self._h5_status_dset = h5_svd_group.create_dataset(
            self._status_dset_name,
            data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
        # keeping legacy option:
        h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]

    def _check_available_mem(self):
        """
        Check that there is enough memory to perform the SVD decomposition.
        
        :raise: MemoryError if not enough memory found
        
        :returns: True if enough memory is found, False otherwise.
        :rtype: bool
            

        """
        if self.verbose:
            print('Checking memory availability.')
        n_samples, n_features = self.h5_main.shape
        s_mem_per_comp = np.float32(0).itemsize
        u_mem_per_comp = np.float32(0).itemsize * n_samples
        v_mem_per_comp = self.h5_main.dtype.itemsize * n_features

        mem_per_comp = s_mem_per_comp + u_mem_per_comp + v_mem_per_comp
        max_mem = get_available_memory()
        avail_mem = 0.75 * max_mem
        free_mem = avail_mem - self.h5_main.__sizeof__()

        if free_mem <= 0:
            error_message = 'Cannot load main dataset into memory.\n' + \
                            'Available memory is {}.  Dataset needs {}.'.format(avail_mem,
                                                                                self.h5_main.__sizeof__())
            raise MemoryError(error_message)

        if self.verbose:
            print('Memory available for SVD is {}.'.format(free_mem))
            print('Memory needed per component is {}.'.format(mem_per_comp))

        cant_svd = (free_mem - self.num_components * mem_per_comp) <= 0

        if cant_svd:
            max_comps = int(np.floor(free_mem / mem_per_comp))
            error_message = 'Not enough free memory for performing SVD with requested number of components.\n' + \
                            'Maximum possible number of components is {}.'.format(max_comps)
            raise MemoryError(error_message)
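A usage sketch for the SVD process class defined above, assuming a USID-formatted HDF5 file; the file name and dataset path are hypothetical.

import h5py
from pyUSID import USIDataset

# Hypothetical USID HDF5 file; point this at your own main dataset
with h5py.File('experiment.h5', mode='r+') as h5_f:
    h5_main = USIDataset(h5_f['Measurement_000/Channel_000/Raw_Data'])

    svd_proc = SVD(h5_main, num_components=32)
    u_mat, s_vec, v_mat = svd_proc.test()   # inspect results without writing to file
    h5_svd_grp = svd_proc.compute()         # write U, S and V back to the HDF5 file
    print(h5_svd_grp.name)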
Example #26
0
class SVD(Process):
    """
    This class provides a file-wrapper around the :meth:`sklearn.utils.extmath.randomized_svd` function.
    In other words, it extracts and then reformats the data present in the provided :class:`pyUSID.USIDataset` object,
    performs the randomized SVD operation and writes the results back to the USID HDF5 file after
    formatting the results in an USID compliant manner.
    """

    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to h5 file.

        Parameters
        ----------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset that will be decomposed
        num_components : int, optional
            Number of components to decompose h5_main into.  Default None.
        kwargs
            Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, **kwargs)
        self.process_name = 'SVD'

        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None

    def test(self, override=False):
        """
        Applies randomised SVD to the dataset. This function does NOT write results to the hdf5 file. Call compute() to
        write to the file. Handles complex, compound datasets such that the V matrix is of the same data-type as the
        input matrix.

        Parameters
        ----------
        override : bool, optional. default = False
            Set to true to recompute results if prior results are available. Else, returns existing results

        Returns
        -------
        U : :class:`numpy.ndarray`
            Abundance matrix
        S : :class:`numpy.ndarray`
            variance vector
        V : :class:`numpy.ndarray`
            eigenvector matrix
        """
        '''
        Check if a number of components has been set and ensure that the number is less than
        the minimum axis length of the data.  If both conditions are met, use fsvd.  If not
        use the regular svd.

        C.Smith -- We might need to put a lower limit on num_comps in the future.  I don't
                   know enough about svd to be sure.
        '''
        if not override:
            if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0:
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \
                       reshape_to_n_dims(self.h5_results_grp['V'])[0]

        self.h5_results_grp = None

        t1 = time.time()

        self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(self.h5_main), self.num_components,
                                                      n_iter=3)
        self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype)

        print('Took {} to compute randomized SVD'.format(format_time(time.time() - t1)))

        u_mat, success = reshape_to_n_dims(self.__u, h5_pos=self.h5_main.h5_pos_inds,
                                           h5_spec=np.expand_dims(np.arange(self.__u.shape[1]), axis=0))
        if not success:
            raise ValueError('Could not reshape U to N-Dimensional dataset! Error:' + success)

        v_mat, success = reshape_to_n_dims(self.__v, h5_pos=np.expand_dims(np.arange(self.__u.shape[1]), axis=1),
                                           h5_spec=self.h5_main.h5_spec_inds)
        if not success:
            raise ValueError('Could not reshape V to N-Dimensional dataset! Error:' + success)

        return u_mat, self.__s, v_mat

    def compute(self, override=False):
        """
        Computes SVD (by calling test() if it has not already been called) and writes results to file.
        Consider calling test() to check results before writing to file. Results are deleted from memory
        upon writing to the HDF5 file

        Parameters
        ----------
        override : bool, optional. default = False
            Set to true to recompute results if prior results are available. Else, returns existing results

        Returns
        -------
        h5_results_grp : :class:`h5py.Group` object
            HDF5 Group containing all the results
        """
        if self.__u is None and self.__v is None and self.__s is None:
            self.test(override=override)

        if self.h5_results_grp is None:
            self._write_results_chunk()
            self.delete_results()

        h5_group = self.h5_results_grp

        return h5_group

    def delete_results(self):
        """
        Deletes results from memory.
        """
        del self.__u, self.__s, self.__v
        self.__u = None
        self.__v = None
        self.__s = None

    def _write_results_chunk(self):
        """
        Writes the provided SVD results to file

        Parameters
        ----------
        """
        comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s))

        h5_svd_group = create_results_group(self.h5_main, self.process_name)
        self.h5_results_grp = h5_svd_group

        write_simple_attrs(h5_svd_group, self.parms_dict)
        write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'})

        h5_u = write_main_dataset(h5_svd_group, np.float32(self.__u), 'U', 'Abundance', 'a.u.', None, comp_dim,
                                  h5_pos_inds=self.h5_main.h5_pos_inds, h5_pos_vals=self.h5_main.h5_pos_vals,
                                  dtype=np.float32, chunks=calc_chunks(self.__u.shape, np.float32(0).itemsize))
        # print(get_attr(self.h5_main, 'quantity')[0])
        h5_v = write_main_dataset(h5_svd_group, self.__v, 'V', get_attr(self.h5_main, 'quantity')[0],
                                  'a.u.', comp_dim, None, h5_spec_inds=self.h5_main.h5_spec_inds,
                                  h5_spec_vals=self.h5_main.h5_spec_vals,
                                  chunks=calc_chunks(self.__v.shape, self.h5_main.dtype.itemsize))

        # No point making this 1D dataset a main dataset
        h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s))

        '''
        Check h5_main for plot group references.
        Copy them into V if they exist
        '''
        for key in self.h5_main.attrs.keys():
            if '_Plot_Group' not in key:
                continue

            ref_inds = get_indices_for_region_ref(self.h5_main, self.h5_main.attrs[key], return_method='corners')
            ref_inds = ref_inds.reshape([-1, 2, 2])
            ref_inds[:, 1, 0] = h5_v.shape[0] - 1

            svd_ref = create_region_reference(h5_v, ref_inds)

            h5_v.attrs[key] = svd_ref

        # Marking completion:
        self._status_dset_name = 'completed_positions'
        self._h5_status_dset = h5_svd_group.create_dataset(self._status_dset_name,
                                                           data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
        # keeping legacy option:
        h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]

    def _check_available_mem(self):
        """
        Check that there is enough memory to perform the SVD decomposition.

        Returns
        -------
        sufficient_mem : bool
            True if enough memory is found, False otherwise.

        """
        if self.verbose:
            print('Checking memory availability.')
        n_samples, n_features = self.h5_main.shape
        s_mem_per_comp = np.float32(0).itemsize
        u_mem_per_comp = np.float32(0).itemsize * n_samples
        v_mem_per_comp = self.h5_main.dtype.itemsize * n_features

        mem_per_comp = s_mem_per_comp + u_mem_per_comp + v_mem_per_comp
        avail_mem = 0.75 * self._max_mem_mb * 1024 ** 2
        free_mem = avail_mem - self.h5_main.__sizeof__()

        if free_mem <= 0:
            error_message = 'Cannot load main dataset into memory.\n' + \
                            'Available memory is {}.  Dataset needs {}.'.format(avail_mem,
                                                                                self.h5_main.__sizeof__())
            raise MemoryError(error_message)

        if self.verbose:
            print('Memory available for SVD is {}.'.format(free_mem))
            print('Memory needed per component is {}.'.format(mem_per_comp))

        cant_svd = (free_mem - self.num_components * mem_per_comp) <= 0

        if cant_svd:
            max_comps = int(np.floor(free_mem / mem_per_comp))
            error_message = 'Not enough free memory for performing SVD with requested number of components.\n' + \
                            'Maximum possible number of components is {}.'.format(max_comps)
            raise MemoryError(error_message)
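Once compute() has run, the results live in the group written by _write_results_chunk(). A short sketch of reading them back, assuming the default pyUSID naming convention for results groups; the exact file name and path are hypothetical.

import h5py
import numpy as np
from pyUSID import USIDataset

# Hypothetical location of a completed SVD results group
with h5py.File('experiment.h5', mode='r') as h5_f:
    h5_svd_grp = h5_f['Measurement_000/Channel_000/Raw_Data-SVD_000']

    h5_u = USIDataset(h5_svd_grp['U'])   # abundance maps (positions x components)
    s_vec = h5_svd_grp['S'][()]          # per-component weights (the "variance vector" above)
    h5_v = USIDataset(h5_svd_grp['V'])   # eigenvectors / endmembers (components x spectral points)

    # Relative weight of each component, e.g. for a quick scree-style inspection
    relative_weight = s_vec / np.sum(s_vec)
    print(relative_weight[:5])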