from __future__ import division, print_function, absolute_import

import time
import numpy as np
from sklearn.utils.extmath import randomized_svd

# NOTE: these helper functions have moved between pyUSID / sidpy releases;
# the import paths below match recent releases and may need adjusting for
# older installations.
from sidpy.hdf.hdf_utils import get_attr, write_simple_attrs
from sidpy.hdf.dtype_utils import check_dtype, stack_real_to_target_dtype
from sidpy.hdf.reg_ref import get_indices_for_region_ref, create_region_reference
from sidpy.base.string_utils import format_time
from sidpy.proc.comp_utils import get_available_memory

from pyUSID import Dimension, USIDataset
from pyUSID.processing.process import Process
from pyUSID.io.hdf_utils import reshape_to_n_dims, create_results_group, write_main_dataset
from pyUSID.io.write_utils import calc_chunks


class SVD(Process):
    """
    This class provides a file-wrapper around the
    :meth:`sklearn.utils.extmath.randomized_svd` function. In other words, it
    extracts and then reformats the data present in the provided
    :class:`pyUSID.USIDataset` object, performs the randomized SVD operation,
    and writes the results back to the USID HDF5 file after formatting the
    results in a USID-compliant manner.
    """

    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the
        results to the HDF5 file.

        :param h5_main: USID Main HDF5 dataset that will be decomposed
        :type h5_main: :class:`pyUSID.USIDataset` object
        :param num_components: Number of components to decompose h5_main into.
            Default None.
        :type num_components: int, optional
        :param h5_target_group: Location where to look for existing results
            and to place newly computed results. Use this kwarg if the results
            need to be written to a different HDF5 file. By default, this
            value is set to the parent group containing ``h5_main``.
        :type h5_target_group: :class:`h5py.Group`, optional. Default = None
        :param kwargs: Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, 'SVD', **kwargs)

        '''
        Calculate the size of the main data in memory and compare to max_mem.
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number
        # of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None

    def test(self, override=False):
        """
        Applies randomized SVD to the dataset. This function does NOT write
        results to the HDF5 file. Call :meth:`compute` to write to the file.
        Handles complex and compound datasets such that the V matrix is of the
        same data-type as the input matrix.

        :param override: Set to True to recompute results even if prior
            results are available. Else, returns existing results.
        :type override: bool, optional. Default = False
        :returns: tuple (u_mat, s, v_mat) where u_mat is the abundance matrix,
            s is the variance vector, and v_mat is the eigenvector matrix
        :rtype: tuple of :class:`numpy.ndarray`
        """
        '''
        Check if a number of components has been set and ensure that the
        number is less than the minimum axis length of the data. If both
        conditions are met, use fsvd. If not, use the regular svd.

        C.Smith -- We might need to put a lower limit on num_comps in the
        future. I don't know enough about svd to be sure.
        '''
        if not override:
            if isinstance(self.duplicate_h5_groups, list) and len(self.duplicate_h5_groups) > 0:
                self.h5_results_grp = self.duplicate_h5_groups[-1]
                print('Returning previously computed results from: {}'.format(self.h5_results_grp.name))
                print('set the "override" flag to True to recompute results')
                return reshape_to_n_dims(self.h5_results_grp['U'])[0], self.h5_results_grp['S'][()], \
                    reshape_to_n_dims(self.h5_results_grp['V'])[0]

        self.h5_results_grp = None

        t1 = time.time()

        self.__u, self.__s, self.__v = randomized_svd(self.data_transform_func(self.h5_main),
                                                      self.num_components, n_iter=3)
        self.__v = stack_real_to_target_dtype(self.__v, self.h5_main.dtype)

        print('Took {} to compute randomized SVD'.format(format_time(time.time() - t1)))

        u_mat, success = reshape_to_n_dims(self.__u, h5_pos=self.h5_main.h5_pos_inds,
                                           h5_spec=np.expand_dims(np.arange(self.__u.shape[1]), axis=0))
        if not success:
            raise ValueError('Could not reshape U to an N-dimensional dataset! Error: ' + str(success))

        # When the source dataset has a singular-valued spectroscopic
        # dimension, stack_real_to_target causes V to lose all its dimensions
        if self.__v.ndim == 0:
            # However, we want V to be 2D:
            self.__v = np.atleast_2d(self.__v)

        v_mat, success = reshape_to_n_dims(self.__v,
                                           h5_pos=np.expand_dims(np.arange(self.__u.shape[1]), axis=1),
                                           h5_spec=self.h5_main.h5_spec_inds)
        if not success:
            raise ValueError('Could not reshape V to an N-dimensional dataset! Error: ' + str(success))

        return u_mat, self.__s, v_mat

    def compute(self, override=False):
        """
        Computes SVD (by calling :meth:`test` if it has not already been
        called) and writes the results to file. Consider calling :meth:`test`
        to check the results before writing to file. Results are deleted from
        memory upon writing to the HDF5 file.

        :param override: Set to True to recompute results even if prior
            results are available. Else, returns existing results.
        :type override: bool, optional. Default = False
        :returns: HDF5 Group containing all the results
        :rtype: :class:`h5py.Group` object
        """
        if self.__u is None and self.__v is None and self.__s is None:
            self.test(override=override)

        if self.h5_results_grp is None:
            self._write_results_chunk()
            self.delete_results()

        h5_group = self.h5_results_grp

        return h5_group

    def delete_results(self):
        """
        Deletes results from memory.
        """
        del self.__u, self.__s, self.__v
        self.__u = None
        self.__v = None
        self.__s = None

    def _write_results_chunk(self):
        """
        Writes the SVD results to file.
        """
        comp_dim = Dimension('Principal Component', 'a. u.', len(self.__s))

        h5_svd_group = create_results_group(self.h5_main, self.process_name,
                                            h5_parent_group=self._h5_target_group)
        self.h5_results_grp = h5_svd_group
        self._write_source_dset_provenance()

        write_simple_attrs(h5_svd_group, self.parms_dict)
        write_simple_attrs(h5_svd_group, {'svd_method': 'sklearn-randomized'})

        h5_u = write_main_dataset(h5_svd_group, np.float32(self.__u), 'U',
                                  'Abundance', 'a.u.', None, comp_dim,
                                  h5_pos_inds=self.h5_main.h5_pos_inds,
                                  h5_pos_vals=self.h5_main.h5_pos_vals,
                                  dtype=np.float32,
                                  chunks=calc_chunks(self.__u.shape, np.float32(0).itemsize))

        h5_v = write_main_dataset(h5_svd_group, self.__v, 'V',
                                  get_attr(self.h5_main, 'quantity')[0],
                                  'a.u.', comp_dim, None,
                                  h5_spec_inds=self.h5_main.h5_spec_inds,
                                  h5_spec_vals=self.h5_main.h5_spec_vals,
                                  chunks=calc_chunks(self.__v.shape, self.h5_main.dtype.itemsize))

        # No point making this 1D dataset a main dataset
        h5_s = h5_svd_group.create_dataset('S', data=np.float32(self.__s))

        '''
        Check h5_main for plot group references.
        Copy them into V if they exist.
        '''
        for key in self.h5_main.attrs.keys():
            if '_Plot_Group' not in key:
                continue

            ref_inds = get_indices_for_region_ref(self.h5_main, self.h5_main.attrs[key],
                                                  return_method='corners')
            ref_inds = ref_inds.reshape([-1, 2, 2])
            ref_inds[:, 1, 0] = h5_v.shape[0] - 1

            svd_ref = create_region_reference(h5_v, ref_inds)

            h5_v.attrs[key] = svd_ref

        # Marking completion:
        self._status_dset_name = 'completed_positions'
        self._h5_status_dset = h5_svd_group.create_dataset(self._status_dset_name,
                                                           data=np.ones(self.h5_main.shape[0], dtype=np.uint8))
        # keeping legacy option:
        h5_svd_group.attrs['last_pixel'] = self.h5_main.shape[0]

    def _check_available_mem(self):
        """
        Check that there is enough memory to perform the SVD decomposition.

        :raises MemoryError: if there is not enough memory to load the dataset
            or to compute the SVD with the requested number of components
        """
        if self.verbose:
            print('Checking memory availability.')
        n_samples, n_features = self.h5_main.shape

        s_mem_per_comp = np.float32(0).itemsize
        u_mem_per_comp = np.float32(0).itemsize * n_samples
        v_mem_per_comp = self.h5_main.dtype.itemsize * n_features

        mem_per_comp = s_mem_per_comp + u_mem_per_comp + v_mem_per_comp
        max_mem = get_available_memory()
        avail_mem = 0.75 * max_mem
        free_mem = avail_mem - self.h5_main.__sizeof__()

        if free_mem <= 0:
            error_message = 'Cannot load main dataset into memory.\n' + \
                            'Available memory is {}. Dataset needs {}.'.format(avail_mem,
                                                                               self.h5_main.__sizeof__())
            raise MemoryError(error_message)

        if self.verbose:
            print('Memory available for SVD is {}.'.format(free_mem))
            print('Memory needed per component is {}.'.format(mem_per_comp))

        cant_svd = (free_mem - self.num_components * mem_per_comp) <= 0
        if cant_svd:
            max_comps = int(np.floor(free_mem / mem_per_comp))
            error_message = 'Not enough free memory to perform SVD with the requested number of components.\n' + \
                            'The maximum possible number of components is {}.'.format(max_comps)
            raise MemoryError(error_message)
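
# ----------------------------------------------------------------------------
# Rank-k reconstruction sketch (illustrative only, not part of the class):
# randomized_svd factors the flattened data matrix X (positions x spectral
# points) as X ~= U @ diag(S) @ V, so the first k components give a denoised,
# low-rank approximation of the source data. This helper is hypothetical, and
# the simple matrix product assumes a real-valued source dataset; complex or
# compound data is type-stacked inside the stored V and would need to be
# unstacked first.
# ----------------------------------------------------------------------------
def low_rank_approximation(h5_svd_grp, num_comps):
    """Rebuild an approximation of the flattened source matrix from stored
    SVD results."""
    u = h5_svd_grp['U'][()]   # (n_positions, n_components)
    s = h5_svd_grp['S'][()]   # (n_components,)
    v = h5_svd_grp['V'][()]   # (n_components, n_spectral_points)
    # Scale each retained column of U by its singular value, then project
    # back through the corresponding rows of V
    return (u[:, :num_comps] * s[:num_comps]) @ v[:num_comps, :]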
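
# ----------------------------------------------------------------------------
# Usage sketch (illustrative only): the file name and internal dataset path
# below are hypothetical placeholders -- point them at a real USID HDF5 file.
# Only methods defined on the SVD class above are used.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    import h5py

    with h5py.File('my_usid_file.h5', mode='r+') as h5_file:
        # Hypothetical location of a USID Main dataset within the file
        h5_main = USIDataset(h5_file['Measurement_000/Channel_000/Raw_Data'])

        svd_proc = SVD(h5_main, num_components=32)

        # Inspect the decomposition in memory before committing it to file
        u_mat, s_vec, v_mat = svd_proc.test()

        # Write U, S, and V back to the HDF5 file in USID-compliant form
        h5_svd_grp = svd_proc.compute()
        print('SVD results written to {}'.format(h5_svd_grp.name))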