Example #1
    def __init__(self, h5_main, num_components=None):

        super(SVD, self).__init__(h5_main)
        self.process_name = 'SVD'
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)
        self.num_components = num_components
        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
Example #2
    def test_compound_numpy(self):
        with h5py.File(file_path, mode='r') as h5_f:
            func, is_complex, is_compound, n_features, type_mult = dtype_utils.check_dtype(h5_f['compound'])
            self.assertEqual(func, dtype_utils.flatten_compound_to_real)
            self.assertEqual(is_complex, False)
            self.assertEqual(is_compound, True)
            self.assertEqual(n_features, 3 * h5_f['compound'].shape[1])
            self.assertEqual(type_mult, 3 * np.float32(0).itemsize)
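
The test above relies on a pre-built HDF5 fixture at `file_path`. Below is a minimal, self-contained sketch of the same check; the temporary file, the three-field compound dtype, and the `pyUSID.io.dtype_utils` import path are illustrative assumptions rather than part of the original test.

import os
import tempfile

import h5py
import numpy as np
from pyUSID.io import dtype_utils  # import path assumed; it may differ across pyUSID versions

# Build a tiny compound (structured) dataset so check_dtype has something to inspect
struct_dtype = np.dtype({'names': ['r', 'g', 'b'],
                         'formats': [np.float32, np.float32, np.float32]})
data = np.zeros(shape=(2, 5), dtype=struct_dtype)

tmp_path = os.path.join(tempfile.mkdtemp(), 'check_dtype_demo.h5')
with h5py.File(tmp_path, mode='w') as h5_f:
    h5_dset = h5_f.create_dataset('compound', data=data)
    func, is_complex, is_compound, n_features, type_mult = dtype_utils.check_dtype(h5_dset)
    # Expected: the compound dataset is flattened field-by-field into real-valued features
    assert func == dtype_utils.flatten_compound_to_real
    assert (is_complex, is_compound) == (False, True)
    assert n_features == 3 * h5_dset.shape[1]
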
Example #3
    def __init__(self, h5_main, estimator, **kwargs):
        """
        Constructs the Decomposition object. Call the :meth:`~pycroscopy.processing.Decomposition.test()` and
        :meth:`~pycroscopy.processing.Decomposition.compute()` methods to run the decomposition
        
        Parameters
        ------------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset with embedded ancillary spectroscopic, position indices and values datasets
        estimator : :mod:`sklearn.decomposition` object
            configured decomposition object to apply to the data
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
        """
        
        allowed_methods = [dec.factor_analysis.FactorAnalysis,
                           dec.fastica_.FastICA,
                           dec.incremental_pca.IncrementalPCA,
                           dec.sparse_pca.MiniBatchSparsePCA,
                           dec.nmf.NMF,
                           dec.pca.PCA,
                           dec.sparse_pca.SparsePCA,
                           dec.truncated_svd.TruncatedSVD]
        
        # Store the decomposition object
        self.estimator = estimator
        
        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise NotImplementedError('Cannot work with {} yet'.format(self.method_name))
            
        # Done with decomposition-related checks, now call super init
        super(Decomposition, self).__init__(h5_main, 'Decomposition', **kwargs)
        
        # set up parameters
        self.parms_dict = {'decomposition_algorithm':self.method_name}
        self.parms_dict.update(self.estimator.get_params())
        
        # check for existing datagroups with same results
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.h5_partial_groups = self._check_for_duplicates()

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)
        
        self.__components = None
        self.__projection = None
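
A hedged usage sketch for this constructor: it assumes an existing USID-formatted HDF5 file (the file name and dataset path below are placeholders) and that, per the docstring, `test()` previews the results in memory while `compute()` writes them back to the file and returns the results group.

import h5py
from sklearn.decomposition import PCA
from pycroscopy.processing import Decomposition

# Placeholders for a real USID HDF5 file and its main dataset
h5_file = h5py.File('my_data.h5', mode='r+')
h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']

estimator = PCA(n_components=16)        # any estimator from the allowed_methods list above
decomposer = Decomposition(h5_main, estimator)
_ = decomposer.test()                   # preview the components / projection in memory
h5_decomp_group = decomposer.compute()  # write the results back into the HDF5 file
h5_file.close()
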
Example #4
    def test_real_numpy(self):
        # real_matrix = np.random.rand(5, 7)
        # func, is_complex, is_compound, n_features, type_mult = dtype_utils.check_dtype(real_matrix)
        with h5py.File(file_path, mode='r') as h5_f:
            func, is_complex, is_compound, n_features, type_mult = dtype_utils.check_dtype(h5_f['real'])
            self.assertEqual(func, h5_f['real'].dtype.type)
            self.assertEqual(is_complex, False)
            self.assertEqual(is_compound, False)
            self.assertEqual(n_features, h5_f['real'].shape[1])
            self.assertEqual(type_mult, h5_f['real'].dtype.type(0).itemsize)
Example #5
    def test_check_dtype_complex_numpy(self):
        with h5py.File(file_path, mode='r') as h5_f:
            func, is_complex, is_compound, n_features, type_mult = dtype_utils.check_dtype(h5_f['complex'])
            self.assertEqual(func, dtype_utils.flatten_complex_to_real)
            self.assertEqual(is_complex, True)
            self.assertEqual(is_compound, False)
            self.assertEqual(n_features, 2 * h5_f['complex'].shape[1])
            self.assertEqual(type_mult, 2 * np.real(h5_f['complex'][0, 0]).dtype.itemsize)
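
The doubling of `n_features` asserted above comes from the complex-to-real flattening that `check_dtype` selects. A small sketch of that behavior follows; the import path is assumed, and the exact column ordering of the flattened output is an implementation detail of pyUSID that this test does not pin down.

import numpy as np
from pyUSID.io import dtype_utils  # import path assumed

complex_block = np.random.rand(2, 3) + 1j * np.random.rand(2, 3)
flattened = dtype_utils.flatten_complex_to_real(complex_block)

print(flattened.shape)       # expected: (2, 6) - each complex column becomes two real columns
print(flattened.dtype.kind)  # expected: 'f' - purely real-valued output
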
Example #6
    def __init__(self, h5_main, estimator):
        """
        Uses the provided (preconfigured) sklearn decomposition estimator to
        decompose the provided dataset

        Parameters
        ------------
        h5_main : HDF5 dataset object
            Main dataset with ancillary spectroscopic, position indices and values datasets
        estimator : sklearn.decomposition object
            configured decomposition object to apply to the data
        """
        
        allowed_methods = [dec.factor_analysis.FactorAnalysis,
                           dec.fastica_.FastICA,
                           dec.incremental_pca.IncrementalPCA,
                           dec.sparse_pca.MiniBatchSparsePCA,
                           dec.nmf.NMF,
                           dec.pca.PCA,
                           dec.sparse_pca.SparsePCA,
                           dec.truncated_svd.TruncatedSVD]
        
        # Store the decomposition object
        self.estimator = estimator
        
        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise NotImplementedError('Cannot work with {} yet'.format(self.method_name))
            
        # Done with decomposition-related checks, now call super init
        super(Decomposition, self).__init__(h5_main)
        
        # set up parameters
        self.parms_dict = {'decomposition_algorithm':self.method_name}
        self.parms_dict.update(self.estimator.get_params())
        
        # check for existing datagroups with same results 
        self.process_name = 'Decomposition'
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.h5_partial_groups = self._check_for_duplicates()

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)
        
        self.__components = None
        self.__projection = None
Example #7
    def __init__(self, h5_main, estimator):
        """
        Constructs the Decomposition object. Call the :meth:`~pycroscopy.processing.Decomposition.test()` and
        :meth:`~pycroscopy.processing.Decomposition.compute()` methods to run the decomposition
        
        Parameters
        ------------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset with embedded ancillary spectroscopic, position indices and values datasets
        estimator : :mod:`sklearn.decomposition` object
            configured decomposition object to apply to the data
        """
        
        allowed_methods = [dec.factor_analysis.FactorAnalysis,
                           dec.fastica_.FastICA,
                           dec.incremental_pca.IncrementalPCA,
                           dec.sparse_pca.MiniBatchSparsePCA,
                           dec.nmf.NMF,
                           dec.pca.PCA,
                           dec.sparse_pca.SparsePCA,
                           dec.truncated_svd.TruncatedSVD]
        
        # Store the decomposition object
        self.estimator = estimator
        
        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise NotImplementedError('Cannot work with {} yet'.format(self.method_name))
            
        # Done with decomposition-related checks, now call super init
        super(Decomposition, self).__init__(h5_main)
        
        # set up parameters
        self.parms_dict = {'decomposition_algorithm':self.method_name}
        self.parms_dict.update(self.estimator.get_params())
        
        # check for existing datagroups with same results 
        self.process_name = 'Decomposition'
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.h5_partial_groups = self._check_for_duplicates()

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)
        
        self.__components = None
        self.__projection = None
Example #8
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to h5 file.

        Parameters
        ----------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset that will be decomposed
        num_components : int, optional
            Number of components to decompose h5_main into.  Default None.
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
        kwargs
            Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, 'SVD', **kwargs)
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
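
A hedged usage sketch for this constructor, assuming an existing USID HDF5 file (file name and dataset path are placeholders) and that `compute()` returns the results group holding the `U`, `S`, and `V` datasets referenced elsewhere on this page.

import h5py
from pycroscopy.processing.svd_utils import SVD  # import path assumed

# Placeholders for a real USID HDF5 file and its main dataset
with h5py.File('my_data.h5', mode='r+') as h5_file:
    h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']

    svd_proc = SVD(h5_main, num_components=64)
    h5_svd_group = svd_proc.compute()
    print(h5_svd_group['U'], h5_svd_group['S'], h5_svd_group['V'])
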
Example #9
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to h5 file.

        Parameters
        ----------
        h5_main : USIDataset
            Dataset to be decomposed.
        num_components : int, optional
            Number of components to decompose h5_main into.  Default None.
        kwargs
            Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, **kwargs)
        self.process_name = 'SVD'
        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(
            h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
Example #10
    def __init__(self, h5_main, num_components=None, **kwargs):
        """
        Perform the SVD decomposition on the selected dataset and write the results to h5 file.

        Parameters
        ----------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset that will be decomposed
        num_components : int, optional
            Number of components to decompose h5_main into.  Default None.
        kwargs
            Arguments to be sent to Process
        """
        super(SVD, self).__init__(h5_main, **kwargs)
        self.process_name = 'SVD'

        '''
        Calculate the size of the main data in memory and compare to max_mem
        We use the minimum of the actual dtype's itemsize and float32 since we
        don't want to read it in yet and do the proper type conversions.
        '''
        n_samples, n_features = h5_main.shape
        self.data_transform_func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_main)

        if num_components is None:
            num_components = min(n_samples, n_features)
        else:
            num_components = min(n_samples, n_features, num_components)

        self.num_components = num_components

        # Check that we can actually compute the SVD with the selected number of components
        self._check_available_mem()

        self.parms_dict = {'num_components': num_components}
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__u = None
        self.__v = None
        self.__s = None
Example #11
def rebuild_svd(h5_main, components=None, cores=None, max_RAM_mb=1024):
    """
    Rebuild the Image from the SVD results on the windows
    Optionally, use only the subset of components specified by `components`.

    Parameters
    ----------
    h5_main : hdf5 Dataset
        dataset which SVD was performed on
    components : {int, iterable of int, slice} optional
        Defines which components to keep
        Default - None, all components kept

        Input Types
        integer : Components less than the input will be kept
        length 2 iterable of integers : Integers define start and stop of component slice to retain
        other iterable of integers or slice : Selection of component indices to retain
    cores : int, optional
        How many cores should be used to rebuild
        Default - None, all but 2 cores will be used, min 1
    max_RAM_mb : int, optional
        Maximum amount of memory to use when rebuilding, in MB.
        Default - 1024 MB

    Returns
    -------
    rebuilt_data : HDF5 Dataset
        the rebuilt dataset

    """
    comp_slice, num_comps = get_component_slice(
        components, total_components=h5_main.shape[1])
    if isinstance(comp_slice, np.ndarray):
        comp_slice = list(comp_slice)
    dset_name = h5_main.name.split('/')[-1]

    # Ensure at least one core is used for the rebuild while leaving 2 cores free for other work
    max_cores = max(1, cpu_count() - 2)
    #         print('max_cores',max_cores)
    if cores is not None:
        cores = min(round(abs(cores)), max_cores)
    else:
        cores = max_cores

    max_memory = min(max_RAM_mb * 1024**2, 0.75 * get_available_memory())
    if cores != 1:
        max_memory = int(max_memory / 2)
    '''
    Get the handles for the SVD results
    '''
    try:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]

        h5_S = h5_svd_group['S']
        h5_U = h5_svd_group['U']
        h5_V = h5_svd_group['V']

    except KeyError:
        raise KeyError(
            'SVD Results for {dset} were not found.'.format(dset=dset_name))
    except:
        raise

    func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_V)
    '''
    Calculate the size of a single batch that will fit in the available memory
    '''
    n_comps = h5_S[comp_slice].size
    mem_per_pix = (h5_U.dtype.itemsize +
                   h5_V.dtype.itemsize * h5_V.shape[1]) * n_comps
    fixed_mem = h5_main.size * h5_main.dtype.itemsize

    if cores is None:
        free_mem = max_memory - fixed_mem
    else:
        free_mem = max_memory * 2 - fixed_mem

    batch_size = int(round(float(free_mem) / mem_per_pix))
    batch_slices = gen_batches(h5_U.shape[0], batch_size)

    print('Reconstructing in batches of {} positions.'.format(batch_size))
    print('Batches should be {} MB each.'.format(mem_per_pix * batch_size / 1024.0 ** 2))
    '''
    Loop over all batches.
    '''
    ds_V = np.dot(np.diag(h5_S[comp_slice]), func(h5_V[comp_slice, :]))
    rebuild = np.zeros((h5_main.shape[0], ds_V.shape[1]))
    for ibatch, batch in enumerate(batch_slices):
        rebuild[batch, :] += np.dot(h5_U[batch, comp_slice], ds_V)

    rebuild = stack_real_to_target_dtype(rebuild, h5_V.dtype)

    print(
        'Completed reconstruction of data from SVD results.  Writing to file.')
    '''
    Create the Group and dataset to hold the rebuild data
    '''
    rebuilt_grp = create_indexed_group(h5_svd_group, 'Rebuilt_Data')
    h5_rebuilt = write_main_dataset(rebuilt_grp,
                                    rebuild,
                                    'Rebuilt_Data',
                                    get_attr(h5_main, 'quantity'),
                                    get_attr(h5_main, 'units'),
                                    None,
                                    None,
                                    h5_pos_inds=h5_main.h5_pos_inds,
                                    h5_pos_vals=h5_main.h5_pos_vals,
                                    h5_spec_inds=h5_main.h5_spec_inds,
                                    h5_spec_vals=h5_main.h5_spec_vals,
                                    chunks=h5_main.chunks,
                                    compression=h5_main.compression)

    if isinstance(comp_slice, slice):
        rebuilt_grp.attrs['components_used'] = '{}-{}'.format(
            comp_slice.start, comp_slice.stop)
    else:
        rebuilt_grp.attrs['components_used'] = components

    copy_attributes(h5_main, h5_rebuilt, skip_refs=False)

    h5_main.file.flush()

    print('Done writing reconstructed data to file.')

    return h5_rebuilt
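
A hedged usage sketch for `rebuild_svd`, assuming the SVD results already exist for the main dataset. The file name and dataset path are placeholders, the import path is an assumption, and the dataset is wrapped in `USIDataset` because the function reads attributes such as `h5_pos_inds` from `h5_main`.

import h5py
from pyUSID import USIDataset
from pycroscopy.processing.svd_utils import rebuild_svd  # import path assumed

# Placeholders for a real USID HDF5 file whose main dataset already has SVD results
with h5py.File('my_data.h5', mode='r+') as h5_file:
    h5_main = USIDataset(h5_file['Measurement_000/Channel_000/Raw_Data'])

    # keep only the first 25 components and cap memory use at roughly 512 MB
    h5_rebuilt = rebuild_svd(h5_main, components=25, cores=1, max_RAM_mb=512)
    print(h5_rebuilt)
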
Example #12
    def __init__(self, h5_main, estimator, num_comps=None, **kwargs):
        """
        Constructs the Cluster object. Call the :meth:`~pycroscopy.processing.Cluster.test()` and
        :meth:`~pycroscopy.processing.Cluster.compute()` methods to run the clustering

        Parameters
        ----------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset
        estimator : :class:`sklearn.cluster` estimator
            configured clustering algorithm to be applied to the data
        num_comps : int (unsigned), optional. Default = None / all
            Number of features / spectroscopic indices to be used to cluster the data
        h5_target_group : h5py.Group, optional. Default = None
            Location where to look for existing results and to place newly
            computed results. Use this kwarg if the results need to be written
            to a different HDF5 file. By default, this value is set to the
            parent group containing `h5_main`
        """

        allowed_methods = [
            cls.AgglomerativeClustering, cls.Birch, cls.KMeans,
            cls.MiniBatchKMeans, cls.SpectralClustering
        ]

        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise TypeError('Cannot work with {} just yet'.format(
                self.method_name))

        # Done with decomposition-related checks, now call super init
        super(Cluster, self).__init__(h5_main, 'Cluster', **kwargs)

        # Store the decomposition object
        self.estimator = estimator

        if num_comps is None:
            comp_attr = 'all'

        comp_slice, num_comps = get_component_slice(
            num_comps, total_components=self.h5_main.shape[1])

        self.num_comps = num_comps
        self.data_slice = (slice(None), comp_slice)

        if isinstance(comp_slice, slice):
            # cannot store slice as an attribute in hdf5
            # convert to list of integers!
            inds = comp_slice.indices(self.h5_main.shape[1])
            # much like range, inds are arranged as (start, stop, step)
            if inds[0] == 0 and inds[2] == 1:
                # starting from 0 with step of 1 = up to N components
                if inds[1] >= self.h5_main.shape[1] - 1:
                    comp_attr = 'all'
                else:
                    comp_attr = inds[1]
            else:
                comp_attr = range(*inds)
        elif comp_attr == 'all':
            pass
        else:
            # subset of spectral components specified as an array
            comp_attr = comp_slice

        # set up parameters
        self.parms_dict = {
            'cluster_algorithm': self.method_name,
            'spectral_components': comp_attr
        }
        self.parms_dict.update(self.estimator.get_params())

        # update n_jobs according to the cores argument
        # print('cores reset to', self._cores)
        # different number of cores should not* be a reason for different results
        # so we update this flag only after checking for duplicates
        estimator.n_jobs = self._cores
        self.parms_dict.update({'n_jobs': self._cores})

        # check for existing datagroups with same results
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__labels = None
        self.__mean_resp = None
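
A hedged usage sketch for this constructor, under the same assumptions as the earlier sketches: placeholder file and dataset paths, with `test()` previewing results in memory and `compute()` writing them to the file, as the class docstring describes.

import h5py
from sklearn.cluster import KMeans
from pycroscopy.processing import Cluster

# Placeholders for a real USID HDF5 file and its main dataset
with h5py.File('my_data.h5', mode='r+') as h5_file:
    h5_main = h5_file['Measurement_000/Channel_000/Raw_Data']

    clusterer = Cluster(h5_main, KMeans(n_clusters=4), num_comps=32)
    _ = clusterer.test()                    # preview labels / mean responses in memory
    h5_cluster_group = clusterer.compute()  # write the clustering results to the file
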
Example #13
    def test_non_hdf(self):
        with self.assertRaises(TypeError):
            _ = dtype_utils.check_dtype(np.arange(15))
Example #14
    def __init__(self, h5_main, estimator, num_comps=None):
        """
        Constructs the Cluster object. Call the :meth:`~pycroscopy.processing.Cluster.test()` and
        :meth:`~pycroscopy.processing.Cluster.compute()` methods to run the clustering

        Parameters
        ----------
        h5_main : :class:`pyUSID.USIDataset` object
            USID Main HDF5 dataset
        estimator : :class:`sklearn.cluster` estimator
            configured clustering algorithm to be applied to the data
        num_comps : int (unsigned), optional. Default = None / all
            Number of features / spectroscopic indices to be used to cluster the data
        """

        allowed_methods = [cls.AgglomerativeClustering,
                           cls.Birch,
                           cls.KMeans,
                           cls.MiniBatchKMeans,
                           cls.SpectralClustering]

        # could not find a nicer way to extract the method name yet
        self.method_name = str(estimator)[:str(estimator).index('(')]

        if type(estimator) not in allowed_methods:
            raise TypeError('Cannot work with {} just yet'.format(self.method_name))

        # Done with decomposition-related checks, now call super init
        super(Cluster, self).__init__(h5_main)

        # Store the decomposition object
        self.estimator = estimator

        if num_comps is None:
            comp_attr = 'all'

        comp_slice, num_comps = get_component_slice(num_comps, total_components=self.h5_main.shape[1])

        self.num_comps = num_comps
        self.data_slice = (slice(None), comp_slice)

        if isinstance(comp_slice, slice):
            # cannot store slice as an attribute in hdf5
            # convert to list of integers!
            inds = comp_slice.indices(self.h5_main.shape[1])
            # much like range, inds are arranged as (start, stop, step)
            if inds[0] == 0 and inds[2] == 1:
                # starting from 0 with step of 1 = up to N components
                if inds[1] >= self.h5_main.shape[1] - 1:
                    comp_attr = 'all'
                else:
                    comp_attr = inds[1]
            else:
                comp_attr = range(*inds)
        elif comp_attr == 'all':
            pass
        else:
            # subset of spectral components specified as an array
            comp_attr = comp_slice

        # set up parameters
        self.parms_dict = {'cluster_algorithm': self.method_name,
                           'spectral_components': comp_attr}
        self.parms_dict.update(self.estimator.get_params())

        # update n_jobs according to the cores argument
        # print('cores reset to', self._cores)
        # different number of cores should not* be a reason for different results
        # so we update this flag only after checking for duplicates
        estimator.n_jobs = self._cores
        self.parms_dict.update({'n_jobs': self._cores})

        # check for existing datagroups with same results
        self.process_name = 'Cluster'
        # Partial groups don't make any sense for statistical learning algorithms....
        self.duplicate_h5_groups, self.partial_h5_groups = self._check_for_duplicates()

        # figure out the operation that needs to be performed to convert to real scalar
        (self.data_transform_func, self.data_is_complex, self.data_is_compound,
         self.data_n_features, self.data_type_mult) = check_dtype(h5_main)

        # supercharge h5_main!
        self.h5_main = USIDataset(self.h5_main)

        self.__labels = None
        self.__mean_resp = None
Example #15
def rebuild_svd(h5_main, components=None, cores=None, max_RAM_mb=1024):
    """
    Rebuild the Image from the SVD results on the windows
    Optionally, use only the subset of components specified by `components`.

    Parameters
    ----------
    h5_main : hdf5 Dataset
        dataset which SVD was performed on
    components : {int, iterable of int, slice} optional
        Defines which components to keep
        Default - None, all components kept

        Input Types
        integer : Components less than the input will be kept
        length 2 iterable of integers : Integers define start and stop of component slice to retain
        other iterable of integers or slice : Selection of component indices to retain
    cores : int, optional
        How many cores should be used to rebuild
        Default - None, all but 2 cores will be used, min 1
    max_RAM_mb : int, optional
        Maximum amount of memory to use when rebuilding, in MB.
        Default - 1024 MB

    Returns
    -------
    rebuilt_data : HDF5 Dataset
        the rebuilt dataset

    """
    comp_slice, num_comps = get_component_slice(components, total_components=h5_main.shape[1])
    if isinstance(comp_slice, np.ndarray):
        comp_slice = list(comp_slice)
    dset_name = h5_main.name.split('/')[-1]

    # Ensure at least one core is used for the rebuild while leaving 2 cores free for other work
    max_cores = max(1, cpu_count() - 2)
    #         print('max_cores',max_cores)
    if cores is not None:
        cores = min(round(abs(cores)), max_cores)
    else:
        cores = max_cores

    max_memory = min(max_RAM_mb * 1024 ** 2, 0.75 * get_available_memory())
    if cores != 1:
        max_memory = int(max_memory / 2)

    '''
    Get the handles for the SVD results
    '''
    try:
        h5_svd_group = find_results_groups(h5_main, 'SVD')[-1]

        h5_S = h5_svd_group['S']
        h5_U = h5_svd_group['U']
        h5_V = h5_svd_group['V']

    except KeyError:
        raise KeyError('SVD Results for {dset} were not found.'.format(dset=dset_name))
    except:
        raise

    func, is_complex, is_compound, n_features, type_mult = check_dtype(h5_V)

    '''
    Calculate the size of a single batch that will fit in the available memory
    '''
    n_comps = h5_S[comp_slice].size
    mem_per_pix = (h5_U.dtype.itemsize + h5_V.dtype.itemsize * h5_V.shape[1]) * n_comps
    fixed_mem = h5_main.size * h5_main.dtype.itemsize

    if cores is None:
        free_mem = max_memory - fixed_mem
    else:
        free_mem = max_memory * 2 - fixed_mem

    batch_size = int(round(float(free_mem) / mem_per_pix))
    batch_slices = gen_batches(h5_U.shape[0], batch_size)

    print('Reconstructing in batches of {} positions.'.format(batch_size))
    print('Batches should be {} MB each.'.format(mem_per_pix * batch_size / 1024.0 ** 2))

    '''
    Loop over all batches.
    '''
    ds_V = np.dot(np.diag(h5_S[comp_slice]), func(h5_V[comp_slice, :]))
    rebuild = np.zeros((h5_main.shape[0], ds_V.shape[1]))
    for ibatch, batch in enumerate(batch_slices):
        rebuild[batch, :] += np.dot(h5_U[batch, comp_slice], ds_V)

    rebuild = stack_real_to_target_dtype(rebuild, h5_V.dtype)

    print('Completed reconstruction of data from SVD results.  Writing to file.')
    '''
    Create the Group and dataset to hold the rebuild data
    '''
    rebuilt_grp = create_indexed_group(h5_svd_group, 'Rebuilt_Data')
    h5_rebuilt = write_main_dataset(rebuilt_grp, rebuild, 'Rebuilt_Data',
                                    get_attr(h5_main, 'quantity'), get_attr(h5_main, 'units'),
                                    None, None,
                                    h5_pos_inds=h5_main.h5_pos_inds, h5_pos_vals=h5_main.h5_pos_vals,
                                    h5_spec_inds=h5_main.h5_spec_inds, h5_spec_vals=h5_main.h5_spec_vals,
                                    chunks=h5_main.chunks, compression=h5_main.compression)

    if isinstance(comp_slice, slice):
        rebuilt_grp.attrs['components_used'] = '{}-{}'.format(comp_slice.start, comp_slice.stop)
    else:
        rebuilt_grp.attrs['components_used'] = components

    copy_attributes(h5_main, h5_rebuilt, skip_refs=False)

    h5_main.file.flush()

    print('Done writing reconstructed data to file.')

    return h5_rebuilt