def decomposition(self,
                  output_dimension,
                  normalize_poissonian_noise=False,
                  algorithm='PCA',
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  bounds=True,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data, keeping n
    significant components.

    Parameters
    ----------
    output_dimension : int
        The number of significant components to keep.
    normalize_poissonian_noise : bool
        If True, scale the SI to normalize Poissonian noise.
    algorithm : str
        One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA
        from scikit-learn is run.
    get : dask scheduler
        The dask scheduler to use for computations; default
        `dask.threaded.get`.
    num_chunks : int
        The number of dask chunks to pass to the decomposition model. More
        chunks require more memory, but should run faster. Will be increased
        to contain at least ``output_dimension`` signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool
        Reproject data on the learnt components (factors) after learning.
    bounds : {tuple, bool}
        The (min, max) values of the data to normalize before learning.
        If a tuple (min, max), those values are used for normalization.
        If True (default), the extremes are looked up (expensive).
        If False, no normalization is done (learning may be very slow).
        If ``normalize_poissonian_noise`` is True, this cannot be True.
    **kwargs
        Passed to the partial_fit/fit functions.

    Notes
    -----
    Various algorithm parameters and their default values:

    ONMF:
        lambda1=1, kappa=1, robust=False, store_r=False, batch_size=None
    ORPCA:
        fast=True, lambda1=None, lambda2=None, method=None,
        learning_rate=None, init=None, training_samples=None, momentum=None
    PCA:
        batch_size=None, copy=True, white=False
    """
    explained_variance = None
    explained_variance_ratio = None
    _al_data = self._data_aligned_with_axes
    nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
    sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])
    if blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)
    blocksize *= num_chunks

    # LEARN
    if algorithm == 'PCA':
        from sklearn.decomposition import IncrementalPCA
        obj = IncrementalPCA(n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        reproject = True
    elif algorithm == 'ORPCA':
        from hyperspy.learn.rpca import ORPCA
        kwg = {'fast': True}
        kwg.update(kwargs)
        obj = ORPCA(output_dimension, **kwg)
        method = partial(obj.fit, iterating=True)
    elif algorithm == 'ONMF':
        from hyperspy.learn.onmf import ONMF
        batch_size = kwargs.pop('batch_size', None)
        obj = ONMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)
    else:
        raise ValueError('algorithm not known')

    original_data = self.data
    try:
        if normalize_poissonian_noise:
            if bounds is True:
                # Min/max normalization is incompatible with Poissonian
                # noise normalization, so it is disabled here (a warning
                # could be emitted instead).
                bounds = False
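            # Descriptive note on the weighting below: aG holds the
            # per-pixel total counts (sum over the signal axes) and bH the
            # per-channel total counts (sum over the navigation axes). Each
            # datum is divided by sqrt(aG_i) * sqrt(bH_j), which makes the
            # Poisson noise approximately uniform across the unfolded data
            # before the linear decomposition; masked pixels and channels
            # keep a weight of 1.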
            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            nm = da.logical_not(
                da.zeros(self.axes_manager.navigation_shape[::-1],
                         chunks=nav_chunks)
                if navigation_mask is None
                else to_array(navigation_mask, chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(self.axes_manager.signal_shape[::-1],
                         chunks=sig_chunks)
                if signal_mask is None
                else to_array(signal_mask, chunks=sig_chunks))
            bH, aG = da.compute(
                data.sum(axis=range(ndim)),
                data.sum(axis=range(ndim, ndim + sdim)))
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)
            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)
            coeff = raG[(..., ) + (None, ) * rbH.ndim] * \
                rbH[(None, ) * raG.ndim + (..., )]
            # assign the result so NaNs are actually replaced
            coeff = coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # normalize the data for the learning algorithms:
        if bounds:
            if bounds is True:
                _min, _max = da.compute(self.data.min(), self.data.max())
            else:
                _min, _max = bounds
            self.data = (self.data - _min) / (_max - _min)

        # LEARN
        this_data = []
        try:
            for chunk in progressbar(
                    self._block_iterator(flat_signal=True,
                                         get=get,
                                         signal_mask=signal_mask,
                                         navigation_mask=navigation_mask),
                    total=nblocks,
                    leave=True,
                    desc='Learn'):
                this_data.append(chunk)
                if len(this_data) == num_chunks:
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
                    this_data = []
            if len(this_data):
                thedata = np.concatenate(this_data, axis=0)
                method(thedata)
        except KeyboardInterrupt:
            pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == 'PCA':
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T
        elif algorithm == 'ORPCA':
            _, _, U, S, V = obj.finish()
            factors = U * S
            loadings = V
            explained_variance = S**2 / len(factors)
        elif algorithm == 'ONMF':
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == 'PCA':
                method = obj.transform

                def post(a):
                    return np.concatenate(a, axis=0)
            elif algorithm == 'ORPCA':
                method = obj.project
                obj.R = []

                def post(a):
                    return obj.finish()[4]
            elif algorithm == 'ONMF':
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            _map = map(lambda thing: method(thing),
                       self._block_iterator(
                           flat_signal=True,
                           get=get,
                           signal_mask=signal_mask,
                           navigation_mask=navigation_mask))
            H = []
            try:
                for thing in progressbar(_map, total=nblocks,
                                         desc='Project'):
                    H.append(thing)
            except KeyboardInterrupt:
                pass
            loadings = post(H)

        if explained_variance is not None and \
                explained_variance_ratio is None:
            explained_variance_ratio = \
                explained_variance / explained_variance.sum()

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        try:
            loadings = _reshuffle_mixed_blocks(
                loadings,
                ndim,
                (output_dimension,),
                nav_chunks).reshape((-1, output_dimension))
        except ValueError:
            # If the projection step was interrupted, the loadings are left
            # in their scrambled (block) order.
            pass
    finally:
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio
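# Usage sketch for the method above (doctest style; not executed here).
# The file name is a placeholder and scikit-learn is assumed to be installed
# for the 'PCA' algorithm; everything else is defined by the method itself.
#
# >>> import hyperspy.api as hs
# >>> s = hs.load('spectrum_image.hspy', lazy=True)
# >>> s.decomposition(output_dimension=5, algorithm='PCA',
# ...                 normalize_poissonian_noise=True)
# >>> s.learning_results.factors.shape    # (signal_size, 5)
# >>> s.learning_results.loadings.shape   # (navigation_size, 5)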
def decomposition(self,
                  normalize_poissonian_noise=False,
                  algorithm='svd',
                  output_dimension=None,
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  bounds=False,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data, keeping n
    significant components.

    Parameters
    ----------
    normalize_poissonian_noise : bool
        If True, scale the SI to normalize Poissonian noise.
    algorithm : str
        One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default ('svd') the lazy
        SVD decomposition from dask is run.
    output_dimension : int
        The number of significant components to keep. If None, keep all
        (only valid for 'svd').
    get : dask scheduler
        The dask scheduler to use for computations; default
        `dask.threaded.get`.
    num_chunks : int
        The number of dask chunks to pass to the decomposition model. More
        chunks require more memory, but should run faster. Will be increased
        to contain at least ``output_dimension`` signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool
        Reproject data on the learnt components (factors) after learning.
    **kwargs
        Passed to the partial_fit/fit functions.

    Notes
    -----
    Various algorithm parameters and their default values:

    ONMF:
        lambda1=1, kappa=1, robust=False, store_r=False, batch_size=None
    ORPCA:
        fast=True, lambda1=None, lambda2=None, method=None,
        learning_rate=None, init=None, training_samples=None, momentum=None
    PCA:
        batch_size=None, copy=True, white=False
    """
    if bounds:
        msg = ("The `bounds` keyword is deprecated and will be removed "
               "in v2.0. Since version > 1.3 this has no effect.")
        warnings.warn(msg, VisibleDeprecationWarning)
    explained_variance = None
    explained_variance_ratio = None
    _al_data = self._data_aligned_with_axes
    nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
    sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])
    if algorithm != "svd" and output_dimension is None:
        raise ValueError("With the %s algorithm, output_dimension "
                         "must be specified" % algorithm)
    if output_dimension and blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)
    blocksize *= num_chunks

    # LEARN
    if algorithm == 'PCA':
        from sklearn.decomposition import IncrementalPCA
        obj = IncrementalPCA(n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        reproject = True
    elif algorithm == 'ORPCA':
        from hyperspy.learn.rpca import ORPCA
        kwg = {'fast': True}
        kwg.update(kwargs)
        obj = ORPCA(output_dimension, **kwg)
        method = partial(obj.fit, iterating=True)
    elif algorithm == 'ONMF':
        from hyperspy.learn.onmf import ONMF
        batch_size = kwargs.pop('batch_size', None)
        obj = ONMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)
    elif algorithm != "svd":
        raise ValueError('algorithm not known')

    original_data = self.data
    try:
        if normalize_poissonian_noise:
            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            nm = da.logical_not(
                da.zeros(self.axes_manager.navigation_shape[::-1],
                         chunks=nav_chunks)
                if navigation_mask is None
                else to_array(navigation_mask, chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(self.axes_manager.signal_shape[::-1],
                         chunks=sig_chunks)
                if signal_mask is None
                else to_array(signal_mask, chunks=sig_chunks))
            bH, aG = da.compute(
                data.sum(axis=range(ndim)),
                data.sum(axis=range(ndim, ndim + sdim)))
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)
            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)
            coeff = raG[(..., ) + (None, ) * rbH.ndim] * \
                rbH[(None, ) * raG.ndim + (..., )]
            # assign the result so NaNs are actually replaced
            coeff = coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # LEARN
        if algorithm == "svd":
            reproject = False
            from dask.array.linalg import svd
            try:
                self._unfolded4decomposition = self.unfold()
                # TODO: implement masking
                if navigation_mask or signal_mask:
                    raise NotImplementedError(
                        "Masking is not yet implemented for lazy SVD.")
                U, S, V = svd(self.data)
                factors = V.T
                explained_variance = S ** 2 / self.data.shape[0]
                loadings = U * S
            finally:
                if self._unfolded4decomposition is True:
                    self.fold()
                    self._unfolded4decomposition = False
        else:
            this_data = []
            try:
                for chunk in progressbar(
                        self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                        total=nblocks,
                        leave=True,
                        desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == 'PCA':
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T
        elif algorithm == 'ORPCA':
            _, _, U, S, V = obj.finish()
            factors = U * S
            loadings = V
            explained_variance = S**2 / len(factors)
        elif algorithm == 'ONMF':
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == 'PCA':
                method = obj.transform

                def post(a):
                    return np.concatenate(a, axis=0)
            elif algorithm == 'ORPCA':
                method = obj.project
                obj.R = []

                def post(a):
                    return obj.finish()[4]
            elif algorithm == 'ONMF':
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            _map = map(lambda thing: method(thing),
                       self._block_iterator(
                           flat_signal=True,
                           get=get,
                           signal_mask=signal_mask,
                           navigation_mask=navigation_mask))
            H = []
            try:
                for thing in progressbar(_map, total=nblocks,
                                         desc='Project'):
                    H.append(thing)
            except KeyboardInterrupt:
                pass
            loadings = post(H)

        if explained_variance is not None and \
                explained_variance_ratio is None:
            explained_variance_ratio = \
                explained_variance / explained_variance.sum()

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        if algorithm != "svd":  # Only needed for the online algorithms
            try:
                loadings = _reshuffle_mixed_blocks(
                    loadings,
                    ndim,
                    (output_dimension,),
                    nav_chunks).reshape((-1, output_dimension))
            except ValueError:
                # If the projection step was interrupted, the loadings are
                # left in their scrambled (block) order.
                pass
    finally:
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    if algorithm != "svd":
        target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio
    # Rescale the results if the noise was normalized
    if normalize_poissonian_noise is True:
        target.factors = target.factors * rbH.ravel()[:, np.newaxis]
        target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
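# Usage sketch for the updated signature (doctest style; not executed here).
# The default 'svd' path keeps all components and needs no output_dimension;
# the file name below is a placeholder, everything else is what this method
# and `self.learning_results` provide.
#
# >>> import hyperspy.api as hs
# >>> s = hs.load('spectrum_image.hspy', lazy=True)
# >>> s.decomposition()                  # lazy dask SVD, all components
# >>> s.decomposition(algorithm='ONMF', output_dimension=8)
# >>> res = s.learning_results
# >>> res.factors.shape, res.loadings.shape
#
# Note that with normalize_poissonian_noise=True the stored factors and
# loadings are multiplied back by sqrt(bH) and sqrt(aG) in the final lines
# above, undoing the per-channel and per-pixel noise weighting.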