def test_linalg_consistent_names():
    """Check that ``qr`` and ``svd`` build the same graph keys on repeated calls.

    Each decomposition is applied twice to the same named dask array and
    every pair of corresponding outputs is compared with ``same_keys``.
    """
    n_rows, n_cols = 20, 10
    values = np.random.rand(n_rows, n_cols)
    data = da.from_array(values, chunks=(10, n_cols), name='A')

    # QR: both factors must carry identical keys across the two calls.
    qr_first = qr(data)
    qr_second = qr(data)
    for left, right in zip(qr_first, qr_second):
        assert same_keys(left, right)

    # SVD: same requirement for all three factors.
    svd_first = svd(data)
    svd_second = svd(data)
    for left, right in zip(svd_first, svd_second):
        assert same_keys(left, right)
def test_linalg_consistent_names():
    """Repeated ``qr``/``svd`` calls on one array must yield matching keys."""
    shape = (20, 10)
    data = da.from_array(
        np.random.rand(*shape),
        chunks=(10, shape[1]),
        name="A",
    )

    # Run each decomposition twice, then compare the outputs position by
    # position (q with q, r with r, ...).
    first_qr, second_qr = qr(data), qr(data)
    assert all(same_keys(a, b) for a, b in zip(first_qr, second_qr))

    first_svd, second_svd = svd(data), svd(data)
    assert all(same_keys(a, b) for a, b in zip(first_svd, second_svd))
def test_svd_dtype_preservation(chunks, dtype):
    """All three SVD factors must keep the dtype of the input array.

    ``chunks`` and ``dtype`` are supplied by the caller (presumably a
    parametrised fixture defined outside this block).
    """
    source = da.random.random((50, 50), chunks=chunks).astype(dtype)
    u, s, v = svd(source)
    for factor in (u, s, v):
        assert factor.dtype == dtype
def decomposition(self,
                  normalize_poissonian_noise=False,
                  algorithm="svd",
                  output_dimension=None,
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  print_info=True,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data.

    The results are stored in ``self.learning_results``.

    Read more in the :ref:`User Guide <big_data.decomposition>`.

    Parameters
    ----------
    normalize_poissonian_noise : bool, default False
        If True, scale the signal to normalize Poissonian noise using
        the approach described in [KeenanKotula2004]_.
    algorithm : {'svd', 'pca', 'orpca', 'ornmf'}, default 'svd'
        The decomposition algorithm to use.
    output_dimension : int or None, default None
        Number of components to keep/calculate. If None, keep all
        (only valid for 'svd' algorithm)
    get : dask scheduler
        the dask scheduler to use for computations;
        default `dask.threaded.get`
    num_chunks : int or None, default None
        the number of dask chunks to pass to the decomposition model.
        More chunks require more memory, but should run faster. Will be
        increased to contain at least ``output_dimension`` signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool, default True
        Reproject data on the learnt components (factors) after learning.
    print_info : bool, default True
        If True, print information about the decomposition being performed.
        In the case of sklearn.decomposition objects, this includes the
        values of all arguments of the chosen sklearn algorithm.
    **kwargs
        passed to the partial_fit/fit functions.

    Raises
    ------
    ValueError
        If ``output_dimension`` is None for 'pca', 'orpca' or 'ornmf', or
        if ``algorithm`` is not recognised.
    ImportError
        If ``algorithm='pca'`` is requested and scikit-learn is missing.
    NotImplementedError
        If a navigation or signal mask is given together with
        ``algorithm='svd'``.

    References
    ----------
    .. [KeenanKotula2004] M. Keenan and P. Kotula, "Accounting for Poisson
        noise in the multivariate analysis of ToF-SIMS spectrum images",
        Surf. Interface Anal 36(3) (2004): 203-212.

    See Also
    --------
    * :py:meth:`~.learn.mva.MVA.decomposition` for non-lazy signals
    * :py:func:`dask.array.linalg.svd`
    * :py:class:`sklearn.decomposition.IncrementalPCA`
    * :py:class:`~.learn.rpca.ORPCA`
    * :py:class:`~.learn.ornmf.ORNMF`

    """
    if kwargs.get("bounds", False):
        warnings.warn(
            "The `bounds` keyword is deprecated and will be removed "
            "in v2.0. Since version > 1.3 this has no effect.",
            VisibleDeprecationWarning,
        )
    # ``bounds`` has no effect; always drop it so that it is never forwarded
    # to the estimator constructors / fit functions below.
    kwargs.pop("bounds", None)

    # Deprecate 'ONMF' for 'ORNMF'
    if algorithm == "ONMF":
        warnings.warn(
            "The argument `algorithm='ONMF'` has been deprecated and may "
            "be removed in future. Please use `algorithm='ornmf'` instead.",
            VisibleDeprecationWarning,
        )
        algorithm = "ornmf"

    # Deprecate uppercase to favour lowercase (consistent
    # with non-lazy decomposition)
    if algorithm in ["PCA", "ORPCA", "ORNMF"]:
        warnings.warn(
            "The argument `algorithm='{}'` has been deprecated and may "
            "be removed in future. Please use `algorithm='{}'` instead.".
            format(algorithm, algorithm.lower()),
            VisibleDeprecationWarning,
        )
        algorithm = algorithm.lower()

    # Check algorithms requiring output_dimension
    algorithms_require_dimension = ["pca", "orpca", "ornmf"]
    if algorithm in algorithms_require_dimension and output_dimension is None:
        raise ValueError(
            "`output_dimension` must be specified for '{}'".format(
                algorithm))

    explained_variance = None
    explained_variance_ratio = None

    _al_data = self._data_aligned_with_axes
    nav_dim = self.axes_manager.navigation_dimension
    nav_chunks = _al_data.chunks[:nav_dim]
    sig_chunks = _al_data.chunks[nav_dim:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])

    if output_dimension and blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)
    blocksize *= num_chunks

    # Initialize print_info
    to_print = [
        "Decomposition info:",
        " normalize_poissonian_noise={}".format(normalize_poissonian_noise),
        " algorithm={}".format(algorithm),
        " output_dimension={}".format(output_dimension)
    ]

    # LEARN
    if algorithm == "pca":
        if not import_sklearn.sklearn_installed:
            raise ImportError("algorithm='pca' requires scikit-learn")

        obj = import_sklearn.sklearn.decomposition.IncrementalPCA(
            n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        # IncrementalPCA only yields factors while learning, so the loadings
        # must always be obtained by reprojecting the data.
        reproject = True
        to_print.extend(["scikit-learn estimator:", obj])

    elif algorithm == "orpca":
        from hyperspy.learn.rpca import ORPCA

        batch_size = kwargs.pop("batch_size", None)
        obj = ORPCA(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)

    elif algorithm == "ornmf":
        from hyperspy.learn.ornmf import ORNMF

        batch_size = kwargs.pop("batch_size", None)
        obj = ORNMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)

    elif algorithm != "svd":
        raise ValueError("'algorithm' not recognised")

    original_data = self.data
    try:
        _logger.info("Performing decomposition analysis")

        if normalize_poissonian_noise:
            _logger.info("Scaling the data to normalize Poissonian noise")

            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            # nm / sm are True where the data IS used (inverse of the masks).
            nm = da.logical_not(
                da.zeros(self.axes_manager.navigation_shape[::-1],
                         chunks=nav_chunks)
                if navigation_mask is None else to_array(navigation_mask,
                                                         chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(self.axes_manager.signal_shape[::-1],
                         chunks=sig_chunks)
                if signal_mask is None else to_array(signal_mask,
                                                     chunks=sig_chunks))
            bH, aG = da.compute(
                data.sum(axis=tuple(range(ndim))),
                data.sum(axis=tuple(range(ndim, ndim + sdim))),
            )
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)

            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)

            coeff = raG[(..., ) + (None, ) * rbH.ndim] * rbH[(None, ) *
                                                             raG.ndim +
                                                             (..., )]
            # map_blocks returns a new array; the result has to be kept for
            # the NaN clean-up to have any effect.
            coeff = coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # LEARN
        if algorithm == "svd":
            reproject = False
            from dask.array.linalg import svd

            try:
                self._unfolded4decomposition = self.unfold()
                # TODO: implement masking
                # Compare against None: array-like masks have no
                # unambiguous truth value.
                if navigation_mask is not None or signal_mask is not None:
                    raise NotImplementedError(
                        "Masking is not yet implemented for lazy SVD")

                U, S, V = svd(self.data)

                if output_dimension is None:
                    min_shape = min(min(U.shape), min(V.shape))
                else:
                    min_shape = output_dimension

                U = U[:, :min_shape]
                S = S[:min_shape]
                V = V[:min_shape]

                factors = V.T
                explained_variance = S**2 / self.data.shape[0]
                loadings = U * S
            finally:
                if self._unfolded4decomposition is True:
                    self.fold()
                    # Reset the flag (was a no-op ``is False`` comparison).
                    self._unfolded4decomposition = False
        else:
            this_data = []
            try:
                for chunk in progressbar(
                        self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask,
                        ),
                        total=nblocks,
                        leave=True,
                        desc="Learn",
                ):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                # Feed whatever is left over from an incomplete batch.
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                # Deliberate: allow the user to stop learning early and keep
                # whatever has been learnt so far.
                pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == "pca":
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T

        elif algorithm == "orpca":
            factors, loadings = obj.finish()
            loadings = loadings.T

        elif algorithm == "ornmf":
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == "pca":
                method = obj.transform

                def post(a):
                    return np.concatenate(a, axis=0)

            elif algorithm == "orpca":
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            elif algorithm == "ornmf":
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            _map = map(
                lambda thing: method(thing),
                self._block_iterator(
                    flat_signal=True,
                    get=get,
                    signal_mask=signal_mask,
                    navigation_mask=navigation_mask,
                ),
            )
            H = []
            try:
                for thing in progressbar(_map, total=nblocks,
                                         desc="Project"):
                    H.append(thing)
            except KeyboardInterrupt:
                # Deliberate: a partial projection is kept (see below).
                pass
            loadings = post(H)

        if explained_variance is not None and explained_variance_ratio is None:
            explained_variance_ratio = (explained_variance /
                                        explained_variance.sum())

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        if algorithm != "svd":  # Only needed for online algorithms
            try:
                loadings = _reshuffle_mixed_blocks(loadings, ndim,
                                                   (output_dimension, ),
                                                   nav_chunks).reshape(
                                                       (-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
    finally:
        # Always restore the (possibly noise-normalised) data.
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    if algorithm != "svd":
        target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio

    # Rescale the results if the noise was normalized. Use the same
    # truthiness test as the normalisation step above so that both always
    # happen together (e.g. for ``1`` or ``numpy.bool_`` flags).
    if normalize_poissonian_noise:
        target.factors = target.factors * rbH.ravel()[:, np.newaxis]
        target.loadings = target.loadings * raG.ravel()[:, np.newaxis]

    # Print details about the decomposition we just performed
    if print_info:
        print("\n".join([str(pr) for pr in to_print]))
def partial_fit(self, X, y=None, check_input=True):
    """Incremental fit with X. All of X is processed as a single batch.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    check_input : bool
        Run check_array on X.
    y : Ignored

    Returns
    -------
    self : object
        Returns the instance itself.

    Raises
    ------
    TypeError
        If ``check_input`` is True and ``X`` is sparse.
    ValueError
        If ``n_components`` is incompatible with the shape of ``X``, if the
        number of components changed between calls, or if fewer singular
        values than ``n_components_`` were obtained.
    """
    if check_input:
        if sparse.issparse(X):
            raise TypeError(
                "IncrementalPCA.partial_fit does not support "
                "sparse input. Either convert data to dense "
                "or use IncrementalPCA.fit to do so in batches.")
        X = check_array(
            X,
            copy=self.copy,
            dtype=[np.float64, np.float32],
            accept_multiple_blocks=True,
        )
    n_samples, n_features = X.shape
    if not hasattr(self, "components_"):
        self.components_ = None

    if self.n_components is None:
        if self.components_ is None:
            self.n_components_ = min(n_samples, n_features)
        else:
            self.n_components_ = self.components_.shape[0]
    elif not 1 <= self.n_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d, need "
                         "more rows than columns for IncrementalPCA "
                         "processing" % (self.n_components, n_features))
    elif not self.n_components <= n_samples:
        raise ValueError("n_components=%r must be less or equal to "
                         "the batch number of samples "
                         "%d." % (self.n_components, n_samples))
    else:
        self.n_components_ = self.n_components

    if (self.components_ is not None) and (self.components_.shape[0] !=
                                           self.n_components_):
        raise ValueError("Number of input features has changed from %i "
                         "to %i between calls to partial_fit! Try "
                         "setting n_components to a fixed value." %
                         (self.components_.shape[0], self.n_components_))

    # This is the first partial_fit
    if not hasattr(self, "n_samples_seen_"):
        self.n_samples_seen_ = 0
        self.mean_ = 0.0
        self.var_ = 0.0

    # Update stats - they are 0 if this is the first step
    # The next line is equivalent with
    # np.repeat(self.n_samples_seen_, X.shape[1]),
    # which dask-array does not support
    last_sample_count = np.tile(np.expand_dims(self.n_samples_seen_, 0),
                                X.shape[1])
    col_mean, col_var, n_total_samples = _incremental_mean_and_var(
        X,
        last_mean=self.mean_,
        last_variance=self.var_,
        last_sample_count=last_sample_count,
    )
    n_total_samples = da.compute(n_total_samples[0])[0]

    # Whitening
    if self.n_samples_seen_ == 0:
        # If it is the first step, simply whiten X
        X -= col_mean
    else:
        col_batch_mean = np.mean(X, axis=0)
        X -= col_batch_mean
        # Build matrix of combined previous basis and new data
        mean_correction = np.sqrt(
            (self.n_samples_seen_ * n_samples) /
            n_total_samples) * (self.mean_ - col_batch_mean)
        X = np.vstack((
            self.singular_values_.reshape((-1, 1)) * self.components_,
            X,
            mean_correction,
        ))

    # The following part is modified so that it can fit to large dask-array
    solver = self._get_solver(X, self.n_components_)
    if solver in {"full", "tsqr"}:
        U, S, V = linalg.svd(X)
        # manually implement full_matrix=False
        if V.shape[0] > len(S):
            V = V[:len(S)]
        if U.shape[1] > len(S):
            U = U[:, :len(S)]
    else:
        # randomized
        random_state = check_random_state(self.random_state)
        seed = draw_seed(random_state, np.iinfo("int32").max)
        n_power_iter = self.iterated_power
        U, S, V = linalg.svd_compressed(X,
                                        self.n_components_,
                                        n_power_iter=n_power_iter,
                                        seed=seed)
    U, V = svd_flip(U, V)
    explained_variance = S**2 / (n_total_samples - 1)
    components, singular_values = V, S

    # The following part is also updated for randomized solver,
    # which computes only a limited number of the singular values
    total_var = np.sum(col_var)
    explained_variance_ratio = (explained_variance / total_var *
                                ((n_total_samples - 1) / n_total_samples))

    actual_rank = min(n_features, n_total_samples)
    if self.n_components_ < actual_rank:
        if solver == "randomized":
            noise_variance = (total_var - explained_variance.sum()) / (
                actual_rank - self.n_components_)
        else:
            noise_variance = da.mean(
                explained_variance[self.n_components_:])
    else:
        noise_variance = 0.0

    self.n_samples_seen_ = n_total_samples

    try:
        (
            self.n_samples_,
            self.mean_,
            self.var_,
            self.n_features_,
            self.components_,
            self.explained_variance_,
            self.explained_variance_ratio_,
            self.singular_values_,
            self.noise_variance_,
        ) = compute(
            n_samples,
            col_mean,
            col_var,
            n_features,
            components[:self.n_components_],
            explained_variance[:self.n_components_],
            explained_variance_ratio[:self.n_components_],
            singular_values[:self.n_components_],
            noise_variance,
        )
    except ValueError as e:
        if np.isnan([n_samples, n_features]).any():
            msg = (
                "Computation of the SVD raised an error. It is possible "
                "n_components is too large. i.e., "
                "`n_components > np.nanmin(X.shape) = "
                "np.nanmin({})`\n\n"
                "A possible resolution to this error is to ensure that "
                "n_components <= min(n_samples, n_features)")
            raise ValueError(msg.format(X.shape)) from e
        # Unrelated ValueError: propagate unchanged.
        raise

    n_singular_values = len(self.singular_values_)
    if n_singular_values < self.n_components_:
        # Remember the requested value before truncating the attribute,
        # otherwise the message below would always report n == s.
        requested_components = self.n_components_
        self.n_components_ = n_singular_values
        msg = (
            "n_components={n} is larger than the number of singular values"
            " ({s}) (note: PCA has attributes as if n_components == {s})")
        raise ValueError(
            msg.format(n=requested_components, s=n_singular_values))

    return self
def decomposition(self,
                  normalize_poissonian_noise=False,
                  algorithm='svd',
                  output_dimension=None,
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  bounds=False,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data, keeping n
    significant components.

    Parameters
    ----------
    normalize_poissonian_noise : bool
        If True, scale the SI to normalize Poissonian noise
    algorithm : str
        One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
        lazy SVD decomposition from dask.
    output_dimension : int
        the number of significant components to keep. If None, keep all
        (only valid for SVD)
    get : dask scheduler
        the dask scheduler to use for computations;
        default `dask.threaded.get`
    num_chunks : int
        the number of dask chunks to pass to the decomposition model.
        More chunks require more memory, but should run faster. Will be
        increased to contain at least output_dimension signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool
        Reproject data on the learnt components (factors) after learning.
    bounds : bool
        Deprecated; has no effect other than emitting a warning.
    **kwargs
        passed to the partial_fit/fit functions.

    Raises
    ------
    ValueError
        If ``output_dimension`` is None for a non-'svd' algorithm, or if
        ``algorithm`` is not one of the supported names.
    NotImplementedError
        If a navigation or signal mask is given together with
        ``algorithm='svd'``.

    Notes
    -----
    Various algorithm parameters and their default values:
        ONMF:
            lambda1=1,
            kappa=1,
            robust=False,
            store_r=False
            batch_size=None
        ORPCA:
            fast=True,
            lambda1=None,
            lambda2=None,
            method=None,
            learning_rate=None,
            init=None,
            training_samples=None,
            momentum=None
        PCA:
            batch_size=None,
            copy=True,
            white=False

    """
    if bounds:
        msg = ("The `bounds` keyword is deprecated and will be removed "
               "in v2.0. Since version > 1.3 this has no effect.")
        warnings.warn(msg, VisibleDeprecationWarning)
    explained_variance = None
    explained_variance_ratio = None
    _al_data = self._data_aligned_with_axes
    nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
    sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])
    if algorithm != "svd" and output_dimension is None:
        raise ValueError("With the %s the output_dimension "
                         "must be specified" % algorithm)
    if output_dimension and blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)
    blocksize *= num_chunks

    # LEARN
    if algorithm == 'PCA':
        from sklearn.decomposition import IncrementalPCA
        obj = IncrementalPCA(n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        # IncrementalPCA only yields factors while learning, so the loadings
        # must always be obtained by reprojecting the data.
        reproject = True

    elif algorithm == 'ORPCA':
        from hyperspy.learn.rpca import ORPCA
        kwg = {'fast': True}
        kwg.update(kwargs)
        obj = ORPCA(output_dimension, **kwg)
        method = partial(obj.fit, iterating=True)

    elif algorithm == 'ONMF':
        from hyperspy.learn.onmf import ONMF
        batch_size = kwargs.pop('batch_size', None)
        obj = ONMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)

    elif algorithm != "svd":
        raise ValueError('algorithm not known')

    original_data = self.data
    try:
        if normalize_poissonian_noise:
            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            # nm / sm are True where the data IS used (inverse of the masks).
            nm = da.logical_not(
                da.zeros(self.axes_manager.navigation_shape[::-1],
                         chunks=nav_chunks)
                if navigation_mask is None else to_array(navigation_mask,
                                                         chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(self.axes_manager.signal_shape[::-1],
                         chunks=sig_chunks)
                if signal_mask is None else to_array(signal_mask,
                                                     chunks=sig_chunks))
            # The reduction axes are passed as tuples; a ``range`` object is
            # not an accepted axis specification.
            bH, aG = da.compute(
                data.sum(axis=tuple(range(ndim))),
                data.sum(axis=tuple(range(ndim, ndim + sdim))))
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)

            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)

            coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                rbH[(None, ) * raG.ndim + (...,)]
            # map_blocks returns a new array; the result has to be kept for
            # the NaN clean-up to have any effect.
            coeff = coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # LEARN
        if algorithm == "svd":
            reproject = False
            from dask.array.linalg import svd
            try:
                self._unfolded4decomposition = self.unfold()
                # TODO: implement masking
                # Compare against None: array-like masks have no
                # unambiguous truth value.
                if navigation_mask is not None or signal_mask is not None:
                    # NotImplemented is a constant, not an exception class.
                    raise NotImplementedError(
                        "Masking is not yet implemented for lazy SVD.")

                U, S, V = svd(self.data)
                factors = V.T
                explained_variance = S**2 / self.data.shape[0]
                loadings = U * S
            finally:
                if self._unfolded4decomposition is True:
                    self.fold()
                    # Reset the flag (was a no-op ``is False`` comparison).
                    self._unfolded4decomposition = False
        else:
            this_data = []
            try:
                for chunk in progressbar(self._block_iterator(
                        flat_signal=True,
                        get=get,
                        signal_mask=signal_mask,
                        navigation_mask=navigation_mask),
                        total=nblocks,
                        leave=True,
                        desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                # Feed whatever is left over from an incomplete batch.
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                # Deliberate: allow the user to stop learning early and keep
                # whatever has been learnt so far.
                pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == 'PCA':
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T

        elif algorithm == 'ORPCA':
            _, _, U, S, V = obj.finish()
            factors = U * S
            loadings = V
            explained_variance = S**2 / len(factors)

        elif algorithm == 'ONMF':
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == 'PCA':
                method = obj.transform

                def post(a):
                    return np.concatenate(a, axis=0)

            elif algorithm == 'ORPCA':
                method = obj.project
                obj.R = []

                def post(a):
                    return obj.finish()[4]

            elif algorithm == 'ONMF':
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            _map = map(
                lambda thing: method(thing),
                self._block_iterator(flat_signal=True,
                                     get=get,
                                     signal_mask=signal_mask,
                                     navigation_mask=navigation_mask))
            H = []
            try:
                for thing in progressbar(_map, total=nblocks,
                                         desc='Project'):
                    H.append(thing)
            except KeyboardInterrupt:
                # Deliberate: a partial projection is kept (see below).
                pass
            loadings = post(H)

        if explained_variance is not None and \
                explained_variance_ratio is None:
            explained_variance_ratio = \
                explained_variance / explained_variance.sum()

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        if algorithm != "svd":  # Only needed for online algorithms
            try:
                loadings = _reshuffle_mixed_blocks(loadings, ndim,
                                                   (output_dimension, ),
                                                   nav_chunks).reshape(
                                                       (-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
    finally:
        # Always restore the (possibly noise-normalised) data.
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    if algorithm != "svd":
        target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio

    # Rescale the results if the noise was normalized. Use the same
    # truthiness test as the normalisation step above so that both always
    # happen together (e.g. for ``1`` or ``numpy.bool_`` flags).
    if normalize_poissonian_noise:
        target.factors = target.factors * rbH.ravel()[:, np.newaxis]
        target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
def decomposition(self,
                  normalize_poissonian_noise=False,
                  algorithm='svd',
                  output_dimension=None,
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  bounds=False,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data, keeping n
    significant components.

    Parameters
    ----------
    normalize_poissonian_noise : bool
        If True, scale the SI to normalize Poissonian noise
    algorithm : str
        One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
        lazy SVD decomposition from dask.
    output_dimension : int
        the number of significant components to keep. If None, keep all
        (only valid for SVD)
    get : dask scheduler
        the dask scheduler to use for computations;
        default `dask.threaded.get`
    num_chunks : int
        the number of dask chunks to pass to the decomposition model.
        More chunks require more memory, but should run faster. Will be
        increased to contain at least output_dimension signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool
        Reproject data on the learnt components (factors) after learning.
    bounds : bool
        Deprecated; has no effect other than emitting a warning.
    **kwargs
        passed to the partial_fit/fit functions.

    Raises
    ------
    ValueError
        If ``output_dimension`` is None for a non-'svd' algorithm, or if
        ``algorithm`` is not one of the supported names.
    NotImplementedError
        If a navigation or signal mask is given together with
        ``algorithm='svd'``.

    Notes
    -----
    Various algorithm parameters and their default values:
        ONMF:
            lambda1=1,
            kappa=1,
            robust=False,
            store_r=False
            batch_size=None
        ORPCA:
            fast=True,
            lambda1=None,
            lambda2=None,
            method=None,
            learning_rate=None,
            init=None,
            training_samples=None,
            momentum=None
        PCA:
            batch_size=None,
            copy=True,
            white=False

    """
    if bounds:
        msg = (
            "The `bounds` keyword is deprecated and will be removed "
            "in v2.0. Since version > 1.3 this has no effect.")
        warnings.warn(msg, VisibleDeprecationWarning)
    explained_variance = None
    explained_variance_ratio = None
    _al_data = self._data_aligned_with_axes
    nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
    sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])
    if algorithm != "svd" and output_dimension is None:
        raise ValueError("With the %s the output_dimension "
                         "must be specified" % algorithm)
    if output_dimension and blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)
    blocksize *= num_chunks

    # LEARN
    if algorithm == 'PCA':
        from sklearn.decomposition import IncrementalPCA
        obj = IncrementalPCA(n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        # IncrementalPCA only yields factors while learning, so the loadings
        # must always be obtained by reprojecting the data.
        reproject = True

    elif algorithm == 'ORPCA':
        from hyperspy.learn.rpca import ORPCA
        kwg = {'fast': True}
        kwg.update(kwargs)
        obj = ORPCA(output_dimension, **kwg)
        method = partial(obj.fit, iterating=True)

    elif algorithm == 'ONMF':
        from hyperspy.learn.onmf import ONMF
        batch_size = kwargs.pop('batch_size', None)
        obj = ONMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)

    elif algorithm != "svd":
        raise ValueError('algorithm not known')

    original_data = self.data
    try:
        if normalize_poissonian_noise:
            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            # nm / sm are True where the data IS used (inverse of the masks).
            nm = da.logical_not(
                da.zeros(
                    self.axes_manager.navigation_shape[::-1],
                    chunks=nav_chunks)
                if navigation_mask is None else to_array(
                    navigation_mask, chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(
                    self.axes_manager.signal_shape[::-1],
                    chunks=sig_chunks)
                if signal_mask is None else to_array(
                    signal_mask, chunks=sig_chunks))
            bH, aG = da.compute(
                data.sum(axis=tuple(range(ndim))),
                data.sum(axis=tuple(range(ndim, ndim + sdim))))
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)

            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)

            coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                rbH[(None, ) * raG.ndim + (...,)]
            # map_blocks returns a new array; the result has to be kept for
            # the NaN clean-up to have any effect.
            coeff = coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # LEARN
        if algorithm == "svd":
            reproject = False
            from dask.array.linalg import svd
            try:
                self._unfolded4decomposition = self.unfold()
                # TODO: implement masking
                # Compare against None: array-like masks have no
                # unambiguous truth value.
                if navigation_mask is not None or signal_mask is not None:
                    # NotImplemented is a constant, not an exception class.
                    raise NotImplementedError(
                        "Masking is not yet implemented for lazy SVD."
                    )

                U, S, V = svd(self.data)
                factors = V.T
                explained_variance = S ** 2 / self.data.shape[0]
                loadings = U * S
            finally:
                if self._unfolded4decomposition is True:
                    self.fold()
                    # Reset the flag (was a no-op ``is False`` comparison).
                    self._unfolded4decomposition = False
        else:
            this_data = []
            try:
                for chunk in progressbar(
                        self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                        total=nblocks,
                        leave=True,
                        desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                # Feed whatever is left over from an incomplete batch.
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                # Deliberate: allow the user to stop learning early and keep
                # whatever has been learnt so far.
                pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == 'PCA':
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T

        elif algorithm == 'ORPCA':
            _, _, U, S, V = obj.finish()
            factors = U * S
            loadings = V
            explained_variance = S**2 / len(factors)

        elif algorithm == 'ONMF':
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == 'PCA':
                method = obj.transform

                def post(a):
                    return np.concatenate(a, axis=0)

            elif algorithm == 'ORPCA':
                method = obj.project
                obj.R = []

                def post(a):
                    return obj.finish()[4]

            elif algorithm == 'ONMF':
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            _map = map(lambda thing: method(thing),
                       self._block_iterator(
                           flat_signal=True,
                           get=get,
                           signal_mask=signal_mask,
                           navigation_mask=navigation_mask))
            H = []
            try:
                for thing in progressbar(
                        _map, total=nblocks, desc='Project'):
                    H.append(thing)
            except KeyboardInterrupt:
                # Deliberate: a partial projection is kept (see below).
                pass
            loadings = post(H)

        if explained_variance is not None and \
                explained_variance_ratio is None:
            explained_variance_ratio = \
                explained_variance / explained_variance.sum()

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        if algorithm != "svd":  # Only needed for online algorithms
            try:
                loadings = _reshuffle_mixed_blocks(
                    loadings,
                    ndim,
                    (output_dimension,),
                    nav_chunks).reshape((-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
    finally:
        # Always restore the (possibly noise-normalised) data.
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    if algorithm != "svd":
        target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio

    # Rescale the results if the noise was normalized. Use the same
    # truthiness test as the normalisation step above so that both always
    # happen together (e.g. for ``1`` or ``numpy.bool_`` flags).
    if normalize_poissonian_noise:
        target.factors = target.factors * rbH.ravel()[:, np.newaxis]
        target.loadings = target.loadings * raG.ravel()[:, np.newaxis]