Example #1
def test_linalg_consistent_names():
    m, n = 20, 10
    mat = np.random.rand(m, n)
    data = da.from_array(mat, chunks=(10, n), name='A')

    q1, r1 = qr(data)
    q2, r2 = qr(data)
    assert same_keys(q1, q2)
    assert same_keys(r1, r2)

    u1, s1, v1 = svd(data)
    u2, s2, v2 = svd(data)
    assert same_keys(u1, u2)
    assert same_keys(s1, s2)
    assert same_keys(v1, v2)
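This test relies on dask's deterministic key naming: building the same graph
twice from an identically named input produces identical task keys, which is
what the `same_keys` helper checks. A minimal standalone sketch of the same
property (here comparing `.name`, the prefix of every task key):

import numpy as np
import dask.array as da

x = da.from_array(np.ones((20, 10)), chunks=(10, 10), name="A")
q1, r1 = da.linalg.qr(x)
q2, r2 = da.linalg.qr(x)
# Deterministic tokenization gives both graphs the same output names,
# so repeated identical computations can share intermediate results.
assert q1.name == q2.name and r1.name == r2.name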
Example #2
def test_linalg_consistent_names():
    m, n = 20, 10
    mat = np.random.rand(m, n)
    data = da.from_array(mat, chunks=(10, n), name="A")

    q1, r1 = qr(data)
    q2, r2 = qr(data)
    assert same_keys(q1, q2)
    assert same_keys(r1, r2)

    u1, s1, v1 = svd(data)
    u2, s2, v2 = svd(data)
    assert same_keys(u1, u2)
    assert same_keys(s1, s2)
    assert same_keys(v1, v2)
Example #3
def test_svd_dtype_preservation(chunks, dtype):
    x = da.random.random((50, 50), chunks=chunks).astype(dtype)
    u, s, v = svd(x)
    assert u.dtype == s.dtype == v.dtype == dtype
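A quick way to observe the property this test asserts; a hedged sketch
assuming a recent dask version (the chunking keeps a single column chunk,
since `da.linalg.svd` expects a tall-and-skinny layout):

import dask.array as da

x = da.random.random((50, 50), chunks=(25, 50)).astype("float32")
u, s, v = da.linalg.svd(x)
# svd keeps the input dtype instead of promoting to float64
print(u.dtype, s.dtype, v.dtype)  # float32 float32 float32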
Example #4
    def decomposition(self,
                      normalize_poissonian_noise=False,
                      algorithm="svd",
                      output_dimension=None,
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      print_info=True,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data.

        The results are stored in ``self.learning_results``.

        Read more in the :ref:`User Guide <big_data.decomposition>`.

        Parameters
        ----------
        normalize_poissonian_noise : bool, default False
            If True, scale the signal to normalize Poissonian noise using
            the approach described in [KeenanKotula2004]_.
        algorithm : {'svd', 'pca', 'orpca', 'ornmf'}, default 'svd'
            The decomposition algorithm to use.
        output_dimension : int or None, default None
            Number of components to keep/calculate. If None, keep all
            (only valid for the 'svd' algorithm).
        get : dask scheduler
            The dask scheduler to use for computations;
            default `dask.threaded.get`.
        num_chunks : int or None, default None
            The number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least ``output_dimension`` signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decomposition.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool, default True
            Reproject data on the learnt components (factors) after learning.
        print_info : bool, default True
            If True, print information about the decomposition being performed.
            In the case of sklearn.decomposition objects, this includes the
            values of all arguments of the chosen sklearn algorithm.
        **kwargs
            Passed to the partial_fit/fit functions.

        References
        ----------
        .. [KeenanKotula2004] M. Keenan and P. Kotula, "Accounting for Poisson noise
            in the multivariate analysis of ToF-SIMS spectrum images", Surf.
            Interface Anal 36(3) (2004): 203-212.

        See Also
        --------
        * :py:meth:`~.learn.mva.MVA.decomposition` for non-lazy signals
        * :py:func:`dask.array.linalg.svd`
        * :py:class:`sklearn.decomposition.IncrementalPCA`
        * :py:class:`~.learn.rpca.ORPCA`
        * :py:class:`~.learn.ornmf.ORNMF`

        """
        if kwargs.get("bounds", False):
            warnings.warn(
                "The `bounds` keyword is deprecated and will be removed "
                "in v2.0. Since version > 1.3 this has no effect.",
                VisibleDeprecationWarning,
            )
            kwargs.pop("bounds", None)

        # Deprecate 'ONMF' for 'ORNMF'
        if algorithm == "ONMF":
            warnings.warn(
                "The argument `algorithm='ONMF'` has been deprecated and may "
                "be removed in future. Please use `algorithm='ornmf'` instead.",
                VisibleDeprecationWarning,
            )
            algorithm = "ornmf"

        # Deprecate uppercase to favour lowercase (consistent
        # with non-lazy decomposition)
        if algorithm in ["PCA", "ORPCA", "ORNMF"]:
            warnings.warn(
                "The argument `algorithm='{}'` has been deprecated and may "
                "be removed in future. Please use `algorithm='{}'` instead.".
                format(algorithm, algorithm.lower()),
                VisibleDeprecationWarning,
            )
            algorithm = algorithm.lower()

        # Check algorithms requiring output_dimension
        algorithms_require_dimension = ["pca", "orpca", "ornmf"]
        if algorithm in algorithms_require_dimension and output_dimension is None:
            raise ValueError(
                "`output_dimension` must be specified for '{}'".format(
                    algorithm))

        explained_variance = None
        explained_variance_ratio = None

        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])

        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)

        blocksize *= num_chunks

        # Initialize return_info and print_info
        to_return = None
        to_print = [
            "Decomposition info:", "  normalize_poissonian_noise={}".format(
                normalize_poissonian_noise),
            "  algorithm={}".format(algorithm),
            "  output_dimension={}".format(output_dimension)
        ]

        # LEARN
        if algorithm == "pca":
            if not import_sklearn.sklearn_installed:
                raise ImportError("algorithm='pca' requires scikit-learn")

            obj = import_sklearn.sklearn.decomposition.IncrementalPCA(
                n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True
            to_print.extend(["scikit-learn estimator:", obj])

        elif algorithm == "orpca":
            from hyperspy.learn.rpca import ORPCA

            batch_size = kwargs.pop("batch_size", None)
            obj = ORPCA(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        elif algorithm == "ornmf":
            from hyperspy.learn.ornmf import ORNMF

            batch_size = kwargs.pop("batch_size", None)
            obj = ORNMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        elif algorithm != "svd":
            raise ValueError("'algorithm' not recognised")

        original_data = self.data
        try:
            _logger.info("Performing decomposition analysis")

            if normalize_poissonian_noise:
                _logger.info("Scaling the data to normalize Poissonian noise")

                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(self.axes_manager.navigation_shape[::-1],
                             chunks=nav_chunks) if navigation_mask is None else
                    to_array(navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(self.axes_manager.signal_shape[::-1],
                             chunks=sig_chunks) if signal_mask is None else
                    to_array(signal_mask, chunks=sig_chunks))
                bH, aG = da.compute(
                    data.sum(axis=tuple(range(ndim))),
                    data.sum(axis=tuple(range(ndim, ndim + sdim))),
                )
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) +
                            (None, ) * rbH.ndim] * rbH[(None, ) * raG.ndim +
                                                       (..., )]
                coeff = coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "svd":
                reproject = False
                from dask.array.linalg import svd

                try:
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    if navigation_mask or signal_mask:
                        raise NotImplementedError(
                            "Masking is not yet implemented for lazy SVD")

                    U, S, V = svd(self.data)

                    if output_dimension is None:
                        min_shape = min(min(U.shape), min(V.shape))
                    else:
                        min_shape = output_dimension

                    U = U[:, :min_shape]
                    S = S[:min_shape]
                    V = V[:min_shape]

                    factors = V.T
                    explained_variance = S**2 / self.data.shape[0]
                    loadings = U * S
                finally:
                    if self._unfolded4decomposition is True:
                        self.fold()
                        self._unfolded4decomposition = False
            else:
                this_data = []
                try:
                    for chunk in progressbar(
                            self._block_iterator(
                                flat_signal=True,
                                get=get,
                                signal_mask=signal_mask,
                                navigation_mask=navigation_mask,
                            ),
                            total=nblocks,
                            leave=True,
                            desc="Learn",
                    ):
                        this_data.append(chunk)
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            method(thedata)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                except KeyboardInterrupt:
                    pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == "pca":
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == "orpca":
                factors, loadings = obj.finish()
                loadings = loadings.T

            elif algorithm == "ornmf":
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == "pca":
                    method = obj.transform

                    def post(a):
                        return np.concatenate(a, axis=0)

                elif algorithm == "orpca":
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                elif algorithm == "ornmf":
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                _map = map(
                    lambda thing: method(thing),
                    self._block_iterator(
                        flat_signal=True,
                        get=get,
                        signal_mask=signal_mask,
                        navigation_mask=navigation_mask,
                    ),
                )
                H = []
                try:
                    for thing in progressbar(_map,
                                             total=nblocks,
                                             desc="Project"):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and explained_variance_ratio is None:
                explained_variance_ratio = (
                    explained_variance / explained_variance.sum())

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "svd":  # Only needed for online algorithms
                try:
                    loadings = _reshuffle_mixed_blocks(
                        loadings, ndim, (output_dimension,), nav_chunks
                    ).reshape((-1, output_dimension))
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
                    pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "svd":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]

        # Print details about the decomposition we just performed
        if print_info:
            print("\n".join([str(pr) for pr in to_print]))
Example #5
    def partial_fit(self, X, y=None, check_input=True):
        """Incremental fit with X. All of X is processed as a single batch.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        check_input : bool
            Run check_array on X.

        y : Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if check_input:
            if sparse.issparse(X):
                raise TypeError(
                    "IncrementalPCA.partial_fit does not support "
                    "sparse input. Either convert data to dense "
                    "or use IncrementalPCA.fit to do so in batches.")
            X = check_array(
                X,
                copy=self.copy,
                dtype=[np.float64, np.float32],
                accept_multiple_blocks=True,
            )
        n_samples, n_features = X.shape
        if not hasattr(self, "components_"):
            self.components_ = None

        if self.n_components is None:
            if self.components_ is None:
                self.n_components_ = min(n_samples, n_features)
            else:
                self.n_components_ = self.components_.shape[0]
        elif not 1 <= self.n_components <= n_features:
            raise ValueError("n_components=%r invalid for n_features=%d, need "
                             "more rows than columns for IncrementalPCA "
                             "processing" % (self.n_components, n_features))
        elif not self.n_components <= n_samples:
            raise ValueError("n_components=%r must be less or equal to "
                             "the batch number of samples "
                             "%d." % (self.n_components, n_samples))
        else:
            self.n_components_ = self.n_components

        if (self.components_ is not None) and (self.components_.shape[0] !=
                                               self.n_components_):
            raise ValueError("Number of input features has changed from %i "
                             "to %i between calls to partial_fit! Try "
                             "setting n_components to a fixed value." %
                             (self.components_.shape[0], self.n_components_))

        # This is the first partial_fit
        if not hasattr(self, "n_samples_seen_"):
            self.n_samples_seen_ = 0
            self.mean_ = 0.0
            self.var_ = 0.0

        # Update stats - they are 0 if this is the first step
        # The next line is equivalent to np.repeat(self.n_samples_seen_, X.shape[1]),
        # which dask-array does not support
        last_sample_count = np.tile(np.expand_dims(self.n_samples_seen_, 0),
                                    X.shape[1])
        col_mean, col_var, n_total_samples = _incremental_mean_and_var(
            X,
            last_mean=self.mean_,
            last_variance=self.var_,
            last_sample_count=last_sample_count,
        )
        n_total_samples = da.compute(n_total_samples[0])[0]

        # Whitening
        if self.n_samples_seen_ == 0:
            # If it is the first step, simply whiten X
            X -= col_mean
        else:
            col_batch_mean = np.mean(X, axis=0)
            X -= col_batch_mean
            # Build matrix of combined previous basis and new data
            mean_correction = np.sqrt(
                (self.n_samples_seen_ * n_samples) /
                n_total_samples) * (self.mean_ - col_batch_mean)
            X = np.vstack((
                self.singular_values_.reshape((-1, 1)) * self.components_,
                X,
                mean_correction,
            ))

        # The following part is modified so that it can fit to large dask-array
        solver = self._get_solver(X, self.n_components_)
        if solver in {"full", "tsqr"}:
            U, S, V = linalg.svd(X)
            # manually implement full_matrices=False
            if V.shape[0] > len(S):
                V = V[:len(S)]
            if U.shape[1] > len(S):
                U = U[:, :len(S)]
        else:
            # randomized
            random_state = check_random_state(self.random_state)
            seed = draw_seed(random_state, np.iinfo("int32").max)
            n_power_iter = self.iterated_power
            U, S, V = linalg.svd_compressed(X,
                                            self.n_components_,
                                            n_power_iter=n_power_iter,
                                            seed=seed)
        U, V = svd_flip(U, V)
        explained_variance = S**2 / (n_total_samples - 1)
        components, singular_values = V, S

        # The following part is also updated for randomized solver,
        # which computes only a limited number of the singular values
        total_var = np.sum(col_var)
        explained_variance_ratio = (explained_variance / total_var *
                                    ((n_total_samples - 1) / n_total_samples))

        actual_rank = min(n_features, n_total_samples)
        if self.n_components_ < actual_rank:
            if solver == "randomized":
                noise_variance = (total_var - explained_variance.sum()) / (
                    actual_rank - self.n_components_)
            else:
                noise_variance = da.mean(
                    explained_variance[self.n_components_:])
        else:
            noise_variance = 0.0

        self.n_samples_seen_ = n_total_samples

        try:
            (
                self.n_samples_,
                self.mean_,
                self.var_,
                self.n_features_,
                self.components_,
                self.explained_variance_,
                self.explained_variance_ratio_,
                self.singular_values_,
                self.noise_variance_,
            ) = compute(
                n_samples,
                col_mean,
                col_var,
                n_features,
                components[:self.n_components_],
                explained_variance[:self.n_components_],
                explained_variance_ratio[:self.n_components_],
                singular_values[:self.n_components_],
                noise_variance,
            )
        except ValueError as e:
            if np.isnan([n_samples, n_features]).any():
                msg = (
                    "Computation of the SVD raised an error. It is possible "
                    "n_components is too large. i.e., "
                    "`n_components > np.nanmin(X.shape) = "
                    "np.nanmin({})`\n\n"
                    "A possible resolution to this error is to ensure that "
                    "n_components <= min(n_samples, n_features)")
                raise ValueError(msg.format(X.shape)) from e
            raise e

        if len(self.singular_values_) < self.n_components_:
            msg = (
                "n_components={n} is larger than the number of singular values"
                " ({s}) (note: PCA has attributes as if n_components == {s})")
            # Format the message before overwriting n_components_, so the
            # error reports the value that was actually requested
            msg = msg.format(n=self.n_components_, s=len(self.singular_values_))
            self.n_components_ = len(self.singular_values_)
            raise ValueError(msg)

        return self
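A sketch of driving `partial_fit` incrementally over a dask array, assuming
the class above is exposed as dask-ml's IncrementalPCA (class path and batch
sizes are illustrative):

import dask.array as da
from dask_ml.decomposition import IncrementalPCA

X = da.random.random((1000, 20), chunks=(200, 20))
ipca = IncrementalPCA(n_components=5)
# Feed one 200-row batch per call; the running mean/variance and the
# component basis are updated incrementally, as in the method above.
for i in range(0, 1000, 200):
    ipca.partial_fit(X[i:i + 200])
print(ipca.explained_variance_ratio_)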
Example #6
    def decomposition(self,
                      normalize_poissonian_noise=False,
                      algorithm='svd',
                      output_dimension=None,
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=False,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
            lazy SVD decomposition from dask.
        output_dimension : int
            the number of significant components to keep. If None, keep all
            (only valid for SVD)
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decomposition.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False,
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        if bounds:
            msg = ("The `bounds` keyword is deprecated and will be removed "
                   "in v2.0. Since version > 1.3 this has no effect.")
            warnings.warn(msg, VisibleDeprecationWarning)
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if algorithm != "svd" and output_dimension is None:
            raise ValueError("With the %s the output_dimension "
                             "must be specified" % algorithm)
        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks
        # LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)
        elif algorithm != "svd":
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(self.axes_manager.navigation_shape[::-1],
                             chunks=nav_chunks) if navigation_mask is None else
                    to_array(navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(self.axes_manager.signal_shape[::-1],
                             chunks=sig_chunks) if signal_mask is None else
                    to_array(signal_mask, chunks=sig_chunks))
                bH, aG = da.compute(
                    data.sum(axis=tuple(range(ndim))),
                    data.sum(axis=tuple(range(ndim, ndim + sdim))))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                    rbH[(None, ) * raG.ndim + (...,)]
                coeff = coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "svd":
                reproject = False
                from dask.array.linalg import svd
                try:
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    if navigation_mask or signal_mask:
                        raise NotImplementedError(
                            "Masking is not yet implemented for lazy SVD.")
                    U, S, V = svd(self.data)
                    factors = V.T
                    explained_variance = S**2 / self.data.shape[0]
                    loadings = U * S
                finally:
                    if self._unfolded4decomposition is True:
                        self.fold()
                        self._unfolded4decomposition = False
            else:
                this_data = []
                try:
                    for chunk in progressbar(self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                                             total=nblocks,
                                             leave=True,
                                             desc='Learn'):
                        this_data.append(chunk)
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            method(thedata)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                except KeyboardInterrupt:
                    pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform

                    def post(a):
                        return np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []

                    def post(a):
                        return obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                _map = map(
                    lambda thing: method(thing),
                    self._block_iterator(flat_signal=True,
                                         get=get,
                                         signal_mask=signal_mask,
                                         navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(_map,
                                             total=nblocks,
                                             desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "svd":  # Only needed for online algorithms
                try:
                    loadings = _reshuffle_mixed_blocks(loadings, ndim,
                                                       (output_dimension, ),
                                                       nav_chunks).reshape(
                                                           (-1,
                                                            output_dimension))
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
                    pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "svd":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
Example #7
    def decomposition(self,
                      normalize_poissonian_noise=False,
                      algorithm='svd',
                      output_dimension=None,
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=False,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
            lazy SVD decomposition from dask.
        output_dimension : int
            the number of significant components to keep. If None, keep all
            (only valid for SVD)
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decomposition.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False,
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        if bounds:
            msg = (
                "The `bounds` keyword is deprecated and will be removed "
                "in v2.0. Since version > 1.3 this has no effect.")
            warnings.warn(msg, VisibleDeprecationWarning)
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if algorithm != "svd" and output_dimension is None:
            raise ValueError("With the %s the output_dimension "
                             "must be specified" % algorithm)
        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks
        # LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)
        elif algorithm != "svd":
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(
                        self.axes_manager.navigation_shape[::-1],
                        chunks=nav_chunks)
                    if navigation_mask is None else to_array(
                        navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(
                        self.axes_manager.signal_shape[::-1],
                        chunks=sig_chunks)
                    if signal_mask is None else to_array(
                        signal_mask, chunks=sig_chunks))
                bH, aG = da.compute(
                    data.sum(axis=tuple(range(ndim))),
                    data.sum(axis=tuple(range(ndim, ndim + sdim))))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                    rbH[(None, ) * raG.ndim + (...,)]
                coeff = coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "svd":
                reproject = False
                from dask.array.linalg import svd
                try:
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    if navigation_mask or signal_mask:
                        raise NotImplementedError(
                            "Masking is not yet implemented for lazy SVD."
                        )
                    U, S, V = svd(self.data)
                    factors = V.T
                    explained_variance = S ** 2 / self.data.shape[0]
                    loadings = U * S
                finally:
                    if self._unfolded4decomposition is True:
                        self.fold()
                        self._unfolded4decomposition = False
            else:
                this_data = []
                try:
                    for chunk in progressbar(
                            self._block_iterator(
                                flat_signal=True,
                                get=get,
                                signal_mask=signal_mask,
                                navigation_mask=navigation_mask),
                            total=nblocks,
                            leave=True,
                            desc='Learn'):
                        this_data.append(chunk)
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            method(thedata)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                except KeyboardInterrupt:
                    pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform

                    def post(a): return np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []

                    def post(a): return obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project

                    def post(a): return np.concatenate(a, axis=1).T

                _map = map(lambda thing: method(thing),
                           self._block_iterator(
                               flat_signal=True,
                               get=get,
                               signal_mask=signal_mask,
                               navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(
                            _map, total=nblocks, desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "svd":  # Only needed for online algorithms
                try:
                    loadings = _reshuffle_mixed_blocks(
                        loadings,
                        ndim,
                        (output_dimension,),
                        nav_chunks).reshape((-1, output_dimension))
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
                    pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "svd":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
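For reference, the Poissonian-noise scaling used by all of the
`decomposition` variants above reduces, for a 2-D unfolded matrix, to
dividing by the outer product of square-rooted row and column sums
(Keenan & Kotula, 2004). A minimal NumPy sketch with illustrative data:

import numpy as np

D = np.random.poisson(5.0, size=(100, 50)).astype(float)  # count data
aG = D.sum(axis=1)  # per-pixel totals (navigation sums)
bH = D.sum(axis=0)  # per-channel totals (signal sums)
W = np.sqrt(aG)[:, None] * np.sqrt(bH)[None, :]
W[W == 0] = 1       # avoid division by zero, mirroring the coeff == 0 guard
D_scaled = D / W    # decompose D_scaled, then rescale factors and loadings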