Beispiel #1
0
def _remove_bad_pixels(dask_array, bad_pixel_array):
    """Replace values in bad pixels with mean of neighbors.

    Parameters
    ----------
    dask_array : Dask array
        Must be at least two dimensions
    bad_pixel_array : array-like
        Must either have the same shape as dask_array,
        or the same shape as the two last dimensions of dask_array.

    Returns
    -------
    data_output : Dask array

    Examples
    --------
    >>> import pyxem.utils.dask_tools as dt
    >>> s = pxm.dummy_data.dummy_data.get_dead_pixel_signal(lazy=True)
    >>> dead_pixels = dt._find_dead_pixels(s.data)
    >>> data_output = dt._remove_bad_pixels(s.data, dead_pixels)

    """
    if len(dask_array.shape) < 2:
        raise ValueError("dask_array {0} must be at least 2 dimensions".format(
            dask_array.shape))
    if bad_pixel_array.shape == dask_array.shape:
        pass
    elif bad_pixel_array.shape == dask_array.shape[-2:]:
        temp_array = da.zeros_like(dask_array)
        bad_pixel_array = da.add(temp_array, bad_pixel_array)
    else:
        raise ValueError(
            "bad_pixel_array {0} must either 2-D and have the same shape "
            "as the two last dimensions in dask_array {1}. Or be "
            "the same shape as dask_array {2}".format(bad_pixel_array.shape,
                                                      dask_array.shape[-2:],
                                                      dask_array.shape))
    dif0 = da.roll(dask_array, shift=1, axis=-2)
    dif1 = da.roll(dask_array, shift=-1, axis=-2)
    dif2 = da.roll(dask_array, shift=1, axis=-1)
    dif3 = da.roll(dask_array, shift=-1, axis=-1)

    dif = (dif0 + dif1 + dif2 + dif3) / 4
    dif = dif * bad_pixel_array

    data_output = da.multiply(dask_array, da.logical_not(bad_pixel_array))
    data_output = data_output + dif

    return data_output
Beispiel #2
0
def test_arithmetic():
    x = np.arange(5).astype('f4') + 2
    y = np.arange(5).astype('i8') + 2
    z = np.arange(5).astype('i4') + 2
    a = da.from_array(x, chunks=(2,))
    b = da.from_array(y, chunks=(2,))
    c = da.from_array(z, chunks=(2,))
    assert eq(a + b, x + y)
    assert eq(a * b, x * y)
    assert eq(a - b, x - y)
    assert eq(a / b, x / y)
    assert eq(b & b, y & y)
    assert eq(b | b, y | y)
    assert eq(b ^ b, y ^ y)
    assert eq(a // b, x // y)
    assert eq(a ** b, x ** y)
    assert eq(a % b, x % y)
    assert eq(a > b, x > y)
    assert eq(a < b, x < y)
    assert eq(a >= b, x >= y)
    assert eq(a <= b, x <= y)
    assert eq(a == b, x == y)
    assert eq(a != b, x != y)

    assert eq(a + 2, x + 2)
    assert eq(a * 2, x * 2)
    assert eq(a - 2, x - 2)
    assert eq(a / 2, x / 2)
    assert eq(b & True, y & True)
    assert eq(b | True, y | True)
    assert eq(b ^ True, y ^ True)
    assert eq(a // 2, x // 2)
    assert eq(a ** 2, x ** 2)
    assert eq(a % 2, x % 2)
    assert eq(a > 2, x > 2)
    assert eq(a < 2, x < 2)
    assert eq(a >= 2, x >= 2)
    assert eq(a <= 2, x <= 2)
    assert eq(a == 2, x == 2)
    assert eq(a != 2, x != 2)

    assert eq(2 + b, 2 + y)
    assert eq(2 * b, 2 * y)
    assert eq(2 - b, 2 - y)
    assert eq(2 / b, 2 / y)
    assert eq(True & b, True & y)
    assert eq(True | b, True | y)
    assert eq(True ^ b, True ^ y)
    assert eq(2 // b, 2 // y)
    assert eq(2 ** b, 2 ** y)
    assert eq(2 % b, 2 % y)
    assert eq(2 > b, 2 > y)
    assert eq(2 < b, 2 < y)
    assert eq(2 >= b, 2 >= y)
    assert eq(2 <= b, 2 <= y)
    assert eq(2 == b, 2 == y)
    assert eq(2 != b, 2 != y)

    assert eq(-a, -x)
    assert eq(abs(a), abs(x))
    assert eq(~(a == b), ~(x == y))
    assert eq(~(a == b), ~(x == y))

    assert eq(da.logaddexp(a, b), np.logaddexp(x, y))
    assert eq(da.logaddexp2(a, b), np.logaddexp2(x, y))
    assert eq(da.exp(b), np.exp(y))
    assert eq(da.log(a), np.log(x))
    assert eq(da.log10(a), np.log10(x))
    assert eq(da.log1p(a), np.log1p(x))
    assert eq(da.expm1(b), np.expm1(y))
    assert eq(da.sqrt(a), np.sqrt(x))
    assert eq(da.square(a), np.square(x))

    assert eq(da.sin(a), np.sin(x))
    assert eq(da.cos(b), np.cos(y))
    assert eq(da.tan(a), np.tan(x))
    assert eq(da.arcsin(b/10), np.arcsin(y/10))
    assert eq(da.arccos(b/10), np.arccos(y/10))
    assert eq(da.arctan(b/10), np.arctan(y/10))
    assert eq(da.arctan2(b*10, a), np.arctan2(y*10, x))
    assert eq(da.hypot(b, a), np.hypot(y, x))
    assert eq(da.sinh(a), np.sinh(x))
    assert eq(da.cosh(b), np.cosh(y))
    assert eq(da.tanh(a), np.tanh(x))
    assert eq(da.arcsinh(b*10), np.arcsinh(y*10))
    assert eq(da.arccosh(b*10), np.arccosh(y*10))
    assert eq(da.arctanh(b/10), np.arctanh(y/10))
    assert eq(da.deg2rad(a), np.deg2rad(x))
    assert eq(da.rad2deg(a), np.rad2deg(x))

    assert eq(da.logical_and(a < 1, b < 4), np.logical_and(x < 1, y < 4))
    assert eq(da.logical_or(a < 1, b < 4), np.logical_or(x < 1, y < 4))
    assert eq(da.logical_xor(a < 1, b < 4), np.logical_xor(x < 1, y < 4))
    assert eq(da.logical_not(a < 1), np.logical_not(x < 1))
    assert eq(da.maximum(a, 5 - a), np.maximum(a, 5 - a))
    assert eq(da.minimum(a, 5 - a), np.minimum(a, 5 - a))
    assert eq(da.fmax(a, 5 - a), np.fmax(a, 5 - a))
    assert eq(da.fmin(a, 5 - a), np.fmin(a, 5 - a))

    assert eq(da.isreal(a + 1j * b), np.isreal(x + 1j * y))
    assert eq(da.iscomplex(a + 1j * b), np.iscomplex(x + 1j * y))
    assert eq(da.isfinite(a), np.isfinite(x))
    assert eq(da.isinf(a), np.isinf(x))
    assert eq(da.isnan(a), np.isnan(x))
    assert eq(da.signbit(a - 3), np.signbit(x - 3))
    assert eq(da.copysign(a - 3, b), np.copysign(x - 3, y))
    assert eq(da.nextafter(a - 3, b), np.nextafter(x - 3, y))
    assert eq(da.ldexp(c, c), np.ldexp(z, z))
    assert eq(da.fmod(a * 12, b), np.fmod(x * 12, y))
    assert eq(da.floor(a * 0.5), np.floor(x * 0.5))
    assert eq(da.ceil(a), np.ceil(x))
    assert eq(da.trunc(a / 2), np.trunc(x / 2))

    assert eq(da.degrees(b), np.degrees(y))
    assert eq(da.radians(a), np.radians(x))

    assert eq(da.rint(a + 0.3), np.rint(x + 0.3))
    assert eq(da.fix(a - 2.5), np.fix(x - 2.5))

    assert eq(da.angle(a + 1j), np.angle(x + 1j))
    assert eq(da.real(a + 1j), np.real(x + 1j))
    assert eq((a + 1j).real, np.real(x + 1j))
    assert eq(da.imag(a + 1j), np.imag(x + 1j))
    assert eq((a + 1j).imag, np.imag(x + 1j))
    assert eq(da.conj(a + 1j * b), np.conj(x + 1j * y))
    assert eq((a + 1j * b).conj(), (x + 1j * y).conj())

    assert eq(da.clip(b, 1, 4), np.clip(y, 1, 4))
    assert eq(da.fabs(b), np.fabs(y))
    assert eq(da.sign(b - 2), np.sign(y - 2))

    l1, l2 = da.frexp(a)
    r1, r2 = np.frexp(x)
    assert eq(l1, r1)
    assert eq(l2, r2)

    l1, l2 = da.modf(a)
    r1, r2 = np.modf(x)
    assert eq(l1, r1)
    assert eq(l2, r2)

    assert eq(da.around(a, -1), np.around(x, -1))
Beispiel #3
0
    def decomposition(self,
                      output_dimension,
                      normalize_poissonian_noise=False,
                      algorithm='PCA',
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=True,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        output_dimension : int
            the number of significant components to keep
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA
            from scikit-learn is run.
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain atleast output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decompostion.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        bounds : {tuple, bool}
            The (min, max) values of the data to normalize before learning.
            If tuple (min, max), those values will be used for normalization.
            If True, extremes will be looked up (expensive), default.
            If False, no normalization is done (learning may be very slow).
            If normalize_poissonian_noise is True, this cannot be True.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks

        # LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        else:
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                if bounds is True:
                    bounds = False
                    # warnings.warn?
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(
                        self.axes_manager.navigation_shape[::-1],
                        chunks=nav_chunks)
                    if navigation_mask is None else to_array(
                        navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(
                        self.axes_manager.signal_shape[::-1],
                        chunks=sig_chunks)
                    if signal_mask is None else to_array(
                        signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(
                    data.sum(axis=range(ndim)),
                    data.sum(axis=range(ndim, ndim + sdim)))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                    rbH[(None, ) * raG.ndim + (...,)]
                coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # normalize the data for learning algs:
            if bounds:
                if bounds is True:
                    _min, _max = da.compute(self.data.min(), self.data.max())
                else:
                    _min, _max = bounds
                self.data = (self.data - _min) / (_max - _min)

            # LEARN
            this_data = []
            try:
                for chunk in progressbar(
                        self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                        total=nblocks,
                        leave=True,
                        desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform

                    def post(a): return np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []

                    def post(a): return obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project

                    def post(a): return np.concatenate(a, axis=1).T

                _map = map(lambda thing: method(thing),
                           self._block_iterator(
                               flat_signal=True,
                               get=get,
                               signal_mask=signal_mask,
                               navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(
                            _map, total=nblocks, desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            try:
                loadings = _reshuffle_mixed_blocks(
                    loadings,
                    ndim,
                    (output_dimension,),
                    nav_chunks).reshape((-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio
Beispiel #4
0
    def decomposition(self,
                      normalize_poissonian_noise=False,
                      algorithm="svd",
                      output_dimension=None,
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      print_info=True,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data.

        The results are stored in ``self.learning_results``.

        Read more in the :ref:`User Guide <big_data.decomposition>`.

        Parameters
        ----------
        normalize_poissonian_noise : bool, default False
            If True, scale the signal to normalize Poissonian noise using
            the approach described in [KeenanKotula2004]_.
        algorithm : {'svd', 'pca', 'orpca', 'ornmf'}, default 'svd'
            The decomposition algorithm to use.
        output_dimension : int or None, default None
            Number of components to keep/calculate. If None, keep all
            (only valid for 'svd' algorithm)
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int or None, default None
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least ``output_dimension`` signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decompostion.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool, default True
            Reproject data on the learnt components (factors) after learning.
        print_info : bool, default True
            If True, print information about the decomposition being performed.
            In the case of sklearn.decomposition objects, this includes the
            values of all arguments of the chosen sklearn algorithm.
        **kwargs
            passed to the partial_fit/fit functions.

        References
        ----------
        .. [KeenanKotula2004] M. Keenan and P. Kotula, "Accounting for Poisson noise
            in the multivariate analysis of ToF-SIMS spectrum images", Surf.
            Interface Anal 36(3) (2004): 203-212.

        See Also
        --------
        * :py:meth:`~.learn.mva.MVA.decomposition` for non-lazy signals
        * :py:func:`dask.array.linalg.svd`
        * :py:class:`sklearn.decomposition.IncrementalPCA`
        * :py:class:`~.learn.rpca.ORPCA`
        * :py:class:`~.learn.ornmf.ORNMF`

        """
        if kwargs.get("bounds", False):
            warnings.warn(
                "The `bounds` keyword is deprecated and will be removed "
                "in v2.0. Since version > 1.3 this has no effect.",
                VisibleDeprecationWarning,
            )
            kwargs.pop("bounds", None)

        # Deprecate 'ONMF' for 'ORNMF'
        if algorithm == "ONMF":
            warnings.warn(
                "The argument `algorithm='ONMF'` has been deprecated and may "
                "be removed in future. Please use `algorithm='ornmf'` instead.",
                VisibleDeprecationWarning,
            )
            algorithm = "ornmf"

        # Deprecate uppercase to favour lowercase (consistent
        # with non-lazy decomposition)
        if algorithm in ["PCA", "ORPCA", "ORNMF"]:
            warnings.warn(
                "The argument `algorithm='{}'` has been deprecated and may "
                "be removed in future. Please use `algorithm='{}'` instead.".
                format(algorithm, algorithm.lower()),
                VisibleDeprecationWarning,
            )
            algorithm = algorithm.lower()

        # Check algorithms requiring output_dimension
        algorithms_require_dimension = ["pca", "orpca", "ornmf"]
        if algorithm in algorithms_require_dimension and output_dimension is None:
            raise ValueError(
                "`output_dimension` must be specified for '{}'".format(
                    algorithm))

        explained_variance = None
        explained_variance_ratio = None

        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])

        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)

        blocksize *= num_chunks

        # Initialize return_info and print_info
        to_return = None
        to_print = [
            "Decomposition info:", "  normalize_poissonian_noise={}".format(
                normalize_poissonian_noise),
            "  algorithm={}".format(algorithm),
            "  output_dimension={}".format(output_dimension)
        ]

        # LEARN
        if algorithm == "pca":
            if not import_sklearn.sklearn_installed:
                raise ImportError("algorithm='pca' requires scikit-learn")

            obj = import_sklearn.sklearn.decomposition.IncrementalPCA(
                n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True
            to_print.extend(["scikit-learn estimator:", obj])

        elif algorithm == "orpca":
            from hyperspy.learn.rpca import ORPCA

            batch_size = kwargs.pop("batch_size", None)
            obj = ORPCA(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        elif algorithm == "ornmf":
            from hyperspy.learn.ornmf import ORNMF

            batch_size = kwargs.pop("batch_size", None)
            obj = ORNMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        elif algorithm != "svd":
            raise ValueError("'algorithm' not recognised")

        original_data = self.data
        try:
            _logger.info("Performing decomposition analysis")

            if normalize_poissonian_noise:
                _logger.info("Scaling the data to normalize Poissonian noise")

                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(self.axes_manager.navigation_shape[::-1],
                             chunks=nav_chunks) if navigation_mask is None else
                    to_array(navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(self.axes_manager.signal_shape[::-1],
                             chunks=sig_chunks) if signal_mask is None else
                    to_array(signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(
                    data.sum(axis=tuple(range(ndim))),
                    data.sum(axis=tuple(range(ndim, ndim + sdim))),
                )
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) +
                            (None, ) * rbH.ndim] * rbH[(None, ) * raG.ndim +
                                                       (..., )]
                coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "svd":
                reproject = False
                from dask.array.linalg import svd

                try:
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    if navigation_mask or signal_mask:
                        raise NotImplementedError(
                            "Masking is not yet implemented for lazy SVD")

                    U, S, V = svd(self.data)

                    if output_dimension is None:
                        min_shape = min(min(U.shape), min(V.shape))
                    else:
                        min_shape = output_dimension

                    U = U[:, :min_shape]
                    S = S[:min_shape]
                    V = V[:min_shape]

                    factors = V.T
                    explained_variance = S**2 / self.data.shape[0]
                    loadings = U * S
                finally:
                    if self._unfolded4decomposition is True:
                        self.fold()
                        self._unfolded4decomposition is False
            else:
                this_data = []
                try:
                    for chunk in progressbar(
                            self._block_iterator(
                                flat_signal=True,
                                get=get,
                                signal_mask=signal_mask,
                                navigation_mask=navigation_mask,
                            ),
                            total=nblocks,
                            leave=True,
                            desc="Learn",
                    ):
                        this_data.append(chunk)
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            method(thedata)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                except KeyboardInterrupt:
                    pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == "pca":
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == "orpca":
                factors, loadings = obj.finish()
                loadings = loadings.T

            elif algorithm == "ornmf":
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == "pca":
                    method = obj.transform

                    def post(a):
                        return np.concatenate(a, axis=0)

                elif algorithm == "orpca":
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                elif algorithm == "ornmf":
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                _map = map(
                    lambda thing: method(thing),
                    self._block_iterator(
                        flat_signal=True,
                        get=get,
                        signal_mask=signal_mask,
                        navigation_mask=navigation_mask,
                    ),
                )
                H = []
                try:
                    for thing in progressbar(_map,
                                             total=nblocks,
                                             desc="Project"):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and explained_variance_ratio is None:
                explained_variance_ratio = explained_variance / explained_variance.sum(
                )

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "svd":  # Only needed for online algorithms
                try:
                    loadings = _reshuffle_mixed_blocks(loadings, ndim,
                                                       (output_dimension, ),
                                                       nav_chunks).reshape(
                                                           (-1,
                                                            output_dimension))
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
                    pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "svd":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]

        # Print details about the decomposition we just performed
        if print_info:
            print("\n".join([str(pr) for pr in to_print]))
Beispiel #5
0
def test_arithmetic():
    x = np.arange(5).astype('f4') + 2
    y = np.arange(5).astype('i8') + 2
    z = np.arange(5).astype('i4') + 2
    a = da.from_array(x, chunks=(2, ))
    b = da.from_array(y, chunks=(2, ))
    c = da.from_array(z, chunks=(2, ))
    assert eq(a + b, x + y)
    assert eq(a * b, x * y)
    assert eq(a - b, x - y)
    assert eq(a / b, x / y)
    assert eq(b & b, y & y)
    assert eq(b | b, y | y)
    assert eq(b ^ b, y ^ y)
    assert eq(a // b, x // y)
    assert eq(a**b, x**y)
    assert eq(a % b, x % y)
    assert eq(a > b, x > y)
    assert eq(a < b, x < y)
    assert eq(a >= b, x >= y)
    assert eq(a <= b, x <= y)
    assert eq(a == b, x == y)
    assert eq(a != b, x != y)

    assert eq(a + 2, x + 2)
    assert eq(a * 2, x * 2)
    assert eq(a - 2, x - 2)
    assert eq(a / 2, x / 2)
    assert eq(b & True, y & True)
    assert eq(b | True, y | True)
    assert eq(b ^ True, y ^ True)
    assert eq(a // 2, x // 2)
    assert eq(a**2, x**2)
    assert eq(a % 2, x % 2)
    assert eq(a > 2, x > 2)
    assert eq(a < 2, x < 2)
    assert eq(a >= 2, x >= 2)
    assert eq(a <= 2, x <= 2)
    assert eq(a == 2, x == 2)
    assert eq(a != 2, x != 2)

    assert eq(2 + b, 2 + y)
    assert eq(2 * b, 2 * y)
    assert eq(2 - b, 2 - y)
    assert eq(2 / b, 2 / y)
    assert eq(True & b, True & y)
    assert eq(True | b, True | y)
    assert eq(True ^ b, True ^ y)
    assert eq(2 // b, 2 // y)
    assert eq(2**b, 2**y)
    assert eq(2 % b, 2 % y)
    assert eq(2 > b, 2 > y)
    assert eq(2 < b, 2 < y)
    assert eq(2 >= b, 2 >= y)
    assert eq(2 <= b, 2 <= y)
    assert eq(2 == b, 2 == y)
    assert eq(2 != b, 2 != y)

    assert eq(-a, -x)
    assert eq(abs(a), abs(x))
    assert eq(~(a == b), ~(x == y))
    assert eq(~(a == b), ~(x == y))

    assert eq(da.logaddexp(a, b), np.logaddexp(x, y))
    assert eq(da.logaddexp2(a, b), np.logaddexp2(x, y))
    assert eq(da.exp(b), np.exp(y))
    assert eq(da.log(a), np.log(x))
    assert eq(da.log10(a), np.log10(x))
    assert eq(da.log1p(a), np.log1p(x))
    assert eq(da.expm1(b), np.expm1(y))
    assert eq(da.sqrt(a), np.sqrt(x))
    assert eq(da.square(a), np.square(x))

    assert eq(da.sin(a), np.sin(x))
    assert eq(da.cos(b), np.cos(y))
    assert eq(da.tan(a), np.tan(x))
    assert eq(da.arcsin(b / 10), np.arcsin(y / 10))
    assert eq(da.arccos(b / 10), np.arccos(y / 10))
    assert eq(da.arctan(b / 10), np.arctan(y / 10))
    assert eq(da.arctan2(b * 10, a), np.arctan2(y * 10, x))
    assert eq(da.hypot(b, a), np.hypot(y, x))
    assert eq(da.sinh(a), np.sinh(x))
    assert eq(da.cosh(b), np.cosh(y))
    assert eq(da.tanh(a), np.tanh(x))
    assert eq(da.arcsinh(b * 10), np.arcsinh(y * 10))
    assert eq(da.arccosh(b * 10), np.arccosh(y * 10))
    assert eq(da.arctanh(b / 10), np.arctanh(y / 10))
    assert eq(da.deg2rad(a), np.deg2rad(x))
    assert eq(da.rad2deg(a), np.rad2deg(x))

    assert eq(da.logical_and(a < 1, b < 4), np.logical_and(x < 1, y < 4))
    assert eq(da.logical_or(a < 1, b < 4), np.logical_or(x < 1, y < 4))
    assert eq(da.logical_xor(a < 1, b < 4), np.logical_xor(x < 1, y < 4))
    assert eq(da.logical_not(a < 1), np.logical_not(x < 1))
    assert eq(da.maximum(a, 5 - a), np.maximum(a, 5 - a))
    assert eq(da.minimum(a, 5 - a), np.minimum(a, 5 - a))
    assert eq(da.fmax(a, 5 - a), np.fmax(a, 5 - a))
    assert eq(da.fmin(a, 5 - a), np.fmin(a, 5 - a))

    assert eq(da.isreal(a + 1j * b), np.isreal(x + 1j * y))
    assert eq(da.iscomplex(a + 1j * b), np.iscomplex(x + 1j * y))
    assert eq(da.isfinite(a), np.isfinite(x))
    assert eq(da.isinf(a), np.isinf(x))
    assert eq(da.isnan(a), np.isnan(x))
    assert eq(da.signbit(a - 3), np.signbit(x - 3))
    assert eq(da.copysign(a - 3, b), np.copysign(x - 3, y))
    assert eq(da.nextafter(a - 3, b), np.nextafter(x - 3, y))
    assert eq(da.ldexp(c, c), np.ldexp(z, z))
    assert eq(da.fmod(a * 12, b), np.fmod(x * 12, y))
    assert eq(da.floor(a * 0.5), np.floor(x * 0.5))
    assert eq(da.ceil(a), np.ceil(x))
    assert eq(da.trunc(a / 2), np.trunc(x / 2))

    assert eq(da.degrees(b), np.degrees(y))
    assert eq(da.radians(a), np.radians(x))

    assert eq(da.rint(a + 0.3), np.rint(x + 0.3))
    assert eq(da.fix(a - 2.5), np.fix(x - 2.5))

    assert eq(da.angle(a + 1j), np.angle(x + 1j))
    assert eq(da.real(a + 1j), np.real(x + 1j))
    assert eq((a + 1j).real, np.real(x + 1j))
    assert eq(da.imag(a + 1j), np.imag(x + 1j))
    assert eq((a + 1j).imag, np.imag(x + 1j))
    assert eq(da.conj(a + 1j * b), np.conj(x + 1j * y))
    assert eq((a + 1j * b).conj(), (x + 1j * y).conj())

    assert eq(da.clip(b, 1, 4), np.clip(y, 1, 4))
    assert eq(da.fabs(b), np.fabs(y))
    assert eq(da.sign(b - 2), np.sign(y - 2))

    l1, l2 = da.frexp(a)
    r1, r2 = np.frexp(x)
    assert eq(l1, r1)
    assert eq(l2, r2)

    l1, l2 = da.modf(a)
    r1, r2 = np.modf(x)
    assert eq(l1, r1)
    assert eq(l2, r2)

    assert eq(da.around(a, -1), np.around(x, -1))
Beispiel #6
0
    def decomposition(self,
                      normalize_poissonian_noise=False,
                      algorithm='svd',
                      output_dimension=None,
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=False,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
            lazy SVD decomposition from dask.
        output_dimension : int
            the number of significant components to keep. If None, keep all
            (only valid for SVD)
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain atleast output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decompostion.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        if bounds:
            msg = ("The `bounds` keyword is deprecated and will be removed "
                   "in v2.0. Since version > 1.3 this has no effect.")
            warnings.warn(msg, VisibleDeprecationWarning)
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if algorithm != "svd" and output_dimension is None:
            raise ValueError("With the %s the output_dimension "
                             "must be specified" % algorithm)
        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks
        # LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)
        elif algorithm != "svd":
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(self.axes_manager.navigation_shape[::-1],
                             chunks=nav_chunks) if navigation_mask is None else
                    to_array(navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(self.axes_manager.signal_shape[::-1],
                             chunks=sig_chunks) if signal_mask is None else
                    to_array(signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(data.sum(axis=range(ndim)),
                                    data.sum(axis=range(ndim, ndim + sdim)))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                    rbH[(None, ) * raG.ndim + (...,)]
                coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "svd":
                reproject = False
                from dask.array.linalg import svd
                try:
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    if navigation_mask or signal_mask:
                        raise NotImplemented(
                            "Masking is not yet implemented for lazy SVD.")
                    U, S, V = svd(self.data)
                    factors = V.T
                    explained_variance = S**2 / self.data.shape[0]
                    loadings = U * S
                finally:
                    if self._unfolded4decomposition is True:
                        self.fold()
                        self._unfolded4decomposition is False
            else:
                this_data = []
                try:
                    for chunk in progressbar(self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                                             total=nblocks,
                                             leave=True,
                                             desc='Learn'):
                        this_data.append(chunk)
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            method(thedata)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                except KeyboardInterrupt:
                    pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform

                    def post(a):
                        return np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []

                    def post(a):
                        return obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                _map = map(
                    lambda thing: method(thing),
                    self._block_iterator(flat_signal=True,
                                         get=get,
                                         signal_mask=signal_mask,
                                         navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(_map,
                                             total=nblocks,
                                             desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "svd":  # Only needed for online algorithms
                try:
                    loadings = _reshuffle_mixed_blocks(loadings, ndim,
                                                       (output_dimension, ),
                                                       nav_chunks).reshape(
                                                           (-1,
                                                            output_dimension))
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
                    pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "svd":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
Beispiel #7
0
    def decomposition(self,
                      output_dimension,
                      normalize_poissonian_noise=False,
                      algorithm='PCA',
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=True,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        output_dimension : int
            the number of significant components to keep
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA
            from scikit-learn is run.
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain atleast output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decompostion.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        bounds : {tuple, bool}
            The (min, max) values of the data to normalize before learning.
            If tuple (min, max), those values will be used for normalization.
            If True, extremes will be looked up (expensive), default.
            If False, no normalization is done (learning may be very slow).
            If normalize_poissonian_noise is True, this cannot be True.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks

        ## LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        else:
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                if bounds is True:
                    bounds = False
                    # warnings.warn?
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(
                        self.axes_manager.navigation_shape[::-1],
                        chunks=nav_chunks)
                    if navigation_mask is None else to_array(
                        navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(
                        self.axes_manager.signal_shape[::-1],
                        chunks=sig_chunks)
                    if signal_mask is None else to_array(
                        signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(
                    data.sum(axis=range(ndim)),
                    data.sum(axis=range(ndim, ndim + sdim)))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, )*rbH.ndim] *\
                        rbH[(None, )*raG.ndim + (...,)]
                coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # normalize the data for learning algs:
            if bounds:
                if bounds is True:
                    _min, _max = da.compute(self.data.min(), self.data.max())
                else:
                    _min, _max = bounds
                self.data = (self.data - _min) / (_max - _min)

            # LEARN
            this_data = []
            try:
                for chunk in progressbar(
                        self._block_iterator(
                            flat_signal=True,
                            get=get,
                            signal_mask=signal_mask,
                            navigation_mask=navigation_mask),
                        total=nblocks,
                        leave=True,
                        desc='Learn'):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []
                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform
                    post = lambda a: np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []
                    post = lambda a: obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project
                    post = lambda a: np.concatenate(a, axis=1).T

                _map = map(lambda thing: method(thing),
                           self._block_iterator(
                               flat_signal=True,
                               get=get,
                               signal_mask=signal_mask,
                               navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(
                            _map, total=nblocks, desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            try:
                loadings = _reshuffle_mixed_blocks(
                    loadings,
                    ndim,
                    (output_dimension,),
                    nav_chunks).reshape((-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio
Beispiel #8
0
    def decomposition(self,
                      normalize_poissonian_noise=False,
                      algorithm='svd',
                      output_dimension=None,
                      signal_mask=None,
                      navigation_mask=None,
                      get=threaded.get,
                      num_chunks=None,
                      reproject=True,
                      bounds=False,
                      **kwargs):
        """Perform Incremental (Batch) decomposition on the data, keeping n
        significant components.

        Parameters
        ----------
        normalize_poissonian_noise : bool
            If True, scale the SI to normalize Poissonian noise
        algorithm : str
            One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd',
            lazy SVD decomposition from dask.
        output_dimension : int
            the number of significant components to keep. If None, keep all
            (only valid for SVD)
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain atleast output_dimension signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decompostion.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool
            Reproject data on the learnt components (factors) after learning.
        **kwargs
            passed to the partial_fit/fit functions.

        Notes
        -----
        Various algorithm parameters and their default values:
            ONMF:
                lambda1=1,
                kappa=1,
                robust=False,
                store_r=False
                batch_size=None
            ORPCA:
                fast=True,
                lambda1=None,
                lambda2=None,
                method=None,
                learning_rate=None,
                init=None,
                training_samples=None,
                momentum=None
            PCA:
                batch_size=None,
                copy=True,
                white=False


        """
        if bounds:
            msg = (
                "The `bounds` keyword is deprecated and will be removed "
                "in v2.0. Since version > 1.3 this has no effect.")
            warnings.warn(msg, VisibleDeprecationWarning)
        explained_variance = None
        explained_variance_ratio = None
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])
        if algorithm != "svd" and output_dimension is None:
            raise ValueError("With the %s the output_dimension "
                             "must be specified" % algorithm)
        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)
        blocksize *= num_chunks
        # LEARN
        if algorithm == 'PCA':
            from sklearn.decomposition import IncrementalPCA
            obj = IncrementalPCA(n_components=output_dimension)
            method = partial(obj.partial_fit, **kwargs)
            reproject = True

        elif algorithm == 'ORPCA':
            from hyperspy.learn.rpca import ORPCA
            kwg = {'fast': True}
            kwg.update(kwargs)
            obj = ORPCA(output_dimension, **kwg)
            method = partial(obj.fit, iterating=True)

        elif algorithm == 'ONMF':
            from hyperspy.learn.onmf import ONMF
            batch_size = kwargs.pop('batch_size', None)
            obj = ONMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)
        elif algorithm != "svd":
            raise ValueError('algorithm not known')

        original_data = self.data
        try:
            if normalize_poissonian_noise:
                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                nm = da.logical_not(
                    da.zeros(
                        self.axes_manager.navigation_shape[::-1],
                        chunks=nav_chunks)
                    if navigation_mask is None else to_array(
                        navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                    da.zeros(
                        self.axes_manager.signal_shape[::-1],
                        chunks=sig_chunks)
                    if signal_mask is None else to_array(
                        signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                bH, aG = da.compute(
                    data.sum(axis=tuple(range(ndim))),
                    data.sum(axis=tuple(range(ndim, ndim + sdim))))
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                coeff = raG[(..., ) + (None, ) * rbH.ndim] *\
                    rbH[(None, ) * raG.ndim + (...,)]
                coeff.map_blocks(np.nan_to_num)
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "svd":
                reproject = False
                from dask.array.linalg import svd
                try:
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    if navigation_mask or signal_mask:
                        raise NotImplemented(
                            "Masking is not yet implemented for lazy SVD."
                        )
                    U, S, V = svd(self.data)
                    factors = V.T
                    explained_variance = S ** 2 / self.data.shape[0]
                    loadings = U * S
                finally:
                    if self._unfolded4decomposition is True:
                        self.fold()
                        self._unfolded4decomposition is False
            else:
                this_data = []
                try:
                    for chunk in progressbar(
                            self._block_iterator(
                                flat_signal=True,
                                get=get,
                                signal_mask=signal_mask,
                                navigation_mask=navigation_mask),
                            total=nblocks,
                            leave=True,
                            desc='Learn'):
                        this_data.append(chunk)
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            method(thedata)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                except KeyboardInterrupt:
                    pass

            # GET ALREADY CALCULATED RESULTS
            if algorithm == 'PCA':
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == 'ORPCA':
                _, _, U, S, V = obj.finish()
                factors = U * S
                loadings = V
                explained_variance = S**2 / len(factors)

            elif algorithm == 'ONMF':
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                if algorithm == 'PCA':
                    method = obj.transform

                    def post(a): return np.concatenate(a, axis=0)
                elif algorithm == 'ORPCA':
                    method = obj.project
                    obj.R = []

                    def post(a): return obj.finish()[4]
                elif algorithm == 'ONMF':
                    method = obj.project

                    def post(a): return np.concatenate(a, axis=1).T

                _map = map(lambda thing: method(thing),
                           self._block_iterator(
                               flat_signal=True,
                               get=get,
                               signal_mask=signal_mask,
                               navigation_mask=navigation_mask))
                H = []
                try:
                    for thing in progressbar(
                            _map, total=nblocks, desc='Project'):
                        H.append(thing)
                except KeyboardInterrupt:
                    pass
                loadings = post(H)

            if explained_variance is not None and \
                    explained_variance_ratio is None:
                explained_variance_ratio = \
                    explained_variance / explained_variance.sum()

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "svd":  # Only needed for online algorithms
                try:
                    loadings = _reshuffle_mixed_blocks(
                        loadings,
                        ndim,
                        (output_dimension,),
                        nav_chunks).reshape((-1, output_dimension))
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
                    pass
        finally:
            self.data = original_data

        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "svd":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]
Beispiel #9
0
    def importData(self, samplesToRun, prescale = True, ptReweight=True, randomize = True):

      #check if this file was cached 
      if (samplesToRun, prescale, ptReweight) in self.dataMap:
        npyInputData, npyInputAnswers, npyInputWgts, npyInputSampleWgts = self.dataMap[samplesToRun, prescale, ptReweight]

      else:
        #variables to train
        vars = self.getList()
        
        inputData = np.empty([0])
        npyInputWgts = np.empty([0])
        
        import h5py

        variables = vars

        f = h5py.File(samplesToRun[0], "r")
        columnHeaders = f["reco_candidates"].attrs["column_headers"]
        f.close()

        for v in variables:
            if not v in columnHeaders:
                print "Variable not found: %s"%v

        dataColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in variables])

        ptColumnsName = ["cand_pt"]
        ptColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in ptColumnsName])
        
        labelColumnNames = ["genConstiuentMatchesVec", "genTopMatchesVec", "ncand"]
        labelColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in labelColumnNames])
        
        wgtColumnNames = ["sampleWgt"]
        wgtColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in wgtColumnNames])
        
        #load data files 
        dsets = [h5py.File(filename, mode='r')['reco_candidates'] for filename in samplesToRun]
        
        arrays = [da.from_array(dset, chunks=(65536, 1024)) for dset in dsets]
        
        x = da.concatenate(arrays, axis=0)
        
        data = x[:,dataColumns]
        
        #remove partial tops 
        inputLabels = x[:,labelColumns]
        inputAnswer = (inputLabels[:,0] > 2.99) & (inputLabels[:,1] > 0.99)
    
        inputBackground = (inputLabels[:,0] == 0) & da.logical_not(inputLabels[:,1])
    
        filterArray = []
        if self.signal and self.background:
            filterArray = ((inputAnswer == 1) | (inputBackground == 1)) & (inputLabels[:,2] > 0)
        elif self.signal:
            filterArray = (inputAnswer == 1) & (inputLabels[:,2] > 0)
        elif self.background:
            filterArray = (inputBackground == 1) & (inputLabels[:,2] > 0)

        npyInputData = data[filterArray].compute()
        #npyInputLabels = inputData.as_matrix(["genConstiuentMatchesVec", "genTopMatchesVec"])
        npyInputAnswer = inputAnswer
        if self.include:
            npyInputAnswers = da.vstack([npyInputAnswer,da.logical_not(npyInputAnswer)]).transpose()[filterArray].compute()
        else:
            npyInputAnswers = np.zeros((npyInputData.shape[0], 2))
        npyInputSampleWgts = x[:,wgtColumns][filterArray].compute()
        dataPt = x[:,ptColumns][filterArray].compute()
        npyInputWgts = np.ones(len(npyInputSampleWgts)).reshape([-1,1])

        if self.weightHist != None:
            ptBins = self.weightHist[1]
            ptWeightHist = self.weightHist[0]
            npyInputWgts *= ptWeightHist[np.digitize(dataPt, ptBins) - 1].reshape([-1,1])

        d = np.zeros((npyInputData.shape[0], 2))
        if self.domain != None and self.domain > 0:
            d[:,self.domain - 1] = 1

        return {"data":npyInputData, "labels":npyInputAnswers, "domain":d, "weights":npyInputWgts, "":npyInputSampleWgts}
Beispiel #10
0
    def get_value(self, group, corr, extras, flag, flag_row, chanslice):
        coldata = self.get_column_data(group)
        # correlation may be pre-set by plot type, or may be passed to us
        corr = self.corr if self.corr is not None else corr
        # apply correlation reduction
        if coldata is not None and coldata.ndim == 3:
            assert corr is not None
            # the mapper can't have a specific axis set
            if self.mapper.axis is not None:
                raise TypeError(f"{self.name}: unexpected column with ndim=3")
            coldata = self.ms.corr_data_mappers[corr](coldata)
        # apply mapping function
        mapper = self.mapper
        # complex values with an identity mapper get an amp mapper assigned to them by default
        if np.iscomplexobj(coldata) and mapper is data_mappers["_"]:
            mapper = data_mappers["amp"]
        coldata = mapper.mapper(
            coldata, **{name: extras[name]
                        for name in self.mapper.extras})
        # for a constant axis, compute minmax on the fly
        if mapper.const and self._minmax_autorange:
            if np.isscalar(coldata):
                min1 = max1 = coldata
            else:
                min1, max1 = coldata.data.min(), coldata.data.max()
            self.minmax = min(self.minmax[0], min1) if self.minmax[0] is not None else min1, \
                          min(self.minmax[1], max1) if self.minmax[1] is not None else max1
        # scalar is just a scalar
        if np.isscalar(coldata):
            coldata = da.array(coldata)
            flag = None
        else:
            # apply channel slicing, if there's a channel axis in the array (and the array is a DataArray)
            if type(coldata) is xarray.DataArray and 'chan' in coldata.dims:
                coldata = coldata[dict(chan=chanslice)]
            # determine flags -- start with original flags
            if flag is not None:
                if coldata.ndim == 2:
                    flag = self.ms.corr_flag_mappers[corr](flag)
                elif coldata.ndim == 1:
                    if not self.mapper.axis:
                        flag = flag_row
                    elif self.mapper.axis == 1:
                        flag = None
                # shapes must now match
                if flag is not None and coldata.shape != flag.shape:
                    raise TypeError(f"{self.name}: unexpected column shape")
        # # discretize
        # if self.nlevels:

        if coldata.dtype is bool or np.issubdtype(coldata.dtype, np.integer):
            if self._is_discrete is False:
                raise TypeError(
                    f"{self.label}: column changed from continuous-valued to discrete. This is a bug, or a very weird MS."
                )
            self._is_discrete = True
            # do we need to apply a remapping?
            if self.subset_remapper is not None:
                if type(
                        coldata
                ) is not dask.array.core.Array:  # could be xarray backed by dask array
                    coldata = coldata.data
                coldata = self.subset_remapper[coldata]
                bad_bins = da.greater_equal(coldata, len(self.subset_indices))
                if flag is None:
                    flag = bad_bins
                else:
                    flag = da.logical_or(flag.data, bad_bins)
        else:
            if self._is_discrete is True:
                raise TypeError(
                    f"{self.label}: column chnaged from discrete to continuous-valued. This is a bug, or a very weird MS."
                )
            self._is_discrete = False

        # Ensure dask arrays for creating dask masked arrays
        if isinstance(coldata, xarray.DataArray):
            coldata = coldata.data

        if isinstance(flag, xarray.DataArray):
            flag = flag.data

        bad_data = da.logical_not(da.isfinite(coldata))
        if flag is not None:
            return dama.masked_array(coldata, da.logical_or(flag, bad_data))
        else:
            return dama.masked_array(coldata, bad_data)