コード例 #1
def load_only_data(filename, array_shape, record_by, num_axes, data=None,
                   header=None, only_valid_data=False):
    """Return the data array of a ser file reshaped to ``array_shape``.

    Parameters
    ----------
    filename : str
        File passed to ``load_ser_file``; only read when ``data`` is None.
    array_shape : list of int
        Requested shape of the result. Its first entry is overwritten in
        place when only the valid part of series data is kept.
    record_by : str
        When equal to ``'image'`` the second-to-last axis is reversed.
    num_axes : int
        Expected number of dimensions of the returned array.
    data : mapping or None
        Already loaded content; its ``'Array'`` entry holds the values.
    header : mapping or None
        Already loaded header. Only consulted when ``only_valid_data`` is
        True and the stored size differs from the requested one.
    only_valid_data : bool
        For one-dimensional series, return only the first
        ``ValidNumberElements`` entries instead of padding.

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    IOError
        If the result does not have ``num_axes`` dimensions, even after
        squeezing.
    """
    if data is None:
        header, data = load_ser_file(filename)
    stored = data['Array']
    # If the acquisition stops before finishing the job, the stored file will
    # report the requested size even though no values are recorded. Therefore
    # if the shape of the retrieved array does not match that of the data
    # dimensions we must fill the rest with zeros or (better) nans if the
    # dtype is float
    if multiply(array_shape) == multiply(stored.shape):
        dc = stored
    elif only_valid_data and int(header['NumberDimensions']) == 1:
        # No need to fill with zeros if `TotalNumberElements !=
        # ValidNumberElements` for series data.
        # The valid data is always `0:ValidNumberElements`
        num_valid = header['ValidNumberElements'][0]
        dc = stored[0:num_valid, ...]
        array_shape[0] = num_valid
    else:
        # Maps will need to be filled with zeros or nans
        dc = np.zeros(multiply(array_shape), dtype=stored.dtype)
        # Compare by kind: dtype objects are not guaranteed to be singletons,
        # so an identity (`is`) test can miss float data.
        if dc.dtype.kind == 'f':
            dc[:] = np.nan
        flat = stored.ravel()
        dc[:flat.shape[0]] = flat

    dc = dc.reshape(array_shape)
    if record_by == 'image':
        dc = dc[..., ::-1, :]
    if num_axes != len(dc.shape):
        dc = dc.squeeze()
    if num_axes != len(dc.shape):
        raise IOError("Please report this issue to the HyperSpy developers.")
    return dc
コード例 #2
ファイル: fei.py プロジェクト: chrinide/hyperspy
def load_only_data(filename, array_shape, record_by, num_axes, data=None):
    """Return the data array of a ser file reshaped to ``array_shape``.

    Parameters
    ----------
    filename : str
        File passed to ``load_ser_file``; only read when ``data`` is None.
    array_shape : sequence of int
        Requested shape of the result.
    record_by : str
        When equal to ``'image'`` the second-to-last axis is reversed.
    num_axes : int
        Expected number of dimensions of the returned array.
    data : mapping or None
        Already loaded content; its ``'Array'`` entry holds the values.

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    IOError
        If the result does not have ``num_axes`` dimensions, even after
        squeezing.
    """
    if data is None:
        _, data = load_ser_file(filename)
    stored = data['Array']
    # If the acquisition stops before finishing the job, the stored file will
    # report the requested size even though no values are recorded. Therefore
    # if the shape of the retrieved array does not match that of the data
    # dimensions we must fill the rest with zeros or (better) nans if the
    # dtype is float
    if multiply(array_shape) != multiply(stored.shape):
        dc = np.zeros(multiply(array_shape), dtype=stored.dtype)
        # Compare by kind: dtype objects are not guaranteed to be singletons,
        # so an identity (`is`) test can miss float data.
        if dc.dtype.kind == 'f':
            dc[:] = np.nan
        flat = stored.ravel()
        dc[:flat.shape[0]] = flat
    else:
        dc = stored

    dc = dc.reshape(array_shape)
    if record_by == 'image':
        dc = dc[..., ::-1, :]
    if num_axes != len(dc.shape):
        dc = dc.squeeze()
    if num_axes != len(dc.shape):
        raise IOError("Please report this issue to the HyperSpy developers.")
    return dc
コード例 #3
ファイル: fei.py プロジェクト: mwalls/hyperspy
def load_only_data(filename, array_shape, record_by, num_axes, data=None):
    """Load the ser data array and reshape it to ``array_shape``.

    Parameters
    ----------
    filename : str
        File passed to ``load_ser_file``; only read when ``data`` is None.
    array_shape : sequence of int
        Requested shape of the result.
    record_by : str
        When equal to ``'image'`` the second-to-last axis is reversed.
    num_axes : int
        Expected number of dimensions of the returned array.
    data : mapping or None
        Already loaded content; its ``'Array'`` entry holds the values.

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    IOError
        If the result does not have ``num_axes`` dimensions, even after
        squeezing.
    """
    if data is None:
        _, data = load_ser_file(filename)
    recorded = data['Array']
    # If the acquisition stops before finishing the job, the stored file will
    # report the requested size even though no values are recorded. Therefore
    # if the shape of the retrieved array does not match that of the data
    # dimensions we must fill the rest with zeros or (better) nans if the
    # dtype is float
    if multiply(array_shape) == multiply(recorded.shape):
        dc = recorded
    else:
        dc = np.zeros(multiply(array_shape), dtype=recorded.dtype)
        # An identity (`is`) test between dtype objects is unreliable; the
        # dtype kind identifies every real floating point type.
        if dc.dtype.kind == 'f':
            dc[:] = np.nan
        values = recorded.ravel()
        dc[:values.shape[0]] = values

    dc = dc.reshape(array_shape)
    if record_by == 'image':
        dc = dc[..., ::-1, :]
    if num_axes != len(dc.shape):
        dc = dc.squeeze()
    if num_axes != len(dc.shape):
        raise IOError("Please report this issue to the HyperSpy developers.")
    return dc
コード例 #4
def get_signal_chunks(shape, dtype, signal_axes=None):
    """Calculate chunks for the signal, preferably at least one chunk per
    signal space.

    Parameters
    ----------
    shape : tuple
        The shape of the dataset to be stored / chunked.
    dtype : {dtype, string}
        The numpy dtype of the data.
    signal_axes : {None, iterable of ints}
        The axes defining "signal space" of the dataset. If None, the default
        h5py chunking is performed.

    Returns
    -------
    tuple
        The chunk length along each axis of ``shape`` (or the value returned
        by h5py's ``guess_chunk`` when ``signal_axes`` is None).
    """
    typesize = np.dtype(dtype).itemsize
    if signal_axes is None:
        return h5py._hl.filters.guess_chunk(shape, None, typesize)
    # Materialise once: the axes are iterated several times below.
    signal_axes = tuple(signal_axes)

    # largely based on the guess_chunk in h5py
    CHUNK_MAX = 1024 * 1024
    want_to_keep = multiply([shape[i] for i in signal_axes]) * typesize
    if want_to_keep >= CHUNK_MAX:
        # A single signal already reaches the limit: one signal per chunk.
        chunks = [1 for _ in shape]
        for i in signal_axes:
            chunks[i] = shape[i]
        return tuple(chunks)

    chunks = list(shape)
    idx = 0
    navigation_axes = tuple(i for i in range(len(shape))
                            if i not in signal_axes)
    nchange = len(navigation_axes)
    while True:
        chunk_bytes = multiply(chunks) * typesize

        if chunk_bytes < CHUNK_MAX:
            # The chunk is small enough.
            break

        if multiply([chunks[i] for i in navigation_axes]) == 1:
            # Nothing left to halve: one signal per chunk.
            break
        # Halve the navigation axes in turn (round-robin).
        change = navigation_axes[idx % nchange]
        chunks[change] = np.ceil(chunks[change] / 2.0)
        idx += 1
    return tuple(int(x) for x in chunks)
コード例 #5
ファイル: hspy.py プロジェクト: woozey/hyperspy
def get_signal_chunks(shape, dtype, signal_axes=None):
    """Calculate chunks for the signal, preferably at least one chunk per
    signal space.

    Parameters
    ----------
    shape : tuple
        The shape of the dataset to be stored / chunked.
    dtype : {dtype, string}
        The numpy dtype of the data.
    signal_axes : {None, iterable of ints}
        The axes defining "signal space" of the dataset. If None, the default
        h5py chunking is performed.

    Returns
    -------
    tuple
        The chunk length along each axis of ``shape`` (or the value returned
        by h5py's ``guess_chunk`` when ``signal_axes`` is None).
    """
    typesize = np.dtype(dtype).itemsize
    if signal_axes is None:
        return h5py._hl.filters.guess_chunk(shape, None, typesize)
    # Materialise once: the axes are iterated several times below.
    signal_axes = tuple(signal_axes)

    # largely based on the guess_chunk in h5py
    CHUNK_MAX = 1024 * 1024
    want_to_keep = multiply([shape[i] for i in signal_axes]) * typesize
    if want_to_keep >= CHUNK_MAX:
        # A single signal already reaches the limit: one signal per chunk.
        chunks = [1 for _ in shape]
        for i in signal_axes:
            chunks[i] = shape[i]
        return tuple(chunks)

    chunks = list(shape)
    idx = 0
    navigation_axes = tuple(i for i in range(len(shape))
                            if i not in signal_axes)
    nchange = len(navigation_axes)
    while True:
        chunk_bytes = multiply(chunks) * typesize

        if chunk_bytes < CHUNK_MAX:
            # The chunk is small enough.
            break

        if multiply([chunks[i] for i in navigation_axes]) == 1:
            # Nothing left to halve: one signal per chunk.
            break
        # Halve the navigation axes in turn (round-robin).
        change = navigation_axes[idx % nchange]
        chunks[change] = np.ceil(chunks[change] / 2.0)
        idx += 1
    return tuple(int(x) for x in chunks)
コード例 #6
ファイル: _hierarchical.py プロジェクト: jat255/hyperspy
def get_signal_chunks(shape, dtype, signal_axes=None, target_size=1e6):
    """
    Function that calculates chunks for the signal, preferably at least one
    chunk per signal space.

    Parameters
    ----------
    shape : tuple
        The shape of the dataset to be stored / chunked.
    dtype : {dtype, string}
        The numpy dtype of the data.
    signal_axes : {None, iterable of ints}
        The axes defining "signal space" of the dataset. If None, the default
        h5py chunking is performed.
    target_size : int
        The target number of bytes for one chunk

    Returns
    -------
    tuple
        The chunk length along each axis of ``shape`` (or the value returned
        by h5py's ``guess_chunk`` when ``signal_axes`` is None).
    """
    typesize = np.dtype(dtype).itemsize
    if signal_axes is None:
        return h5py._hl.filters.guess_chunk(shape, None, typesize)
    # Materialise once: the axes are iterated several times below.
    signal_axes = tuple(signal_axes)

    # largely based on the guess_chunk in h5py
    bytes_per_signal = multiply([shape[i] for i in signal_axes]) * typesize
    signals_per_chunk = int(np.floor_divide(target_size, bytes_per_signal))
    navigation_axes = tuple(i for i in range(len(shape))
                            if i not in signal_axes)
    num_nav_axes = len(navigation_axes)
    num_signals = np.prod([shape[i] for i in navigation_axes])
    if signals_per_chunk < 2 or num_nav_axes == 0:
        # signal is larger than chunk max
        chunks = [s if i in signal_axes else 1 for i, s in enumerate(shape)]
        return tuple(chunks)
    elif signals_per_chunk > num_signals:
        # The whole dataset fits in one chunk.
        return tuple(shape)
    else:
        # signal is smaller than chunk max
        # Index of axes with size smaller than required to make all chunks equal
        small_idx = []
        # Sizes of axes with size smaller than required to make all chunks equal
        small_sizes = []
        iterate = True
        while iterate:
            iterate = False
            # Calculate the size of the chunks of the axes not in `small_idx`
            # The process is iterative because `nav_axes_chunks` can be bigger
            # than some axes sizes. If that is the case, the value must be
            # recomputed at the next iteration after having added the "offending"
            # axes to `small_idx`
            nav_axes_chunks = int(np.floor(
                (signals_per_chunk / np.prod(small_sizes))
                ** (1 / (num_nav_axes - len(small_sizes)))))
            for index, size in enumerate(shape):
                if (index not in signal_axes and index not in small_idx
                        and size < nav_axes_chunks):
                    # This axis is kept whole; record it so that the loop
                    # makes progress and the remaining axes are recomputed.
                    small_idx.append(index)
                    small_sizes.append(size)
                    iterate = True
        chunks = [s if i in signal_axes or i in small_idx else nav_axes_chunks
                  for i, s in enumerate(shape)]
        return tuple(int(x) for x in chunks)
コード例 #7
def get_signal_chunks(shape, dtype, signal_axes=None, target_size=1e6):
    """
    Function that calculates chunks for the signal, preferably at least one
    chunk per signal space.

    Parameters
    ----------
    shape : tuple
        The shape of the dataset to be stored / chunked.
    dtype : {dtype, string}
        The numpy dtype of the data.
    signal_axes : {None, iterable of ints}
        The axes defining "signal space" of the dataset. If None, the default
        h5py chunking is performed.
    target_size : int
        The target number of bytes for one chunk

    Returns
    -------
    tuple
        The chunk length along each axis of ``shape`` (or the value returned
        by h5py's ``guess_chunk`` when ``signal_axes`` is None).
    """
    typesize = np.dtype(dtype).itemsize
    if signal_axes is None:
        return h5py._hl.filters.guess_chunk(shape, None, typesize)
    # Materialise once: the axes are iterated several times below.
    signal_axes = tuple(signal_axes)

    # largely based on the guess_chunk in h5py
    bytes_per_signal = multiply([shape[i] for i in signal_axes]) * typesize
    signals_per_chunk = np.floor_divide(target_size, bytes_per_signal)
    navigation_axes = tuple(i for i in range(len(shape))
                            if i not in signal_axes)
    num_nav_axes = len(navigation_axes)
    num_signals = np.prod([shape[i] for i in navigation_axes])
    if signals_per_chunk < 2 or num_nav_axes == 0:
        # signal is larger than chunk max
        chunks = [s if i in signal_axes else 1 for i, s in enumerate(shape)]
        return tuple(chunks)
    elif signals_per_chunk > num_signals:
        # The whole dataset fits in one chunk.
        return tuple(shape)
    else:
        # signal is smaller than chunk max
        sig_axes_chunk = np.floor(signals_per_chunk**(1 / num_nav_axes))
        # Signals left over after giving every navigation axis the same
        # chunk length; they are added to the first navigation axis.
        remainder = np.floor_divide(
            signals_per_chunk - (sig_axes_chunk**num_nav_axes), sig_axes_chunk)
        if remainder < 0:
            remainder = 0
        chunks = [
            s if i in signal_axes else sig_axes_chunk
            for i, s in enumerate(shape)
        ]
        chunks[navigation_axes[0]] = chunks[navigation_axes[0]] + remainder
        # A chunk can never be longer than the axis it belongs to.
        return tuple(min(int(x), int(s)) for x, s in zip(chunks, shape))
コード例 #8
ファイル: lazy.py プロジェクト: pc494/hyperspy
def _reshuffle_mixed_blocks(array, ndim, sshape, nav_chunks):
    """Reshuffles dask block-shuffled array

    The first axis of ``array`` is expected to hold the navigation positions
    block after block, the blocks being ordered as
    ``itertools.product(*nav_chunks)``.

    Parameters
    ----------
    array : np.ndarray
        the array to reshuffle
    ndim : int
        the number of navigation (shuffled) dimensions
    sshape : tuple of ints
        The shape appended to the shape of every block (the trailing,
        non-shuffled dimensions).
    nav_chunks : tuple of tuples of ints
        The chunk lengths along each navigation dimension.

    Returns
    -------
    np.ndarray
        ``array`` itself when there is a single block, otherwise the blocks
        assembled into an array whose leading dimensions are the navigation
        dimensions, followed by ``sshape``.
    """
    block_shapes = list(product(*nav_chunks))
    splits = np.cumsum([multiply(shape)
                        for shape in block_shapes][:-1]).tolist()
    if not splits:
        # A single block: nothing was shuffled.
        return array

    all_chunks = [
        ar.reshape(shape + sshape)
        for shape, ar in zip(block_shapes, np.split(array, splits))
    ]
    # Stitch neighbouring blocks together, from the last navigation axis to
    # the first. The blocks are kept in a list at every step, so an axis
    # made of a single chunk is handled like any other.
    for chunks, axis in zip(nav_chunks[::-1], range(ndim - 1, -1, -1)):
        step = len(chunks)
        all_chunks = [
            np.concatenate(all_chunks[i:i + step], axis=axis)
            for i in range(0, len(all_chunks), step)
        ]
    if len(all_chunks) == 1:
        return all_chunks[0]
    # Not every axis was merged (``ndim`` smaller than ``len(nav_chunks)``).
    return all_chunks
コード例 #9
ファイル: lazy.py プロジェクト: pc494/hyperspy
    # NOTE(review): this method appears to have lost lines when it was
    # extracted: the parameter list after ``self,`` is missing and several
    # statements below are visibly cut (see the NOTE(review) markers). The
    # code lines are kept exactly as found; only comments and the docstring
    # were touched.
    def decomposition(self,
        """Perform Incremental (Batch) decomposition on the data.

        The results are stored in ``self.learning_results``.

        Read more in the :ref:`User Guide <big_data.decomposition>`.

        Parameters
        ----------
        normalize_poissonian_noise : bool, default False
            If True, scale the signal to normalize Poissonian noise using
            the approach described in [KeenanKotula2004]_.
        algorithm : {'SVD', 'PCA', 'ORPCA', 'ORNMF'}, default 'SVD'
            The decomposition algorithm to use.
        output_dimension : int or None, default None
            Number of components to keep/calculate. If None, keep all
            (only valid for 'SVD' algorithm)
        get : dask scheduler
            the dask scheduler to use for computations;
            default `dask.threaded.get`
        num_chunks : int or None, default None
            the number of dask chunks to pass to the decomposition model.
            More chunks require more memory, but should run faster. Will be
            increased to contain at least ``output_dimension`` signals.
        navigation_mask : {BaseSignal, numpy array, dask array}
            The navigation locations marked as True are not used in the
            decomposition.
        signal_mask : {BaseSignal, numpy array, dask array}
            The signal locations marked as True are not used in the
            decomposition.
        reproject : bool, default True
            Reproject data on the learnt components (factors) after learning.
        print_info : bool, default True
            If True, print information about the decomposition being performed.
            In the case of sklearn.decomposition objects, this includes the
            values of all arguments of the chosen sklearn algorithm.
        **kwargs
            passed to the partial_fit/fit functions.

        References
        ----------
        .. [KeenanKotula2004] M. Keenan and P. Kotula, "Accounting for Poisson noise
            in the multivariate analysis of ToF-SIMS spectrum images", Surf.
            Interface Anal 36(3) (2004): 203-212.

        See Also
        --------
        * :py:meth:`~.learn.mva.MVA.decomposition` for non-lazy signals
        * :py:func:`dask.array.linalg.svd`
        * :py:class:`sklearn.decomposition.IncrementalPCA`
        * :py:class:`~.learn.rpca.ORPCA`
        * :py:class:`~.learn.ornmf.ORNMF`

        """
        if kwargs.get("bounds", False):
            # NOTE(review): the call these message strings belong to
            # (presumably a deprecation warning) is missing here.
                "The `bounds` keyword is deprecated and will be removed "
                "in v2.0. Since version > 1.3 this has no effect.",
            kwargs.pop("bounds", None)

        # Deprecate 'ONMF' for 'ORNMF'
        if algorithm == "ONMF":
            # NOTE(review): same as above, the enclosing call is missing.
                "The argument `algorithm='ONMF'` has been deprecated and will "
                "be removed in future. Please use `algorithm='ORNMF'` instead.",
            algorithm = "ORNMF"

        # Check algorithms requiring output_dimension
        algorithms_require_dimension = ["PCA", "ORPCA", "ORNMF"]
        if algorithm in algorithms_require_dimension and output_dimension is None:
            # NOTE(review): the ``format(`` argument and the closing
            # parentheses are missing.
            raise ValueError(
                "`output_dimension` must be specified for '{}'".format(

        explained_variance = None
        explained_variance_ratio = None

        # Split the dask chunks of the axes-aligned data into their
        # navigation part and their signal part.
        _al_data = self._data_aligned_with_axes
        nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
        sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

        num_chunks = 1 if num_chunks is None else num_chunks
        # Smallest number of navigation positions in one block, and the
        # total number of blocks.
        blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
        nblocks = multiply([len(c) for c in nav_chunks])

        if output_dimension and blocksize / output_dimension < num_chunks:
            num_chunks = np.ceil(blocksize / output_dimension)

        blocksize *= num_chunks

        # Initialize return_info and print_info
        # NOTE(review): ``to_return`` is not used in the visible code.
        to_return = None
        # NOTE(review): the list literal below is cut (first ``format(``
        # argument and closing bracket missing).
        to_print = [
            "Decomposition info:", "  normalize_poissonian_noise={}".format(
            "  algorithm={}".format(algorithm),
            "  output_dimension={}".format(output_dimension)

        # LEARN
        if algorithm == "PCA":
            if not import_sklearn.sklearn_installed:
                raise ImportError("algorithm='PCA' requires scikit-learn")

            # NOTE(review): the constructor arguments are missing.
            obj = import_sklearn.sklearn.decomposition.IncrementalPCA(
            method = partial(obj.partial_fit, **kwargs)
            # The loadings are always obtained by reprojection for 'PCA'.
            reproject = True
            to_print.extend(["scikit-learn estimator:", obj])

        elif algorithm == "ORPCA":
            from hyperspy.learn.rpca import ORPCA

            batch_size = kwargs.pop("batch_size", None)
            obj = ORPCA(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        elif algorithm == "ORNMF":
            from hyperspy.learn.ornmf import ORNMF

            batch_size = kwargs.pop("batch_size", None)
            obj = ORNMF(output_dimension, **kwargs)
            method = partial(obj.fit, batch_size=batch_size)

        elif algorithm != "SVD":
            raise ValueError("'algorithm' not recognised")

        # Kept so that ``self.data`` can be restored further down.
        original_data = self.data
        # NOTE(review): the indentation increases here without a visible
        # block opener; presumably a ``try:`` line is missing.
            _logger.info("Performing decomposition analysis")

            if normalize_poissonian_noise:
                _logger.info("Scaling the data to normalize Poissonian noise")

                data = self._data_aligned_with_axes
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                # NOTE(review): the first argument line of both
                # ``da.logical_not(`` calls is missing.
                nm = da.logical_not(
                             chunks=nav_chunks) if navigation_mask is None else
                    to_array(navigation_mask, chunks=nav_chunks))
                sm = da.logical_not(
                             chunks=sig_chunks) if signal_mask is None else
                    to_array(signal_mask, chunks=sig_chunks))
                ndim = self.axes_manager.navigation_dimension
                sdim = self.axes_manager.signal_dimension
                # NOTE(review): ``da.compute`` unpacks two results but only
                # one argument is visible, and the call is not closed.
                bH, aG = da.compute(
                    data.sum(axis=tuple(range(ndim, ndim + sdim))),
                # Masked-out locations get a neutral weight of 1.
                bH = da.where(sm, bH, 1)
                aG = da.where(nm, aG, 1)

                raG = da.sqrt(aG)
                rbH = da.sqrt(bH)

                # Outer product of the two weight arrays, broadcast over
                # the full data shape.
                coeff = raG[(..., ) +
                            (None, ) * rbH.ndim] * rbH[(None, ) * raG.ndim +
                                                       (..., )]
                # Avoid dividing by zero.
                coeff = da.where(coeff == 0, 1, coeff)
                data = data / coeff
                self.data = data

            # LEARN
            if algorithm == "SVD":
                reproject = False
                from dask.array.linalg import svd

                # NOTE(review): the indentation increases here without a
                # visible block opener; presumably a ``try:`` line is
                # missing.
                    self._unfolded4decomposition = self.unfold()
                    # TODO: implement masking
                    # NOTE(review): masks are documented as possibly being
                    # arrays; the truth value of a multi-element numpy array
                    # is ambiguous - confirm this test behaves as intended.
                    if navigation_mask or signal_mask:
                        raise NotImplementedError(
                            "Masking is not yet implemented for lazy SVD")

                    U, S, V = svd(self.data)

                    # NOTE(review): an ``else:`` appears to be missing
                    # between the next two assignments; as written the
                    # second one always wins.
                    if output_dimension is None:
                        min_shape = min(min(U.shape), min(V.shape))
                        min_shape = output_dimension

                    U = U[:, :min_shape]
                    S = S[:min_shape]
                    V = V[:min_shape]

                    factors = V.T
                    explained_variance = S**2 / self.data.shape[0]
                    loadings = U * S
                    # NOTE(review): the line after the ``if`` is a comparison
                    # (``is``), not an assignment, so it has no effect;
                    # ``=`` was presumably intended - confirm.
                    if self._unfolded4decomposition is True:
                        self._unfolded4decomposition is False
                # NOTE(review): the block below is heavily cut (no ``else:``
                # / ``try:``, the ``progressbar(`` call is unclosed, and
                # ``method`` is never called on the gathered data).
                this_data = []
                    for chunk in progressbar(
                        if len(this_data) == num_chunks:
                            thedata = np.concatenate(this_data, axis=0)
                            this_data = []
                    if len(this_data):
                        thedata = np.concatenate(this_data, axis=0)
                except KeyboardInterrupt:

            # Collect what the fitted estimator already provides.
            if algorithm == "PCA":
                explained_variance = obj.explained_variance_
                explained_variance_ratio = obj.explained_variance_ratio_
                factors = obj.components_.T

            elif algorithm == "ORPCA":
                factors, loadings = obj.finish()
                loadings = loadings.T

            elif algorithm == "ORNMF":
                factors, loadings = obj.finish()
                loadings = loadings.T

            # REPROJECT
            if reproject:
                # ``method`` projects one block; ``post`` assembles the
                # list of projected blocks into the loadings.
                if algorithm == "PCA":
                    method = obj.transform

                    def post(a):
                        return np.concatenate(a, axis=0)

                elif algorithm == "ORPCA":
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                elif algorithm == "ORNMF":
                    method = obj.project

                    def post(a):
                        return np.concatenate(a, axis=1).T

                # NOTE(review): the iterable given to ``map``, the ``try:``
                # line and the loop body filling ``H`` are missing.
                _map = map(
                    lambda thing: method(thing),
                H = []
                    for thing in progressbar(_map,
                except KeyboardInterrupt:
                loadings = post(H)

            if explained_variance is not None and explained_variance_ratio is None:
                # NOTE(review): the closing parentheses are missing.
                explained_variance_ratio = explained_variance / explained_variance.sum(

            # RESHUFFLE "blocked" LOADINGS
            ndim = self.axes_manager.navigation_dimension
            if algorithm != "SVD":  # Only needed for online algorithms
                # NOTE(review): the ``try:`` line, the end of the call and
                # the body of the ``except`` clause are missing.
                    loadings = _reshuffle_mixed_blocks(loadings, ndim,
                                                       (output_dimension, ),
                except ValueError:
                    # In case the projection step was not finished, it's left
                    # as scrambled
            # Restore the data that was replaced by its scaled version.
            self.data = original_data

        # Store the results.
        target = self.learning_results
        target.decomposition_algorithm = algorithm
        target.output_dimension = output_dimension
        if algorithm != "SVD":
            target._object = obj
        target.factors = factors
        target.loadings = loadings
        target.explained_variance = explained_variance
        target.explained_variance_ratio = explained_variance_ratio

        # Rescale the results if the noise was normalized
        # NOTE(review): the scaling above is applied for any truthy value,
        # while this test requires the value to be exactly ``True`` -
        # confirm that the mismatch is intended.
        if normalize_poissonian_noise is True:
            target.factors = target.factors * rbH.ravel()[:, np.newaxis]
            target.loadings = target.loadings * raG.ravel()[:, np.newaxis]

        # Print details about the decomposition we just performed
        if print_info:
            print("\n".join([str(pr) for pr in to_print]))
コード例 #10
ファイル: lazy.py プロジェクト: pc494/hyperspy
    def _get_dask_chunks(self, axis=None, dtype=None):
        """Returns dask chunks.

        Aims:
            - Have at least one signal (or specified axis) in a single chunk,
              or as many as fit in memory

        Parameters
        ----------
        axis : {int, string, None, axis, tuple}
            If axis is None (default), returns chunks for current data shape so
            that at least one signal is in the chunk. If an axis is specified,
            only that particular axis is guaranteed to be "not sliced".
        dtype : {string, np.dtype}
            The dtype of target chunks.

        Returns
        -------
        Tuple of tuples, dask chunks
        """
        dc = self.data
        dcshape = dc.shape
        # Bring the axes sizes in line with the current data shape.
        for _axis in self.axes_manager._axes:
            if _axis.index_in_array < len(dcshape):
                _axis.size = int(dcshape[_axis.index_in_array])

        if axis is not None:
            need_axes = self.axes_manager[axis]
            if not np.iterable(need_axes):
                need_axes = [need_axes]
        else:
            need_axes = self.axes_manager.signal_axes

        if dtype is None:
            dtype = dc.dtype
        elif not isinstance(dtype, np.dtype):
            dtype = np.dtype(dtype)
        # Size the chunks for the larger of the current and target item sizes.
        typesize = max(dtype.itemsize, dc.dtype.itemsize)
        want_to_keep = multiply([ax.size for ax in need_axes]) * typesize

        # @mrocklin recommends to have around 100MB chunks, so we do that:
        num_that_fit = int(100. * 2.**20 / want_to_keep)

        # want to have at least one "signal" per chunk
        if num_that_fit < 2:
            chunks = [tuple(1 for _ in range(i)) for i in dc.shape]
            for ax in need_axes:
                chunks[ax.index_in_array] = (dc.shape[ax.index_in_array],)
            return tuple(chunks)

        other_axes = [
            ax for ax in self.axes_manager._axes if ax not in need_axes
        ]
        sizes = [int(ax.size) for ax in other_axes]
        indices = [ax.index_in_array for ax in other_axes]

        # Halve the largest of the remaining axes until the number of
        # "signals" in one chunk fits.
        while multiply(sizes) > num_that_fit:
            i = int(np.argmax(sizes))
            sizes[i] = sizes[i] // 2

        chunks = []
        for i, length in enumerate(dc.shape):
            if i in indices:
                # Same lengths as ``np.array_split`` would produce, computed
                # without allocating an array of ``length`` elements.
                target = sizes[indices.index(i)]
                nsections = -(-int(length) // target)
                base, extra = divmod(int(length), nsections)
                chunks.append((base + 1,) * extra
                              + (base,) * (nsections - extra))
            else:
                chunks.append((length, ))
        return tuple(chunks)