def load_only_data(filename, array_shape, record_by, num_axes, data=None,
                   header=None, only_valid_data=False):
    if data is None:
        header, data = load_ser_file(filename)
    # If the acquisition stops before finishing the job, the stored file will
    # report the requested size even though no values are recorded. Therefore,
    # if the shape of the retrieved array does not match that of the data
    # dimensions, we must fill the rest with zeros or (better) nans if the
    # dtype is float.
    if multiply(array_shape) != multiply(data['Array'].shape):
        if int(header['NumberDimensions']) == 1 and only_valid_data:
            # No need to fill with zeros if `TotalNumberElements !=
            # ValidNumberElements` for series data.
            # The valid data is always `0:ValidNumberElements`.
            dc = data['Array'][0:header['ValidNumberElements'][0], ...]
            array_shape[0] = header['ValidNumberElements'][0]
        else:
            # Maps will need to be filled with zeros or nans.
            dc = np.zeros(multiply(array_shape), dtype=data['Array'].dtype)
            if dc.dtype in (np.dtype('f'), np.dtype('f8')):
                dc[:] = np.nan
            dc[:data['Array'].ravel().shape[0]] = data['Array'].ravel()
    else:
        dc = data['Array']

    dc = dc.reshape(array_shape)
    if record_by == 'image':
        dc = dc[..., ::-1, :]
    if num_axes != len(dc.shape):
        dc = dc.squeeze()
    if num_axes != len(dc.shape):
        raise IOError("Please report this issue to the HyperSpy developers.")
    return dc
def load_only_data(filename, array_shape, record_by, num_axes, data=None):
    if data is None:
        _, data = load_ser_file(filename)
    # If the acquisition stops before finishing the job, the stored file will
    # report the requested size even though no values are recorded. Therefore,
    # if the shape of the retrieved array does not match that of the data
    # dimensions, we must fill the rest with zeros or (better) nans if the
    # dtype is float.
    if multiply(array_shape) != multiply(data['Array'].shape):
        dc = np.zeros(multiply(array_shape), dtype=data['Array'].dtype)
        if dc.dtype in (np.dtype('f'), np.dtype('f8')):
            dc[:] = np.nan
        dc[:data['Array'].ravel().shape[0]] = data['Array'].ravel()
    else:
        dc = data['Array']

    dc = dc.reshape(array_shape)
    if record_by == 'image':
        dc = dc[..., ::-1, :]
    if num_axes != len(dc.shape):
        dc = dc.squeeze()
    if num_axes != len(dc.shape):
        raise IOError("Please report this issue to the HyperSpy developers.")
    return dc
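# A minimal sketch of the padding behaviour in the two variants above,
# assuming a float acquisition that was interrupted: the recorded values are
# kept, and the unrecorded remainder of the requested shape is filled with
# NaN. The `recorded` array stands in for `data['Array']`; `multiply` appears
# to be a product-over-a-sequence helper, so `np.prod` is used in its place.
import numpy as np

requested_shape = [4, 4]               # what the file header promises
recorded = np.arange(10, dtype='f8')   # only 10 of 16 values were written

dc = np.zeros(np.prod(requested_shape), dtype=recorded.dtype)
dc[:] = np.nan                         # float dtype, so pad with NaN
dc[:recorded.size] = recorded
dc = dc.reshape(requested_shape)       # the last 6 entries remain NaN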
def get_signal_chunks(shape, dtype, signal_axes=None):
    """Function that calculates chunks for the signal, preferably at least
    one chunk per signal space.

    Parameters
    ----------
    shape : tuple
        The shape of the dataset to be stored / chunked.
    dtype : {dtype, string}
        The numpy dtype of the data.
    signal_axes : {None, iterable of ints}
        The axes defining "signal space" of the dataset. If None, the default
        h5py chunking is performed.
    """
    typesize = np.dtype(dtype).itemsize
    if signal_axes is None:
        return h5py._hl.filters.guess_chunk(shape, None, typesize)

    # largely based on the guess_chunk in h5py
    CHUNK_MAX = 1024 * 1024
    want_to_keep = multiply([shape[i] for i in signal_axes]) * typesize
    if want_to_keep >= CHUNK_MAX:
        chunks = [1 for _ in shape]
        for i in signal_axes:
            chunks[i] = shape[i]
        return tuple(chunks)

    chunks = [i for i in shape]
    idx = 0
    navigation_axes = tuple(i for i in range(len(shape))
                            if i not in signal_axes)
    nchange = len(navigation_axes)
    while True:
        chunk_bytes = multiply(chunks) * typesize
        if chunk_bytes < CHUNK_MAX:
            break
        if multiply([chunks[i] for i in navigation_axes]) == 1:
            break
        change = navigation_axes[idx % nchange]
        chunks[change] = np.ceil(chunks[change] / 2.0)
        idx += 1
    return tuple(int(x) for x in chunks)
def get_signal_chunks(shape, dtype, signal_axes=None, target_size=1e6):
    """Function that calculates chunks for the signal, preferably at least
    one chunk per signal space.

    Parameters
    ----------
    shape : tuple
        The shape of the dataset to be stored / chunked.
    dtype : {dtype, string}
        The numpy dtype of the data.
    signal_axes : {None, iterable of ints}
        The axes defining "signal space" of the dataset. If None, the default
        h5py chunking is performed.
    target_size : int
        The target number of bytes for one chunk.
    """
    typesize = np.dtype(dtype).itemsize
    if signal_axes is None:
        return h5py._hl.filters.guess_chunk(shape, None, typesize)

    # largely based on the guess_chunk in h5py
    bytes_per_signal = multiply([shape[i] for i in signal_axes]) * typesize
    signals_per_chunk = int(np.floor_divide(target_size, bytes_per_signal))
    navigation_axes = tuple(i for i in range(len(shape))
                            if i not in signal_axes)
    num_nav_axes = len(navigation_axes)
    num_signals = np.prod([shape[i] for i in navigation_axes])
    if signals_per_chunk < 2 or num_nav_axes == 0:
        # signal is larger than chunk max
        chunks = [s if i in signal_axes else 1 for i, s in enumerate(shape)]
        return tuple(chunks)
    elif signals_per_chunk > num_signals:
        return shape
    else:
        # signal is smaller than chunk max
        # Index of axes with size smaller than required to make all chunks
        # equal
        small_idx = []
        # Sizes of axes with size smaller than required to make all chunks
        # equal
        small_sizes = []
        iterate = True
        while iterate:
            iterate = False
            # Calculate the size of the chunks of the axes not in `small_idx`.
            # The process is iterative because `nav_axes_chunks` can be bigger
            # than some axes sizes. If that is the case, the value must be
            # recomputed at the next iteration after having added the
            # "offending" axes to `small_idx`.
            nav_axes_chunks = int(np.floor(
                (signals_per_chunk / np.prod(small_sizes))
                ** (1 / (num_nav_axes - len(small_sizes)))
            ))
            for index, size in enumerate(shape):
                if (index not in (list(signal_axes) + small_idx)
                        and size < nav_axes_chunks):
                    small_idx.append(index)
                    small_sizes.append(size)
                    iterate = True

        chunks = [s if i in signal_axes or i in small_idx else nav_axes_chunks
                  for i, s in enumerate(shape)]
        return tuple(int(x) for x in chunks)
def get_signal_chunks(shape, dtype, signal_axes=None, target_size=1e6):
    """Function that calculates chunks for the signal, preferably at least
    one chunk per signal space.

    Parameters
    ----------
    shape : tuple
        The shape of the dataset to be stored / chunked.
    dtype : {dtype, string}
        The numpy dtype of the data.
    signal_axes : {None, iterable of ints}
        The axes defining "signal space" of the dataset. If None, the default
        h5py chunking is performed.
    target_size : int
        The target number of bytes for one chunk.
    """
    typesize = np.dtype(dtype).itemsize
    if signal_axes is None:
        return h5py._hl.filters.guess_chunk(shape, None, typesize)

    # largely based on the guess_chunk in h5py
    bytes_per_signal = multiply([shape[i] for i in signal_axes]) * typesize
    signals_per_chunk = np.floor_divide(target_size, bytes_per_signal)
    navigation_axes = tuple(i for i in range(len(shape))
                            if i not in signal_axes)
    num_nav_axes = len(navigation_axes)
    num_signals = np.prod([shape[i] for i in navigation_axes])
    if signals_per_chunk < 2 or num_nav_axes == 0:
        # signal is larger than chunk max
        chunks = [s if i in signal_axes else 1 for i, s in enumerate(shape)]
        return tuple(chunks)
    elif signals_per_chunk > num_signals:
        return shape
    else:
        # signal is smaller than chunk max
        sig_axes_chunk = np.floor(signals_per_chunk ** (1 / num_nav_axes))
        remainder = np.floor_divide(
            signals_per_chunk - (sig_axes_chunk ** num_nav_axes),
            sig_axes_chunk)
        if remainder < 0:
            remainder = 0
        chunks = [s if i in signal_axes else sig_axes_chunk
                  for i, s in enumerate(shape)]
        chunks[navigation_axes[0]] = chunks[navigation_axes[0]] + remainder
        return tuple(int(x) for x in chunks)
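# A usage sketch of the variant immediately above, with illustrative sizes.
# For a 20x20 map of 256x256 float32 images, one signal occupies
# 256 * 256 * 4 = 262144 bytes, so three signals fit within the default
# 1e6-byte target; the extra capacity is spent along the first navigation
# axis:
chunks = get_signal_chunks((20, 20, 256, 256), "float32", signal_axes=(2, 3))
# chunks == (3, 1, 256, 256), i.e. roughly 786 kB per chunk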
def _reshuffle_mixed_blocks(array, ndim, sshape, nav_chunks):
    """Reshuffles a dask block-shuffled array.

    Parameters
    ----------
    array : np.ndarray
        The array to reshuffle.
    ndim : int
        The number of navigation (shuffled) dimensions.
    sshape : tuple of ints
        The shape of the signal space.
    nav_chunks : tuple of tuples of ints
        The dask chunk sizes of each navigation axis.
    """
    splits = np.cumsum([multiply(ar)
                        for ar in product(*nav_chunks)][:-1]).tolist()
    if splits:
        all_chunks = [
            ar.reshape(shape + sshape)
            for shape, ar in zip(product(*nav_chunks),
                                 np.split(array, splits))
        ]

        def split_stack_list(what, step, axis):
            total = len(what)
            if total != step:
                return [np.concatenate(what[i:i + step], axis=axis)
                        for i in range(0, total, step)]
            else:
                return np.concatenate(what, axis=axis)

        for chunks, axis in zip(nav_chunks[::-1], range(ndim - 1, -1, -1)):
            step = len(chunks)
            all_chunks = split_stack_list(all_chunks, step, axis)
        return all_chunks
    else:
        return array
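# A round-trip sketch of the reshuffling above, with hypothetical toy sizes:
# navigation chunks are flattened in dask block order and stacked along
# axis 0, and `_reshuffle_mixed_blocks` restores the original navigation
# layout.
import numpy as np
from itertools import product

nav_chunks = ((2, 1), (2, 2))  # dask-style chunking of a 3x4 navigation space
sshape = (5,)                  # signal shape
ref = np.arange(3 * 4 * 5).reshape((3, 4) + sshape)

# Emulate the block-shuffled order in which chunks arrive.
starts = [np.cumsum((0,) + c) for c in nav_chunks]
blocks = []
for (i0, c0), (i1, c1) in product(*(enumerate(c) for c in nav_chunks)):
    block = ref[starts[0][i0]:starts[0][i0] + c0,
                starts[1][i1]:starts[1][i1] + c1]
    blocks.append(block.reshape((-1,) + sshape))
shuffled = np.concatenate(blocks, axis=0)          # shape (12, 5)

restored = _reshuffle_mixed_blocks(shuffled, 2, sshape, nav_chunks)
assert np.array_equal(restored, ref)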
def decomposition(self,
                  normalize_poissonian_noise=False,
                  algorithm="SVD",
                  output_dimension=None,
                  signal_mask=None,
                  navigation_mask=None,
                  get=threaded.get,
                  num_chunks=None,
                  reproject=True,
                  print_info=True,
                  **kwargs):
    """Perform Incremental (Batch) decomposition on the data.

    The results are stored in ``self.learning_results``.

    Read more in the :ref:`User Guide <big_data.decomposition>`.

    Parameters
    ----------
    normalize_poissonian_noise : bool, default False
        If True, scale the signal to normalize Poissonian noise using
        the approach described in [KeenanKotula2004]_.
    algorithm : {'SVD', 'PCA', 'ORPCA', 'ORNMF'}, default 'SVD'
        The decomposition algorithm to use.
    output_dimension : int or None, default None
        Number of components to keep/calculate. If None, keep all
        (only valid for the 'SVD' algorithm).
    get : dask scheduler
        The dask scheduler to use for computations;
        default `dask.threaded.get`.
    num_chunks : int or None, default None
        The number of dask chunks to pass to the decomposition model.
        More chunks require more memory, but should run faster. Will be
        increased to contain at least ``output_dimension`` signals.
    navigation_mask : {BaseSignal, numpy array, dask array}
        The navigation locations marked as True are not used in the
        decomposition.
    signal_mask : {BaseSignal, numpy array, dask array}
        The signal locations marked as True are not used in the
        decomposition.
    reproject : bool, default True
        Reproject data on the learnt components (factors) after learning.
    print_info : bool, default True
        If True, print information about the decomposition being performed.
        In the case of sklearn.decomposition objects, this includes the
        values of all arguments of the chosen sklearn algorithm.
    **kwargs
        Passed to the partial_fit/fit functions.

    References
    ----------
    .. [KeenanKotula2004] M. Keenan and P. Kotula, "Accounting for Poisson
       noise in the multivariate analysis of ToF-SIMS spectrum images",
       Surf. Interface Anal 36(3) (2004): 203-212.

    See Also
    --------
    * :py:meth:`~.learn.mva.MVA.decomposition` for non-lazy signals
    * :py:func:`dask.array.linalg.svd`
    * :py:class:`sklearn.decomposition.IncrementalPCA`
    * :py:class:`~.learn.rpca.ORPCA`
    * :py:class:`~.learn.ornmf.ORNMF`
    """
    if kwargs.get("bounds", False):
        warnings.warn(
            "The `bounds` keyword is deprecated and will be removed "
            "in v2.0. Since version > 1.3 this has no effect.",
            VisibleDeprecationWarning,
        )
        kwargs.pop("bounds", None)

    # Deprecate 'ONMF' for 'ORNMF'
    if algorithm == "ONMF":
        warnings.warn(
            "The argument `algorithm='ONMF'` has been deprecated and will "
            "be removed in future. Please use `algorithm='ORNMF'` instead.",
            VisibleDeprecationWarning,
        )
        algorithm = "ORNMF"

    # Check algorithms requiring output_dimension
    algorithms_require_dimension = ["PCA", "ORPCA", "ORNMF"]
    if algorithm in algorithms_require_dimension and output_dimension is None:
        raise ValueError(
            "`output_dimension` must be specified for '{}'".format(algorithm))

    explained_variance = None
    explained_variance_ratio = None

    _al_data = self._data_aligned_with_axes
    nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension]
    sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:]

    num_chunks = 1 if num_chunks is None else num_chunks
    blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)])
    nblocks = multiply([len(c) for c in nav_chunks])

    if output_dimension and blocksize / output_dimension < num_chunks:
        num_chunks = np.ceil(blocksize / output_dimension)

    blocksize *= num_chunks

    # Initialize return_info and print_info
    to_return = None
    to_print = [
        "Decomposition info:",
        "  normalize_poissonian_noise={}".format(normalize_poissonian_noise),
        "  algorithm={}".format(algorithm),
        "  output_dimension={}".format(output_dimension),
    ]

    # LEARN
    if algorithm == "PCA":
        if not import_sklearn.sklearn_installed:
            raise ImportError("algorithm='PCA' requires scikit-learn")
        obj = import_sklearn.sklearn.decomposition.IncrementalPCA(
            n_components=output_dimension)
        method = partial(obj.partial_fit, **kwargs)
        reproject = True
        to_print.extend(["scikit-learn estimator:", obj])
    elif algorithm == "ORPCA":
        from hyperspy.learn.rpca import ORPCA
        batch_size = kwargs.pop("batch_size", None)
        obj = ORPCA(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)
    elif algorithm == "ORNMF":
        from hyperspy.learn.ornmf import ORNMF
        batch_size = kwargs.pop("batch_size", None)
        obj = ORNMF(output_dimension, **kwargs)
        method = partial(obj.fit, batch_size=batch_size)
    elif algorithm != "SVD":
        raise ValueError("'algorithm' not recognised")

    original_data = self.data
    try:
        _logger.info("Performing decomposition analysis")

        if normalize_poissonian_noise:
            _logger.info("Scaling the data to normalize Poissonian noise")
            data = self._data_aligned_with_axes
            ndim = self.axes_manager.navigation_dimension
            sdim = self.axes_manager.signal_dimension
            nm = da.logical_not(
                da.zeros(self.axes_manager.navigation_shape[::-1],
                         chunks=nav_chunks)
                if navigation_mask is None
                else to_array(navigation_mask, chunks=nav_chunks))
            sm = da.logical_not(
                da.zeros(self.axes_manager.signal_shape[::-1],
                         chunks=sig_chunks)
                if signal_mask is None
                else to_array(signal_mask, chunks=sig_chunks))
            bH, aG = da.compute(
                data.sum(axis=tuple(range(ndim))),
                data.sum(axis=tuple(range(ndim, ndim + sdim))),
            )
            bH = da.where(sm, bH, 1)
            aG = da.where(nm, aG, 1)

            raG = da.sqrt(aG)
            rbH = da.sqrt(bH)

            coeff = (raG[(..., ) + (None, ) * rbH.ndim] *
                     rbH[(None, ) * raG.ndim + (..., )])
            coeff = coeff.map_blocks(np.nan_to_num)
            coeff = da.where(coeff == 0, 1, coeff)
            data = data / coeff
            self.data = data

        # LEARN
        if algorithm == "SVD":
            reproject = False
            from dask.array.linalg import svd

            try:
                self._unfolded4decomposition = self.unfold()
                # TODO: implement masking
                if navigation_mask or signal_mask:
                    raise NotImplementedError(
                        "Masking is not yet implemented for lazy SVD")

                U, S, V = svd(self.data)

                if output_dimension is None:
                    min_shape = min(min(U.shape), min(V.shape))
                else:
                    min_shape = output_dimension

                U = U[:, :min_shape]
                S = S[:min_shape]
                V = V[:min_shape]

                factors = V.T
                explained_variance = S ** 2 / self.data.shape[0]
                loadings = U * S
            finally:
                if self._unfolded4decomposition is True:
                    self.fold()
                    self._unfolded4decomposition = False
        else:
            this_data = []
            try:
                for chunk in progressbar(
                    self._block_iterator(
                        flat_signal=True,
                        get=get,
                        signal_mask=signal_mask,
                        navigation_mask=navigation_mask,
                    ),
                    total=nblocks,
                    leave=True,
                    desc="Learn",
                ):
                    this_data.append(chunk)
                    if len(this_data) == num_chunks:
                        thedata = np.concatenate(this_data, axis=0)
                        method(thedata)
                        this_data = []

                if len(this_data):
                    thedata = np.concatenate(this_data, axis=0)
                    method(thedata)
            except KeyboardInterrupt:
                pass

        # GET ALREADY CALCULATED RESULTS
        if algorithm == "PCA":
            explained_variance = obj.explained_variance_
            explained_variance_ratio = obj.explained_variance_ratio_
            factors = obj.components_.T
        elif algorithm == "ORPCA":
            factors, loadings = obj.finish()
            loadings = loadings.T
        elif algorithm == "ORNMF":
            factors, loadings = obj.finish()
            loadings = loadings.T

        # REPROJECT
        if reproject:
            if algorithm == "PCA":
                method = obj.transform

                def post(a):
                    return np.concatenate(a, axis=0)
            elif algorithm == "ORPCA":
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T
            elif algorithm == "ORNMF":
                method = obj.project

                def post(a):
                    return np.concatenate(a, axis=1).T

            _map = map(
                lambda thing: method(thing),
                self._block_iterator(
                    flat_signal=True,
                    get=get,
                    signal_mask=signal_mask,
                    navigation_mask=navigation_mask,
                ),
            )
            H = []
            try:
                for thing in progressbar(_map, total=nblocks, desc="Project"):
                    H.append(thing)
            except KeyboardInterrupt:
                pass
            loadings = post(H)

        if explained_variance is not None and \
                explained_variance_ratio is None:
            explained_variance_ratio = \
                explained_variance / explained_variance.sum()

        # RESHUFFLE "blocked" LOADINGS
        ndim = self.axes_manager.navigation_dimension
        if algorithm != "SVD":  # Only needed for online algorithms
            try:
                loadings = _reshuffle_mixed_blocks(
                    loadings, ndim, (output_dimension, ),
                    nav_chunks).reshape((-1, output_dimension))
            except ValueError:
                # In case the projection step was not finished, it's left
                # as scrambled
                pass
    finally:
        self.data = original_data

    target = self.learning_results
    target.decomposition_algorithm = algorithm
    target.output_dimension = output_dimension
    if algorithm != "SVD":
        target._object = obj
    target.factors = factors
    target.loadings = loadings
    target.explained_variance = explained_variance
    target.explained_variance_ratio = explained_variance_ratio

    # Rescale the results if the noise was normalized
    if normalize_poissonian_noise is True:
        target.factors = target.factors * rbH.ravel()[:, np.newaxis]
        target.loadings = target.loadings * raG.ravel()[:, np.newaxis]

    # Print details about the decomposition we just performed
    if print_info:
        print("\n".join([str(pr) for pr in to_print]))
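# A hedged usage sketch of the method above; the file name is hypothetical.
# Loading with `lazy=True` yields a lazy signal, whose `decomposition` is
# the method defined here.
import hyperspy.api as hs

s = hs.load("spectrum_image.hspy", lazy=True)   # hypothetical file
s.decomposition(algorithm="PCA", output_dimension=8,
                normalize_poissonian_noise=True)
factors = s.learning_results.factors
loadings = s.learning_results.loadings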
def _get_dask_chunks(self, axis=None, dtype=None):
    """Returns dask chunks.

    Aims:
        - Have at least one signal (or specified axis) in a single chunk,
          or as many as fit in memory.

    Parameters
    ----------
    axis : {int, string, None, axis, tuple}
        If axis is None (default), returns chunks for current data shape so
        that at least one signal is in the chunk. If an axis is specified,
        only that particular axis is guaranteed to be "not sliced".
    dtype : {string, np.dtype}
        The dtype of target chunks.

    Returns
    -------
    Tuple of tuples, dask chunks
    """
    dc = self.data
    dcshape = dc.shape
    for _axis in self.axes_manager._axes:
        if _axis.index_in_array < len(dcshape):
            _axis.size = int(dcshape[_axis.index_in_array])

    if axis is not None:
        need_axes = self.axes_manager[axis]
        if not np.iterable(need_axes):
            need_axes = [need_axes, ]
    else:
        need_axes = self.axes_manager.signal_axes

    if dtype is None:
        dtype = dc.dtype
    elif not isinstance(dtype, np.dtype):
        dtype = np.dtype(dtype)
    typesize = max(dtype.itemsize, dc.dtype.itemsize)
    want_to_keep = multiply([ax.size for ax in need_axes]) * typesize

    # @mrocklin recommends having around 100MB chunks, so we do that:
    num_that_fit = int(100. * 2. ** 20 / want_to_keep)

    # want to have at least one "signal" per chunk
    if num_that_fit < 2:
        chunks = [tuple(1 for _ in range(i)) for i in dc.shape]
        for ax in need_axes:
            chunks[ax.index_in_array] = dc.shape[ax.index_in_array],
        return tuple(chunks)

    sizes = [
        ax.size for ax in self.axes_manager._axes if ax not in need_axes
    ]
    indices = [
        ax.index_in_array for ax in self.axes_manager._axes
        if ax not in need_axes
    ]

    while True:
        if multiply(sizes) <= num_that_fit:
            break

        i = np.argmax(sizes)
        sizes[i] = np.floor(sizes[i] / 2)

    chunks = []
    ndim = len(dc.shape)
    for i in range(ndim):
        if i in indices:
            size = float(dc.shape[i])
            split_array = np.array_split(
                np.arange(size), np.ceil(size / sizes[indices.index(i)]))
            chunks.append(tuple(len(sp) for sp in split_array))
        else:
            chunks.append((dc.shape[i], ))
    return tuple(chunks)
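# A hedged sketch of how the chunking above might be applied when making an
# in-memory signal lazy. `as_lazy` and `rechunk` are real HyperSpy/dask
# methods; calling the private `_get_dask_chunks` directly is shown only for
# illustration, since lazy signals normally compute these chunks internally.
import hyperspy.api as hs
import numpy as np

s = hs.signals.Signal2D(np.random.random((20, 20, 256, 256))).as_lazy()
chunks = s._get_dask_chunks()   # each chunk holds >= 1 whole 256x256 image
s.data = s.data.rechunk(chunks)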