def put_fake_dataset(store, prefix, shape, chunk_overrides=None): """Write a fake dataset into the chunk store.""" data = { 'correlator_data': ramp(shape, dtype=np.float32) * (1 - 1j), 'flags': np.ones(shape, dtype=np.uint8), 'weights': ramp(shape, slope=255. / np.prod(shape), dtype=np.uint8), 'weights_channel': ramp(shape[:-1], dtype=np.float32) } if chunk_overrides is None: chunk_overrides = {} ddata = { k: to_dask_array(array, chunk_overrides.get(k)) for k, array in data.items() } chunk_info = { k: { 'prefix': prefix, 'chunks': darray.chunks, 'dtype': darray.dtype, 'shape': darray.shape } for k, darray in ddata.items() } push = [ store.put_dask_array(store.join(prefix, k), darray) for k, darray in ddata.items() ] da.compute(*push) return data, chunk_info
def range(cls, dataset, dimension): if dataset._binned and dimension in dataset.kdims: expanded = cls.irregular(dataset, dimension) column = cls.coords(dataset, dimension, expanded=expanded, edges=True) else: column = cls.values(dataset, dimension, expanded=False, flat=False) if column.dtype.kind == 'M': dmin, dmax = column.min(), column.max() if da and isinstance(column, da.Array): return da.compute(dmin, dmax) return dmin, dmax elif len(column) == 0: return np.NaN, np.NaN else: try: dmin, dmax = (np.nanmin(column), np.nanmax(column)) if da and isinstance(column, da.Array): return da.compute(dmin, dmax) return dmin, dmax except TypeError: column.sort() return column[0], column[-1]
def put_fake_dataset(store, prefix, shape, chunk_overrides=None, array_overrides=None, flags_only=False): """Write a fake dataset into the chunk store.""" if flags_only: data = {'flags': np.random.RandomState(1).randint(0, 7, shape, dtype=np.uint8)} else: data = {'correlator_data': ramp(shape, dtype=np.float32) * (1 - 1j), 'flags': np.random.RandomState(2).randint(0, 7, shape, dtype=np.uint8), 'weights': ramp(shape, slope=255. / np.prod(shape), dtype=np.uint8), 'weights_channel': ramp(shape[:-1], dtype=np.float32)} if array_overrides is not None: for name in data: if name in array_overrides: data[name] = array_overrides[name] if chunk_overrides is None: chunk_overrides = {} ddata = {k: to_dask_array(array, chunk_overrides.get(k)) for k, array in data.items()} chunk_info = {k: {'prefix': prefix, 'chunks': darray.chunks, 'dtype': np.lib.format.dtype_to_descr(darray.dtype), 'shape': darray.shape} for k, darray in ddata.items()} for k, darray in ddata.items(): store.create_array(store.join(prefix, k)) push = [store.put_dask_array(store.join(prefix, k), darray) for k, darray in ddata.items()] da.compute(*push) return data, chunk_info
def test_multireadwrite(ms, group_cols, index_cols): xds = xds_from_ms(ms, group_cols=group_cols, index_cols=index_cols) nds = [ds.copy() for ds in xds] writes = [xds_to_table(sds, ms, sds.data_vars.keys()) for sds in nds] da.compute(writes)
def test_get_angles_satpos_preference(self, forced_preference): """Test that 'actual' satellite position is used for generating sensor angles.""" from satpy.modifiers.angles import get_angles input_data1 = _get_angle_test_data() # add additional satellite position metadata input_data1.attrs["orbital_parameters"]["nadir_longitude"] = 9.0 input_data1.attrs["orbital_parameters"]["nadir_latitude"] = 0.01 input_data1.attrs["orbital_parameters"][ "satellite_actual_longitude"] = 9.5 input_data1.attrs["orbital_parameters"][ "satellite_actual_latitude"] = 0.005 input_data1.attrs["orbital_parameters"][ "satellite_actual_altitude"] = 12345679 input_data2 = input_data1.copy(deep=True) input_data2.attrs = deepcopy(input_data1.attrs) input_data2.attrs["orbital_parameters"]["nadir_longitude"] = 9.1 input_data2.attrs["orbital_parameters"]["nadir_latitude"] = 0.02 input_data2.attrs["orbital_parameters"][ "satellite_actual_longitude"] = 9.5 input_data2.attrs["orbital_parameters"][ "satellite_actual_latitude"] = 0.005 input_data2.attrs["orbital_parameters"][ "satellite_actual_altitude"] = 12345679 from pyorbital.orbital import get_observer_look with mock.patch("satpy.modifiers.angles.get_observer_look", wraps=get_observer_look) as gol, \ satpy.config.set(sensor_angles_position_preference=forced_preference): angles1 = get_angles(input_data1) da.compute(angles1) angles2 = get_angles(input_data2) da.compute(angles2) # get_observer_look should have been called once per array chunk assert gol.call_count == input_data1.data.blocks.size * 2 if forced_preference == "actual": exp_call = mock.call(9.5, 0.005, 12345.679, input_data1.attrs["start_time"], mock.ANY, mock.ANY, 0) all_same_calls = [exp_call] * gol.call_count gol.assert_has_calls(all_same_calls) # the dask arrays should have the same name to prove they are the same computation for angle_arr1, angle_arr2 in zip(angles1, angles2): assert angle_arr1.data.name == angle_arr2.data.name else: # nadir 1 gol.assert_any_call(9.0, 0.01, 12345.679, input_data1.attrs["start_time"], mock.ANY, mock.ANY, 0) # nadir 2 gol.assert_any_call(9.1, 0.02, 12345.679, input_data1.attrs["start_time"], mock.ANY, mock.ANY, 0)
def test_save_future(comm): cosmo = cosmology.Planck15 import tempfile import shutil tmpfile = tempfile.mkdtemp() data = numpy.ones(100, dtype=[('Position', ('f4', 3)), ('Velocity', ('f4', 3)), ('Mass', ('f4'))]) data['Mass'] = numpy.arange(len(data)) data['Position'] = numpy.arange(len(data) * 3).reshape( data['Position'].shape) data['Velocity'] = numpy.arange(len(data) * 3).reshape( data['Velocity'].shape) import dask.array as da source = ArrayCatalog(data, BoxSize=100, Nmesh=32, comm=comm) source['Rogue'] = da.ones((3, len(data)), chunks=(1, 1)).T # add a non-array attrs (saved as JSON) source.attrs['empty'] = None # save to a BigFile d = source.save(tmpfile, dataset='1', compute=False) # load as a BigFileCatalog; only attributes are saved source2 = BigFileCatalog(tmpfile, dataset='1', comm=comm) # check sources for k in source.attrs: assert_array_equal(source2.attrs[k], source.attrs[k]) da.compute(d) # reload as a BigFileCatalog, data is saved source2 = BigFileCatalog(tmpfile, dataset='1', comm=comm) # check the data def allconcat(data): return numpy.concatenate(comm.allgather(data), axis=0) assert_allclose(allconcat(source['Position']), allconcat(source2['Position'])) assert_allclose(allconcat(source['Velocity']), allconcat(source2['Velocity'])) assert_allclose(allconcat(source['Mass']), allconcat(source2['Mass']))
def _cache_results(self, res, zarr_format): os.makedirs(os.path.dirname(zarr_format), exist_ok=True) new_res = [] for idx, sub_res in enumerate(res): if not isinstance(sub_res, da.Array): raise ValueError("Zarr caching currently only supports dask " f"arrays. Got {type(sub_res)}") zarr_path = zarr_format.format(idx) # See https://github.com/dask/dask/issues/8380 with dask.config.set({"optimization.fuse.active": False}): new_sub_res = sub_res.to_zarr(zarr_path, compute=False) new_res.append(new_sub_res) # actually compute the storage to zarr da.compute(new_res)
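# A minimal, hypothetical sketch (not Satpy's cache code) of the pattern above:
# build delayed zarr stores with compute=False, then run them in a single
# da.compute pass. Assumes the `zarr` package is installed; the target paths
# are made up for illustration.
import dask
import dask.array as da

x = da.random.random((1000, 1000), chunks=(250, 250))
y = (x * 2).astype("float32")

# Fusing is disabled for the same reason noted above (dask issue #8380).
with dask.config.set({"optimization.fuse.active": False}):
    d1 = x.to_zarr("cache_x.zarr", compute=False)
    d2 = y.to_zarr("cache_y.zarr", compute=False)

# One compute call writes both stores and lets them share graph work.
da.compute(d1, d2)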
def _concat_zarrs_optimized(
    zarr_files: List[str],
    output: PathType,
    vars_to_rechunk: List[Hashable],
    vars_to_copy: List[Hashable],
) -> None:
    zarr_groups = [zarr.open_group(f) for f in zarr_files]

    first_zarr_group = zarr_groups[0]

    # create the top-level group
    zarr.open_group(str(output), mode="w")

    # copy variables that are to be rechunked
    # NOTE: this uses the _to_zarr function defined here, which is needed to avoid
    # race conditions between writing the array contents and its metadata
    # see https://github.com/pystatgen/sgkit/pull/486
    delayed = []  # do all the rechunking operations in one computation
    for var in vars_to_rechunk:
        dtype = None
        if var in {"variant_id", "variant_allele"}:
            max_len = _get_max_len(zarr_groups, f"max_length_{var}")
            dtype = f"S{max_len}"
        arr = concatenate_and_rechunk(
            [group[var] for group in zarr_groups], dtype=dtype
        )
        d = _to_zarr(  # type: ignore[no-untyped-call]
            arr,
            str(output),
            component=var,
            overwrite=True,
            compute=False,
            fill_value=None,
            attrs=first_zarr_group[var].attrs.asdict(),
        )
        delayed.append(d)
    da.compute(*delayed)

    # copy unchanged variables and top-level metadata
    with zarr.open_group(str(output)) as output_zarr:
        # copy variables that are not rechunked (e.g. sample_id)
        for var in vars_to_copy:
            output_zarr[var] = first_zarr_group[var]
            output_zarr[var].attrs.update(first_zarr_group[var].attrs)

        # copy top-level attributes
        output_zarr.attrs.update(first_zarr_group.attrs)
def _co_realise_lazy_arrays(arrays): """ Compute multiple lazy arrays and return a list of real values. All the arrays are computed together, so they can share results for common graph elements. Casts all results with `np.asanyarray`, and converts any MaskedConstants appearing into masked arrays, to ensure that all return values are writeable NumPy array objects. Any non-lazy arrays are passed through, as they are by `da.compute`. They undergo the same result standardisation. """ computed_arrays = da.compute(*arrays) results = [] for lazy_in, real_out in zip(arrays, computed_arrays): # Ensure we always have arrays. # Note : in some cases dask (and numpy) will return a scalar # numpy.int/numpy.float object rather than an ndarray. # Recorded in https://github.com/dask/dask/issues/2111. real_out = np.asanyarray(real_out) if isinstance(real_out, ma.core.MaskedConstant): # Convert any masked constants into NumPy masked arrays. # NOTE: in this case, also apply the original lazy-array dtype, as # masked constants *always* have dtype float64. real_out = ma.masked_array(real_out.data, mask=real_out.mask, dtype=lazy_in.dtype) results.append(real_out) return results
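# A small, standalone illustration (not Iris code) of why the arrays are
# realised together: both results depend on the shared intermediate `base`,
# so each chunk of `base` is evaluated only once for the pair.
import numpy as np
import dask.array as da

base = da.random.random((4, 4), chunks=(2, 2))
arrays = [base.sum(), (base + 1).mean()]

computed = da.compute(*arrays)            # one pass over `base` for both results
results = [np.asanyarray(r) for r in computed]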
def finite_range(column, cmin, cmax):
    try:
        min_inf = np.isinf(cmin)
    except TypeError:
        min_inf = False
    try:
        max_inf = np.isinf(cmax)
    except TypeError:
        max_inf = False
    if (min_inf or max_inf):
        column = column[np.isfinite(column)]
        if len(column):
            cmin = np.nanmin(column) if min_inf else cmin
            cmax = np.nanmax(column) if max_inf else cmax
            if is_dask(column):
                import dask.array as da
                if min_inf and max_inf:
                    cmin, cmax = da.compute(cmin, cmax)
                elif min_inf:
                    cmin = cmin.compute()
                else:
                    cmax = cmax.compute()
        else:
            return cmin, cmax
    if isinstance(cmin, np.ndarray) and cmin.shape == ():
        cmin = cmin[()]
    if isinstance(cmax, np.ndarray) and cmax.shape == ():
        cmax = cmax[()]
    cmin = cmin if np.isscalar(cmin) or isinstance(cmin, util.datetime_types) else cmin.item()
    cmax = cmax if np.isscalar(cmax) or isinstance(cmax, util.datetime_types) else cmax.item()
    return cmin, cmax
def test_get_bucket_indices(self): """Test calculation of array indices.""" # Ensure nothing is calculated with dask.config.set(scheduler=CustomScheduler(max_computes=0)): self.resampler._get_indices() x_idxs, y_idxs = da.compute(self.resampler.x_idxs, self.resampler.y_idxs) np.testing.assert_equal(x_idxs, np.array([1710, 1710, 1707, 1705])) np.testing.assert_equal(y_idxs, np.array([465, 465, 459, 455])) # Additional small test case adef = create_area_def(area_id='test', projection={'proj': 'latlong'}, width=2, height=2, center=(0, 0), resolution=10) lons = da.from_array(np.array( [-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, -10.1, 0]), chunks=2) lats = da.from_array(np.array( [-10.0, -9.9, -0.1, 0, 0.1, 9.9, 10.0, 0, 10.1]), chunks=2) resampler = bucket.BucketResampler(source_lats=lats, source_lons=lons, target_area=adef) resampler._get_indices() np.testing.assert_equal(resampler.x_idxs, np.array([-1, 0, 0, 1, 1, 1, -1, -1, -1])) np.testing.assert_equal(resampler.y_idxs, np.array([-1, 1, 1, 1, 0, 0, -1, -1, -1]))
def get_dataset_with_area_def(self, arr, dataset_id): """Get dataset with an AreaDefinition.""" if dataset_id['name'] in ['latitude', 'longitude']: self.__setattr__(dataset_id['name'], arr) xarr = xr.DataArray(arr, dims=["y"]) else: lons_1d, lats_1d, data_1d = da.compute(self.longitude, self.latitude, arr) self._area_def = self._construct_area_def(dataset_id) icol, irow = self._area_def.get_array_indices_from_lonlat( lons_1d, lats_1d) data_2d = np.empty(self._area_def.shape) data_2d[:] = np.nan data_2d[irow.compressed(), icol.compressed()] = data_1d[~irow.mask] xarr = xr.DataArray(da.from_array(data_2d, CHUNK_SIZE), dims=('y', 'x')) ntotal = len(icol) nvalid = len(icol.compressed()) if nvalid < ntotal: logging.warning( f'{ntotal-nvalid} out of {ntotal} data points could not be put on ' f'the grid {self._area_def.area_id}.') return xarr
def test_predict_proba(dataset, datatype, n_neighbors, n_parts, batch_size, client): X_train, X_test, y_train, y_test = dataset l_model = lKNNClf(n_neighbors=n_neighbors) l_model.fit(X_train, y_train) l_probas = l_model.predict_proba(X_test) X_train = generate_dask_array(X_train, n_parts) X_test = generate_dask_array(X_test, n_parts) y_train = generate_dask_array(y_train, n_parts) if datatype == 'dask_cudf': X_train = to_dask_cudf(X_train, client) X_test = to_dask_cudf(X_test, client) y_train = to_dask_cudf(y_train, client) d_model = dKNNClf(client=client, n_neighbors=n_neighbors) d_model.fit(X_train, y_train) d_probas = d_model.predict_proba(X_test, convert_dtype=True) d_probas = da.compute(d_probas)[0] if datatype == 'dask_cudf': d_probas = list(map(lambda o: o.as_matrix() if isinstance(o, DataFrame) else o.to_array()[..., np.newaxis], d_probas)) check_probabilities(l_probas, d_probas)
def process(input_path, pedestal_path, output_path): reader = TIOReader(input_path) wf_calib = WaveformCalibrator( pedestal_path, reader.n_pixels, reader.n_samples ) wfs = get_da(reader, wf_calib) mean, std, mean_pix, std_pix, (hist, edges) = da.compute( wfs.mean(), wfs.std(), wfs.mean(axis=(0, 2)), wfs.std(axis=(0, 2)), da.histogram(wfs, bins=1000, range=(-10, 10)) ) np.savez( output_path, mean=mean, std=std, mean_pix=mean_pix, std_pix=std_pix, hist=hist, edges=edges )
def _create_resample_kdtree(self): """Set up kd tree on input""" # Get input information valid_input_index, source_lons, source_lats = \ _get_valid_input_index_dask(self.source_geo_def, self.target_geo_def, self.reduce_data, self.radius_of_influence, nprocs=self.nprocs) # FIXME: Is dask smart enough to only compute the pixels we end up # using even with this complicated indexing input_coords = lonlat2xyz(source_lons, source_lats) valid_input_index = da.ravel(valid_input_index) input_coords = input_coords[valid_input_index, :] input_coords = input_coords.compute() # Build kd-tree on input input_coords = input_coords.astype(np.float) valid_input_index, input_coords = da.compute(valid_input_index, input_coords) if kd_tree_name == 'pykdtree': resample_kdtree = KDTree(input_coords) else: resample_kdtree = sp.cKDTree(input_coords) return valid_input_index, resample_kdtree
def _box_stats(self, vals): is_finite = isfinite is_dask = is_dask_array(vals) is_cupy = is_cupy_array(vals) if is_cupy: import cupy percentile = cupy.percentile is_finite = cupy.isfinite elif is_dask: import dask.array as da percentile = da.percentile else: percentile = np.percentile vals = vals[is_finite(vals)] if len(vals): q1, q2, q3 = (percentile(vals, q=q) for q in range(25, 100, 25)) iqr = q3 - q1 upper = vals[vals <= q3 + 1.5 * iqr].max() lower = vals[vals >= q1 - 1.5 * iqr].min() else: q1, q2, q3 = 0, 0, 0 upper, lower = 0, 0 outliers = vals[(vals > upper) | (vals < lower)] if is_cupy: return (q1.item(), q2.item(), q3.item(), upper.item(), lower.item(), cupy.asnumpy(outliers)) elif is_dask: return da.compute(q1, q2, q3, upper, lower, outliers) else: return q1, q2, q3, upper, lower, outliers
def black_scholes(nopt, price, strike, t, rate, vol, schd=None): mr = -rate sig_sig_two = vol * vol * 2 P = price S = strike T = t a = log(P / S) b = T * mr z = T * sig_sig_two c = 0.25 * z y = da.map_blocks(invsqrt, z) w1 = (a - b + c) * y w2 = (a - b - c) * y d1 = 0.5 + 0.5 * da.map_blocks(erf, w1) d2 = 0.5 + 0.5 * da.map_blocks(erf, w2) Se = exp(b) * S call = P * d1 - Se * d2 put = call - P + Se return da.compute(da.stack((put, call)), get=schd)
def test_no_shared_keys_with_different_depths(): da.random.seed(0) a = da.random.random((9, 9), chunks=(3, 3)) def check(x): assert x.shape == (3, 3) return x r = [a.map_overlap(lambda a: a + 1, dtype=a.dtype, depth={j: int(i == j) for j in range(a.ndim)}, boundary="none").map_blocks(check, dtype=a.dtype) for i in range(a.ndim)] assert set(r[0].dask) & set(r[1].dask) == set(a.dask) da.compute(*r, scheduler='single-threaded')
def find_start_end(data: Data, distributed: bool = False) -> (int, int): """ Find the shutter open and shutter close timestamps. Args: data: A LATRD data dictionary (a dictionary with data set names as keys and Dask arrays as values). Must contain one entry for cue id messages and one for cue timestamps. The two arrays are assumed to have the same length. distributed: Whether the computation uses the Dask distributed scheduler, in which case a progress bar will be displayed. Returns: The shutter open and shutter close timestamps, in clock cycles. """ start_time = first_cue_time(data, shutter_open) end_time = first_cue_time(data, shutter_close) # If we are using the distributed scheduler (for multiple images), show progress. if distributed: print("Finding detector shutter open and close times.") progress(start_time.persist(), end_time.persist()) start_time, end_time = da.compute(start_time, end_time) print() return start_time, end_time
def test_svd_supported_array_shapes(chunks, shape):
    x = np.random.random(shape)
    dx = da.from_array(x, chunks=chunks)

    def svd_flip(u, v):
        """Sign correction to ensure deterministic output from SVD.

        See:
        - https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/utils/extmath.py#L504
        - https://github.com/dask/dask/issues/6599
        """
        max_abs_cols = np.argmax(np.abs(u), axis=0)
        signs = np.sign(u[max_abs_cols, range(u.shape[1])])
        u *= signs
        v *= signs[:, np.newaxis]
        return u, v

    du, ds, dv = da.linalg.svd(dx)
    du, dv = da.compute(du, dv)

    # Workaround for no `full_matrices=False`
    # https://github.com/dask/dask/issues/3576
    k = min(du.shape + dv.shape)
    du, dv = du[:, :k], dv[:k, :]

    nu, ns, nv = np.linalg.svd(x)

    # Correct signs of both results before comparison
    du, dv = svd_flip(du, dv)
    nu, nv = svd_flip(nu, nv)

    assert_eq(du, nu)
    assert_eq(ds, ns)
    assert_eq(dv, nv)
def test_svd_compressed_deterministic(): m, n = 30, 25 x = da.random.RandomState(1234).random_sample(size=(m, n), chunks=(5, 5)) u, s, vt = svd_compressed(x, 3, seed=1234) u2, s2, vt2 = svd_compressed(x, 3, seed=1234) assert all(da.compute((u == u2).all(), (s == s2).all(), (vt == vt2).all()))
def test_data_with_area_definition(self, input_file):
    """Test data loaded with AreaDefinition."""
    bufr_obj = SeviriL2BufrData(input_file, with_adef=True)
    # We need to load the lat/lon data in order to populate the file handler
    # with these data
    _ = bufr_obj.get_data(DATASET_INFO_LAT)
    _ = bufr_obj.get_data(DATASET_INFO_LON)
    z = bufr_obj.get_data(DATASET_INFO)

    ad = bufr_obj.fh.get_area_def(None)
    assert ad == AREA_DEF
    data_1d = np.concatenate((DATA, DATA), axis=0)

    # Put BUFR data on the 2D grid that the 2D array returned by get_dataset
    # should correspond to
    lons_1d, lats_1d = da.compute(bufr_obj.fh.longitude, bufr_obj.fh.latitude)
    icol, irow = ad.get_array_indices_from_lonlat(lons_1d, lats_1d)

    data_2d = np.empty(ad.shape)
    data_2d[:] = np.nan
    data_2d[irow.compressed(), icol.compressed()] = data_1d[~irow.mask]
    np.testing.assert_array_equal(z.values, data_2d)

    # Test that the correct AreaDefinition is identified for products with
    # 3-pixel segments
    bufr_obj.fh.seg_size = 3
    ad_ext = bufr_obj.fh._construct_area_def(make_dataid(name='dummmy', resolution=9000))
    assert ad_ext == AREA_DEF_EXT
def compute(self, **kwargs): """ Calls dask compute on the dask arrays in this Dataset, returning a new Dataset. Returns ------- :class:`~daskms.dataset.Dataset` Dataset containing computed arrays. """ # Compute dask arrays separately dask_data = {} data_vars = {} # Split variables into dask and other data for k, v in self._data_vars.items(): if isinstance(v.data, da.Array): dask_data[k] = v else: data_vars[k] = v # Compute dask arrays if present and add them to data variables if len(dask_data) > 0: data_vars.update(da.compute(dask_data, **kwargs)[0]) return Dataset(data_vars, coords=self._coords, attrs=self._attrs.copy())
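# A generic sketch (independent of dask-ms) of the dask behaviour this method
# leans on: compute() traverses built-in containers, so passing a dict returns
# a one-element tuple holding a dict of concrete results.
import dask.array as da

lazy = {"a": da.arange(5, chunks=2), "b": da.ones(3, chunks=3)}
(concrete,) = da.compute(lazy)
# concrete == {"a": array([0, 1, 2, 3, 4]), "b": array([1., 1., 1.])}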
def materialize_as_ndarray(a): """Convert distributed arrays to ndarrays.""" if type(a) in (list, tuple): if da is not None and any(isinstance(arr, da.Array) for arr in a): return da.compute(*a, sync=True) return tuple(np.asarray(arr) for arr in a) return np.asarray(a)
def test_svd_compressed(): m, n = 2000, 250 r = 10 np.random.seed(4321) mat1 = np.random.randn(m, r) mat2 = np.random.randn(r, n) mat = mat1.dot(mat2) data = da.from_array(mat, chunks=(500, 50)) u, s, vt = svd_compressed(data, r, seed=4321, n_power_iter=2) u, s, vt = da.compute(u, s, vt) usvt = np.dot(u, np.dot(np.diag(s), vt)) tol = 0.2 assert_eq(np.linalg.norm(mat - usvt), np.linalg.norm(mat), rtol=tol, atol=tol) # average accuracy check u = u[:, :r] s = s[:r] vt = vt[:r, :] s_exact = np.linalg.svd(mat)[1] s_exact = s_exact[:r] assert_eq(np.eye(r, r), np.dot(u.T, u)) # u must be orthonormal assert_eq(np.eye(r, r), np.dot(vt, vt.T)) # v must be orthonormal assert_eq(s, s_exact) # s must contain the singular values
def _estimate_gaussian_parameters(signal, x1, x2, only_current): axis = signal.axes_manager.signal_axes[0] i1, i2 = axis.value_range_to_indices(x1, x2) X = axis.axis[i1:i2] if only_current is True: data = signal()[i1:i2] X_shape = (len(X),) i = 0 centre_shape = (1,) else: i = axis.index_in_array data_gi = [slice(None), ] * len(signal.data.shape) data_gi[axis.index_in_array] = slice(i1, i2) data = signal.data[tuple(data_gi)] X_shape = [1, ] * len(signal.data.shape) X_shape[axis.index_in_array] = data.shape[i] centre_shape = list(data.shape) centre_shape[i] = 1 centre = np.sum(X.reshape(X_shape) * data, i) / np.sum(data, i) sigma = np.sqrt(np.abs(np.sum((X.reshape(X_shape) - centre.reshape( centre_shape)) ** 2 * data, i) / np.sum(data, i))) height = data.max(i) if isinstance(data, da.Array): return da.compute(centre, height, sigma) else: return centre, height, sigma
def test_make_regression(n_samples, n_features, n_informative, n_targets, bias, effective_rank, tail_strength, noise, shuffle, coef, random_state, n_parts, cluster): c = Client(cluster) try: from cuml.dask.datasets import make_regression result = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_informative, n_targets=n_targets, bias=bias, effective_rank=effective_rank, noise=noise, shuffle=shuffle, coef=coef, random_state=random_state, n_parts=n_parts) if coef: out, values, coefs = result else: out, values = result assert out.shape == (n_samples, n_features), "out shape mismatch" if n_targets > 1: assert values.shape == (n_samples, n_targets), \ "values shape mismatch" else: assert values.shape == (n_samples, ), "values shape mismatch" assert len(out.chunks[0]) == n_parts assert len(out.chunks[1]) == 1 if coef: if n_targets > 1: assert coefs.shape == (n_features, n_targets), \ "coefs shape mismatch" assert len(coefs.chunks[1]) == 1 else: assert coefs.shape == (n_features, ), "coefs shape mismatch" assert len(coefs.chunks[0]) == 1 test1 = da.all(da.sum(coefs != 0.0, axis=0) == n_informative) std_test2 = da.std(values - (da.dot(out, coefs) + bias), axis=0) test1, std_test2 = da.compute(test1, std_test2) diff = cp.abs(1.0 - std_test2) test2 = cp.all(diff < 1.5 * 10**(-1.)) assert test1, \ "Unexpected number of informative features" assert test2, "Unexpectedly incongruent outputs" finally: c.close()
def pearson_1xn( x: da.Array, data: da.Array, value_range: Optional[Tuple[float, float]] = None, k: Optional[int] = None, ) -> Tuple[np.ndarray, np.ndarray]: """ Parameters ---------- x : da.Array data : da.Array value_range : Optional[Tuple[float, float]] = None k : Optional[int] = None """ _, ncols = data.shape corrs = [] for j in range(ncols): mask = ~(da.isnan(x) | da.isnan(data[:, j])) _, (corr, _) = da.corrcoef(np.array(x)[mask], np.array(data[:, j])[mask]) corrs.append(corr) (corrs, ) = da.compute(corrs) corrs = np.asarray(corrs) return corr_filter(corrs, value_range, k)
def _estimate_lorentzian_parameters(signal, x1, x2, only_current): axis = signal.axes_manager.signal_axes[0] i1, i2 = axis.value_range_to_indices(x1, x2) X = axis.axis[i1:i2] if only_current is True: data = signal()[i1:i2] i = 0 centre_shape = (1,) else: i = axis.index_in_array data_gi = [slice(None), ] * len(signal.data.shape) data_gi[axis.index_in_array] = slice(i1, i2) data = signal.data[tuple(data_gi)] centre_shape = list(data.shape) centre_shape[i] = 1 cdf = np.cumsum(data,i) cdfnorm = cdf/np.max(cdf, i).reshape(centre_shape) icentre = np.argmin(abs(0.5 - cdfnorm), i) igamma1 = np.argmin(abs(0.75 - cdfnorm), i) igamma2 = np.argmin(abs(0.25 - cdfnorm), i) if isinstance(data, da.Array): icentre, igamma1, igamma2 = da.compute(icentre, igamma1, igamma2) centre = X[icentre] gamma = (X[igamma1] - X[igamma2]) / 2 height = data.max(i) return centre, height, gamma
def kendall_tau_1xn( x: da.Array, data: da.Array, value_range: Optional[Tuple[float, float]] = None, k: Optional[int] = None, ) -> Tuple[np.ndarray, np.ndarray]: """ Parameters ---------- x : da.Array data : da.Array value_range : Optional[Tuple[float, float]] = None k : Optional[int] = None """ _, ncols = data.shape corrs = [] for j in range(ncols): mask = ~(da.isnan(x) | da.isnan(data[:, j])) corr = dask.delayed(lambda a, b: kendalltau(a, b)[0])( np.array(x)[mask], np.array(data[:, j])[mask]) corrs.append(corr) (corrs, ) = da.compute(corrs) corrs = np.asarray(corrs) return corr_filter(corrs, value_range, k)
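# A generic sketch (not dataprep code) of the pattern used by the two
# correlation helpers above: build a list of lazy scalars -- here via
# dask.delayed and scipy's kendalltau -- and evaluate them with a single
# da.compute call so all the work is scheduled together.
import dask
import dask.array as da
import numpy as np
from scipy.stats import kendalltau

rng = np.random.RandomState(0)
x = rng.rand(100)
cols = [rng.rand(100) for _ in range(3)]

lazy_taus = [dask.delayed(lambda a, b: kendalltau(a, b)[0])(x, c) for c in cols]
(taus,) = da.compute(lazy_taus)
taus = np.asarray(taus)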
def test_dask_svd_self_consistent(m, n): a = np.random.rand(m, n) d_a = da.from_array(a, chunks=(3, n), name='A') d_u, d_s, d_vt = da.linalg.svd(d_a) u, s, vt = da.compute(d_u, d_s, d_vt) for d_e, e in zip([d_u, d_s, d_vt], [u, s, vt]): assert d_e.shape == e.shape assert d_e.dtype == e.dtype
def _calculate_summary_statistics(self): data = self._lazy_data() _raveled = data.ravel() _mean, _std, _min, _q1, _q2, _q3, _max = da.compute( da.nanmean(data), da.nanstd(data), da.nanmin(data), da.percentile(_raveled, [25, ]), da.percentile(_raveled, [50, ]), da.percentile(_raveled, [75, ]), da.nanmax(data), ) return _mean, _std, _min, _q1, _q2, _q3, _max
def dasky_scotts_bin_width(data, return_bins=True): r"""Dask version of scotts_bin_width Parameters ---------- data : dask array the data return_bins : bool (optional) if True, then return the bin edges Returns ------- width : float optimal bin width using Scott's rule bins : ndarray bin edges: returned if `return_bins` is True Notes ----- The optimal bin width is: .. math:: \Delta_b = \frac{3.5\sigma}{n^{1/3}} where :math:`\sigma` is the standard deviation of the data, and :math:`n` is the number of data points. See Also -------- knuth_bin_width, freedman_bin_width, astroML.plotting.hist """ if not isinstance(data, da.Array): raise TypeError('data has to be a dask array') if data.ndim != 1: data = data.flatten() n = data.size sigma = da.nanstd(data) dx = 3.5 * sigma * 1. / (n ** (1. / 3)) c_dx, mx, mn = da.compute(dx, data.max(), data.min()) if return_bins: Nbins = np.ceil((mx - mn) * 1. / c_dx) Nbins = max(1, Nbins) bins = mn + c_dx * np.arange(Nbins + 1) return c_dx, bins else: return c_dx
def dasky_freedman_bin_width(data, return_bins=True):
    r"""Dask version of freedman_bin_width

    Parameters
    ----------
    data : dask array
        the data
    return_bins : bool (optional)
        if True, then return the bin edges

    Returns
    -------
    width : float
        optimal bin width using the Freedman-Diaconis rule
    bins : ndarray
        bin edges: returned if `return_bins` is True

    Notes
    -----
    The optimal bin width is

    .. math::
        \Delta_b = \frac{2(q_{75} - q_{25})}{n^{1/3}}

    where :math:`q_{N}` is the :math:`N` percent quartile of the data, and
    :math:`n` is the number of data points.

    See Also
    --------
    knuth_bin_width, scotts_bin_width, astroML.plotting.hist
    """
    if not isinstance(data, da.Array):
        raise TypeError('data has to be a dask array')
    if data.ndim != 1:
        data = data.flatten()
    n = data.size

    v25, v75 = da.percentile(data, [25, 75])
    dx = 2 * (v75 - v25) * 1. / (n ** (1. / 3))
    c_dx, mx, mn = da.compute(dx, data.max(), data.min())

    if return_bins:
        Nbins = np.ceil((mx - mn) * 1. / c_dx)
        Nbins = max(1, Nbins)
        bins = mn + c_dx * np.arange(Nbins + 1)
        return c_dx, bins
    else:
        return c_dx
def test_svd_compressed(): m, n = 300, 250 r = 10 np.random.seed(4321) mat1 = np.random.randn(m, r) mat2 = np.random.randn(r, n) mat = mat1.dot(mat2) data = da.from_array(mat, chunks=(50, 50)) n_iter = 6 for i in range(n_iter): u, s, vt = svd_compressed(data, r, seed=4321) u, s, vt = da.compute(u, s, vt) if i == 0: usvt = np.dot(u, np.dot(np.diag(s), vt)) else: usvt += np.dot(u, np.dot(np.diag(s), vt)) usvt /= n_iter tol = 2e-1 assert np.allclose(np.linalg.norm(mat - usvt), np.linalg.norm(mat), rtol=tol, atol=tol) # average accuracy check u, s, vt = svd_compressed(data, r, seed=4321) u, s, vt = da.compute(u, s, vt) u = u[:, :r] s = s[:r] vt = vt[:r, :] s_exact = np.linalg.svd(mat)[1] s_exact = s_exact[:r] assert np.allclose(np.eye(r, r), np.dot(u.T, u)) # u must be orthonormal assert np.allclose(np.eye(r, r), np.dot(vt, vt.T)) # v must be orthonormal assert np.allclose(s, s_exact) # s must contain the singular values
def _calculate_summary_statistics(self, rechunk=True):
    if rechunk is True:
        # Use dask auto rechunk instead of HyperSpy's one, which should be
        # better for these operations
        rechunk = "dask_auto"
    data = self._lazy_data(rechunk=rechunk)
    _raveled = data.ravel()
    _mean, _std, _min, _q1, _q2, _q3, _max = da.compute(
        da.nanmean(data),
        da.nanstd(data),
        da.nanmin(data),
        da.percentile(_raveled, [25, ]),
        da.percentile(_raveled, [50, ]),
        da.percentile(_raveled, [75, ]),
        da.nanmax(data),
    )
    return _mean, _std, _min, _q1, _q2, _q3, _max
def dasky_histogram(a, bins=10, **kwargs):
    """Enhanced histogram for dask arrays.

    The range keyword is ignored. Reads the data at most two times - once to
    determine the best bins (if required), and a second time to actually
    calculate the histogram.

    Parameters
    ----------
    a : array_like
        array of data to be histogrammed
    bins : int or list or str (optional)
        If bins is a string, then it must be one of:
        'scotts' : use Scott's rule to determine bins
        'freedman' : use the Freedman-Diaconis rule to determine bins
    other keyword arguments are described in numpy.histogram().

    Returns
    -------
    hist : array
        The values of the histogram. See `normed` and `weights` for a
        description of the possible semantics.
    bin_edges : array of dtype float
        Return the bin edges ``(length(hist)+1)``.

    See Also
    --------
    numpy.histogram
    astroML.plotting.hist
    """
    if not isinstance(a, da.Array):
        raise TypeError('the given array has to be a dask.Array')
    if a.ndim != 1:
        a = a.flatten()

    if bins == 'scotts':
        _, bins = dasky_scotts_bin_width(a, True)
    elif bins == 'freedman':
        _, bins = dasky_freedman_bin_width(a, True)
    elif isinstance(bins, str):
        raise ValueError("unrecognized bin code: '%s'" % bins)
    elif not np.iterable(bins):
        with ProgressBar():
            kwargs['range'] = da.compute(a.min(), a.max())

    h, bins = da.histogram(a, bins=bins, **kwargs)
    with ProgressBar():
        return h.compute(), bins
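# A hypothetical usage sketch for the dasky_* helpers above, assuming they are
# defined in the same module together with `ProgressBar` from dask.diagnostics.
import dask.array as da

samples = da.random.normal(size=1_000_000, chunks=100_000)

counts, edges = dasky_histogram(samples, bins='freedman')
width, bins = dasky_freedman_bin_width(samples, return_bins=True)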
def _create_resample_kdtree(self): """Set up kd tree on input""" # Get input information valid_input_index, source_lons, source_lats = \ _get_valid_input_index_dask(self.source_geo_def, self.target_geo_def, self.reduce_data, self.radius_of_influence, nprocs=self.nprocs) # FIXME: Is dask smart enough to only compute the pixels we end up # using even with this complicated indexing input_coords = lonlat2xyz(source_lons, source_lats) valid_input_index = da.ravel(valid_input_index) input_coords = input_coords[valid_input_index, :] input_coords = input_coords.compute() # Build kd-tree on input input_coords = input_coords.astype(np.float) valid_input_index, input_coords = da.compute(valid_input_index, input_coords) return valid_input_index, KDTree(input_coords)
def compute(self, progressbar=True, close_file=False):
    """Attempt to store the full signal in memory.

    close_file: bool
        If True, attempt to close the file associated with the dask
        array data if any. Note that closing the file will make all other
        associated lazy signals inoperative.

    """
    if progressbar:
        cm = ProgressBar
    else:
        cm = dummy_context_manager
    with cm():
        # Here `da` is the signal's dask array (shadowing the module alias).
        da = self.data
        data = da.compute()
        if close_file:
            self.close_file()
        self.data = data
    self._lazy = False
    self._assign_subclass()
def _estimate_gaussian_parameters(signal, x1, x2, only_current): axis = signal.axes_manager.signal_axes[0] i1, i2 = axis.value_range_to_indices(x1, x2) X = axis.axis[i1:i2] if only_current is True: data = signal()[i1:i2] X_shape = (len(X),) i = 0 centre_shape = (1,) else: i = axis.index_in_array data_gi = [slice(None), ] * len(signal.data.shape) data_gi[axis.index_in_array] = slice(i1, i2) data = signal.data[tuple(data_gi)] X_shape = [1, ] * len(signal.data.shape) X_shape[axis.index_in_array] = data.shape[i] centre_shape = list(data.shape) centre_shape[i] = 1 if isinstance(data, da.Array): _sum = da.sum _sqrt = da.sqrt _abs = da.numpy_compat.builtins.abs else: _sum = np.sum _sqrt = np.sqrt _abs = np.abs centre = _sum(X.reshape(X_shape) * data, i) / _sum(data, i) sigma = _sqrt(_abs(_sum((X.reshape(X_shape) - centre.reshape( centre_shape)) ** 2 * data, i) / _sum(data, i))) height = data.max(i) if isinstance(data, da.Array): return da.compute(centre, height, sigma) else: return centre, height, sigma
def decomposition(self, output_dimension, normalize_poissonian_noise=False, algorithm='PCA', signal_mask=None, navigation_mask=None, get=threaded.get, num_chunks=None, reproject=True, bounds=True, **kwargs): """Perform Incremental (Batch) decomposition on the data, keeping n significant components. Parameters ---------- output_dimension : int the number of significant components to keep normalize_poissonian_noise : bool If True, scale the SI to normalize Poissonian noise algorithm : str One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA from scikit-learn is run. get : dask scheduler the dask scheduler to use for computations; default `dask.threaded.get` num_chunks : int the number of dask chunks to pass to the decomposition model. More chunks require more memory, but should run faster. Will be increased to contain atleast output_dimension signals. navigation_mask : {BaseSignal, numpy array, dask array} The navigation locations marked as True are not used in the decompostion. signal_mask : {BaseSignal, numpy array, dask array} The signal locations marked as True are not used in the decomposition. reproject : bool Reproject data on the learnt components (factors) after learning. bounds : {tuple, bool} The (min, max) values of the data to normalize before learning. If tuple (min, max), those values will be used for normalization. If True, extremes will be looked up (expensive), default. If False, no normalization is done (learning may be very slow). If normalize_poissonian_noise is True, this cannot be True. **kwargs passed to the partial_fit/fit functions. Notes ----- Various algorithm parameters and their default values: ONMF: lambda1=1, kappa=1, robust=False, store_r=False batch_size=None ORPCA: fast=True, lambda1=None, lambda2=None, method=None, learning_rate=None, init=None, training_samples=None, momentum=None PCA: batch_size=None, copy=True, white=False """ explained_variance = None explained_variance_ratio = None _al_data = self._data_aligned_with_axes nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension] sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:] num_chunks = 1 if num_chunks is None else num_chunks blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)]) nblocks = multiply([len(c) for c in nav_chunks]) if blocksize / output_dimension < num_chunks: num_chunks = np.ceil(blocksize / output_dimension) blocksize *= num_chunks ## LEARN if algorithm == 'PCA': from sklearn.decomposition import IncrementalPCA obj = IncrementalPCA(n_components=output_dimension) method = partial(obj.partial_fit, **kwargs) reproject = True elif algorithm == 'ORPCA': from hyperspy.learn.rpca import ORPCA kwg = {'fast': True} kwg.update(kwargs) obj = ORPCA(output_dimension, **kwg) method = partial(obj.fit, iterating=True) elif algorithm == 'ONMF': from hyperspy.learn.onmf import ONMF batch_size = kwargs.pop('batch_size', None) obj = ONMF(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) else: raise ValueError('algorithm not known') original_data = self.data try: if normalize_poissonian_noise: if bounds is True: bounds = False # warnings.warn? 
data = self._data_aligned_with_axes ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension nm = da.logical_not( da.zeros( self.axes_manager.navigation_shape[::-1], chunks=nav_chunks) if navigation_mask is None else to_array( navigation_mask, chunks=nav_chunks)) sm = da.logical_not( da.zeros( self.axes_manager.signal_shape[::-1], chunks=sig_chunks) if signal_mask is None else to_array( signal_mask, chunks=sig_chunks)) ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension bH, aG = da.compute( data.sum(axis=range(ndim)), data.sum(axis=range(ndim, ndim + sdim))) bH = da.where(sm, bH, 1) aG = da.where(nm, aG, 1) raG = da.sqrt(aG) rbH = da.sqrt(bH) coeff = raG[(..., ) + (None, )*rbH.ndim] *\ rbH[(None, )*raG.ndim + (...,)] coeff.map_blocks(np.nan_to_num) coeff = da.where(coeff == 0, 1, coeff) data = data / coeff self.data = data # normalize the data for learning algs: if bounds: if bounds is True: _min, _max = da.compute(self.data.min(), self.data.max()) else: _min, _max = bounds self.data = (self.data - _min) / (_max - _min) # LEARN this_data = [] try: for chunk in progressbar( self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask), total=nblocks, leave=True, desc='Learn'): this_data.append(chunk) if len(this_data) == num_chunks: thedata = np.concatenate(this_data, axis=0) method(thedata) this_data = [] if len(this_data): thedata = np.concatenate(this_data, axis=0) method(thedata) except KeyboardInterrupt: pass # GET ALREADY CALCULATED RESULTS if algorithm == 'PCA': explained_variance = obj.explained_variance_ explained_variance_ratio = obj.explained_variance_ratio_ factors = obj.components_.T elif algorithm == 'ORPCA': _, _, U, S, V = obj.finish() factors = U * S loadings = V explained_variance = S**2 / len(factors) elif algorithm == 'ONMF': factors, loadings = obj.finish() loadings = loadings.T # REPROJECT if reproject: if algorithm == 'PCA': method = obj.transform post = lambda a: np.concatenate(a, axis=0) elif algorithm == 'ORPCA': method = obj.project obj.R = [] post = lambda a: obj.finish()[4] elif algorithm == 'ONMF': method = obj.project post = lambda a: np.concatenate(a, axis=1).T _map = map(lambda thing: method(thing), self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask)) H = [] try: for thing in progressbar( _map, total=nblocks, desc='Project'): H.append(thing) except KeyboardInterrupt: pass loadings = post(H) if explained_variance is not None and \ explained_variance_ratio is None: explained_variance_ratio = \ explained_variance / explained_variance.sum() # RESHUFFLE "blocked" LOADINGS ndim = self.axes_manager.navigation_dimension try: loadings = _reshuffle_mixed_blocks( loadings, ndim, (output_dimension,), nav_chunks).reshape((-1, output_dimension)) except ValueError: # In case the projection step was not finished, it's left # as scrambled pass finally: self.data = original_data target = self.learning_results target.decomposition_algorithm = algorithm target.output_dimension = output_dimension target._object = obj target.factors = factors target.loadings = loadings target.explained_variance = explained_variance target.explained_variance_ratio = explained_variance_ratio
def estimate_image_shift(ref, image, roi=None, sobel=True, medfilter=True, hanning=True, plot=False, dtype='float', normalize_corr=False, return_maxval=True): """Estimate the shift in a image using phase correlation This method can only estimate the shift by comparing bidimensional features that should not change the position in the given axis. To decrease the memory usage, the time of computation and the accuracy of the results it is convenient to select a region of interest by setting the roi keyword. Parameters ---------- roi : tuple of ints (top, bottom, left, right) Define the region of interest sobel : bool apply a sobel filter for edge enhancement medfilter : bool apply a median filter for noise reduction hanning : bool Apply a 2d hanning filter plot : bool | matplotlib.Figure If True, plots the images after applying the filters and the phase correlation. If a figure instance, the images will be plotted to the given figure. reference : \'current\' | \'cascade\' If \'current\' (default) the image at the current coordinates is taken as reference. If \'cascade\' each image is aligned with the previous one. dtype : str or dtype Typecode or data-type in which the calculations must be performed. normalize_corr : bool If True use phase correlation instead of standard correlation Returns ------- shifts: np.array containing the estimate shifts max_value : float The maximum value of the correlation """ ref, image = da.compute(ref, image) # Make a copy of the images to avoid modifying them ref = ref.copy().astype(dtype) image = image.copy().astype(dtype) if roi is not None: top, bottom, left, right = roi else: top, bottom, left, right = [None, ] * 4 # Select region of interest ref = ref[top:bottom, left:right] image = image[top:bottom, left:right] # Apply filters for im in (ref, image): if hanning is True: im *= hanning2d(*im.shape) if medfilter is True: im[:] = sp.signal.medfilt(im) if sobel is True: im[:] = sobel_filter(im) phase_correlation = fft_correlation(ref, image, normalize=normalize_corr) # Estimate the shift by getting the coordinates of the maximum argmax = np.unravel_index(np.argmax(phase_correlation), phase_correlation.shape) threshold = (phase_correlation.shape[0] / 2 - 1, phase_correlation.shape[1] / 2 - 1) shift0 = argmax[0] if argmax[0] < threshold[0] else \ argmax[0] - phase_correlation.shape[0] shift1 = argmax[1] if argmax[1] < threshold[1] else \ argmax[1] - phase_correlation.shape[1] max_val = phase_correlation.max() # Plot on demand if plot is True or isinstance(plot, plt.Figure): if isinstance(plot, plt.Figure): f = plot axarr = plot.axes if len(axarr) < 3: for i in range(3): f.add_subplot(1, 3, i) axarr = plot.axes else: f, axarr = plt.subplots(1, 3) full_plot = len(axarr[0].images) == 0 if full_plot: axarr[0].set_title('Reference') axarr[1].set_title('Image') axarr[2].set_title('Phase correlation') axarr[0].imshow(ref) axarr[1].imshow(image) d = (np.array(phase_correlation.shape) - 1) // 2 extent = [-d[1], d[1], -d[0], d[0]] axarr[2].imshow(np.fft.fftshift(phase_correlation), extent=extent) plt.show() else: axarr[0].images[0].set_data(ref) axarr[1].images[0].set_data(image) axarr[2].images[0].set_data(np.fft.fftshift(phase_correlation)) # TODO: Renormalize images f.canvas.draw() # Liberate the memory. It is specially necessary if it is a # memory map del ref del image if return_maxval: return -np.array((shift0, shift1)), max_val else: return -np.array((shift0, shift1))
def get_reflectance(self, sun_zenith, sat_zenith, azidiff, bandname, redband=None): """Get the reflectance from the three sun-sat angles""" # Get wavelength in nm for band: if isinstance(bandname, float): LOG.warning('A wavelength is provided instead of band name - ' + 'disregard the relative spectral responses and assume ' + 'it is the effective wavelength: %f (micro meter)', bandname) wvl = bandname * 1000.0 else: wvl = self.get_effective_wavelength(bandname) wvl = wvl * 1000.0 rayl, wvl_coord, azid_coord, satz_sec_coord, sunz_sec_coord = self.get_reflectance_lut() # force dask arrays compute = False if HAVE_DASK and not isinstance(sun_zenith, Array): compute = True sun_zenith = from_array(sun_zenith, chunks=sun_zenith.shape) sat_zenith = from_array(sat_zenith, chunks=sat_zenith.shape) azidiff = from_array(azidiff, chunks=azidiff.shape) if redband is not None: redband = from_array(redband, chunks=redband.shape) clip_angle = rad2deg(arccos(1. / sunz_sec_coord.max())) sun_zenith = clip(sun_zenith, 0, clip_angle) sunzsec = 1. / cos(deg2rad(sun_zenith)) clip_angle = rad2deg(arccos(1. / satz_sec_coord.max())) sat_zenith = clip(sat_zenith, 0, clip_angle) satzsec = 1. / cos(deg2rad(sat_zenith)) shape = sun_zenith.shape if not(wvl_coord.min() < wvl < wvl_coord.max()): LOG.warning( "Effective wavelength for band %s outside 400-800 nm range!", str(bandname)) LOG.info( "Set the rayleigh/aerosol reflectance contribution to zero!") if HAVE_DASK: chunks = sun_zenith.chunks if redband is None else redband.chunks res = zeros(shape, chunks=chunks) return res.compute() if compute else res else: return zeros(shape) idx = np.searchsorted(wvl_coord, wvl) wvl1 = wvl_coord[idx - 1] wvl2 = wvl_coord[idx] fac = (wvl2 - wvl) / (wvl2 - wvl1) raylwvl = fac * rayl[idx - 1, :, :, :] + (1 - fac) * rayl[idx, :, :, :] tic = time.time() smin = [sunz_sec_coord[0], azid_coord[0], satz_sec_coord[0]] smax = [sunz_sec_coord[-1], azid_coord[-1], satz_sec_coord[-1]] orders = [ len(sunz_sec_coord), len(azid_coord), len(satz_sec_coord)] f_3d_grid = atleast_2d(raylwvl.ravel()) if HAVE_DASK and isinstance(smin[0], Array): # compute all of these at the same time before passing to the interpolator # otherwise they are computed separately smin, smax, orders, f_3d_grid = da.compute(smin, smax, orders, f_3d_grid) minterp = MultilinearInterpolator(smin, smax, orders) minterp.set_values(f_3d_grid) if HAVE_DASK: ipn = map_blocks(self._do_interp, minterp, sunzsec, azidiff, satzsec, dtype=raylwvl.dtype, chunks=azidiff.chunks) else: ipn = self._do_interp(minterp, sunzsec, azidiff, satzsec) LOG.debug("Time - Interpolation: {0:f}".format(time.time() - tic)) ipn *= 100 res = ipn if redband is not None: res = where(redband < 20., res, (1 - (redband - 20) / 80) * res) res = clip(res, 0, 100) if compute: res = res.compute() return res
def estimate_image_shift(ref, image, roi=None, sobel=True, medfilter=True, hanning=True, plot=False, dtype='float', normalize_corr=False, sub_pixel_factor=1, return_maxval=True): """Estimate the shift in a image using phase correlation This method can only estimate the shift by comparing bidimensional features that should not change the position in the given axis. To decrease the memory usage, the time of computation and the accuracy of the results it is convenient to select a region of interest by setting the roi keyword. Parameters ---------- ref : 2D numpy.ndarray Reference image image : 2D numpy.ndarray Image to register roi : tuple of ints (top, bottom, left, right) Define the region of interest sobel : bool apply a sobel filter for edge enhancement medfilter : bool apply a median filter for noise reduction hanning : bool Apply a 2d hanning filter plot : bool | matplotlib.Figure If True, plots the images after applying the filters and the phase correlation. If a figure instance, the images will be plotted to the given figure. reference : 'current' | 'cascade' If 'current' (default) the image at the current coordinates is taken as reference. If 'cascade' each image is aligned with the previous one. dtype : str or dtype Typecode or data-type in which the calculations must be performed. normalize_corr : bool If True use phase correlation instead of standard correlation sub_pixel_factor : float Estimate shifts with a sub-pixel accuracy of 1/sub_pixel_factor parts of a pixel. Default is 1, i.e. no sub-pixel accuracy. Returns ------- shifts: np.array containing the estimate shifts max_value : float The maximum value of the correlation Notes ----- The statistical analysis approach to the translation estimation when using `reference`='stat' roughly follows [1]_ . If you use it please cite their article. References ---------- .. [1] Bernhard Schaffer, Werner Grogger and Gerald Kothleitner. “Automated Spatial Drift Correction for EFTEM Image Series.” Ultramicroscopy 102, no. 1 (December 2004): 27–36. 
""" ref, image = da.compute(ref, image) # Make a copy of the images to avoid modifying them ref = ref.copy().astype(dtype) image = image.copy().astype(dtype) if roi is not None: top, bottom, left, right = roi else: top, bottom, left, right = [None, ] * 4 # Select region of interest ref = ref[top:bottom, left:right] image = image[top:bottom, left:right] # Apply filters for im in (ref, image): if hanning is True: im *= hanning2d(*im.shape) if medfilter is True: im[:] = sp.signal.medfilt(im) if sobel is True: im[:] = sobel_filter(im) phase_correlation, image_product = fft_correlation( ref, image, normalize=normalize_corr) # Estimate the shift by getting the coordinates of the maximum argmax = np.unravel_index(np.argmax(phase_correlation), phase_correlation.shape) threshold = (phase_correlation.shape[0] / 2 - 1, phase_correlation.shape[1] / 2 - 1) shift0 = argmax[0] if argmax[0] < threshold[0] else \ argmax[0] - phase_correlation.shape[0] shift1 = argmax[1] if argmax[1] < threshold[1] else \ argmax[1] - phase_correlation.shape[1] max_val = phase_correlation.real.max() shifts = np.array((shift0, shift1)) # The following code is more or less copied from # skimage.feature.register_feature, to gain access to the maximum value: if sub_pixel_factor != 1: # Initial shift estimate in upsampled grid shifts = np.round(shifts * sub_pixel_factor) / sub_pixel_factor upsampled_region_size = np.ceil(sub_pixel_factor * 1.5) # Center of output array at dftshift + 1 dftshift = np.fix(upsampled_region_size / 2.0) sub_pixel_factor = np.array(sub_pixel_factor, dtype=np.float64) normalization = (image_product.size * sub_pixel_factor ** 2) # Matrix multiply DFT around the current shift estimate sample_region_offset = dftshift - shifts * sub_pixel_factor correlation = _upsampled_dft(image_product.conj(), upsampled_region_size, sub_pixel_factor, sample_region_offset).conj() correlation /= normalization # Locate maximum and map back to original pixel grid maxima = np.array(np.unravel_index( np.argmax(np.abs(correlation)), correlation.shape), dtype=np.float64) maxima -= dftshift shifts = shifts + maxima / sub_pixel_factor max_val = correlation.real.max() # Plot on demand if plot is True or isinstance(plot, plt.Figure): if isinstance(plot, plt.Figure): fig = plot axarr = plot.axes if len(axarr) < 3: for i in range(3): fig.add_subplot(1, 3, i + 1) axarr = fig.axes else: fig, axarr = plt.subplots(1, 3) full_plot = len(axarr[0].images) == 0 if full_plot: axarr[0].set_title('Reference') axarr[1].set_title('Image') axarr[2].set_title('Phase correlation') axarr[0].imshow(ref) axarr[1].imshow(image) d = (np.array(phase_correlation.shape) - 1) // 2 extent = [-d[1], d[1], -d[0], d[0]] axarr[2].imshow(np.fft.fftshift(phase_correlation), extent=extent) plt.show() else: axarr[0].images[0].set_data(ref) axarr[1].images[0].set_data(image) axarr[2].images[0].set_data(np.fft.fftshift(phase_correlation)) # TODO: Renormalize images fig.canvas.draw_idle() # Liberate the memory. It is specially necessary if it is a # memory map del ref del image if return_maxval: return -shifts, max_val else: return -shifts
def decomposition(self, normalize_poissonian_noise=False, algorithm='svd', output_dimension=None, signal_mask=None, navigation_mask=None, get=threaded.get, num_chunks=None, reproject=True, bounds=False, **kwargs): """Perform Incremental (Batch) decomposition on the data, keeping n significant components. Parameters ---------- normalize_poissonian_noise : bool If True, scale the SI to normalize Poissonian noise algorithm : str One of ('svd', 'PCA', 'ORPCA', 'ONMF'). By default 'svd', lazy SVD decomposition from dask. output_dimension : int the number of significant components to keep. If None, keep all (only valid for SVD) get : dask scheduler the dask scheduler to use for computations; default `dask.threaded.get` num_chunks : int the number of dask chunks to pass to the decomposition model. More chunks require more memory, but should run faster. Will be increased to contain atleast output_dimension signals. navigation_mask : {BaseSignal, numpy array, dask array} The navigation locations marked as True are not used in the decompostion. signal_mask : {BaseSignal, numpy array, dask array} The signal locations marked as True are not used in the decomposition. reproject : bool Reproject data on the learnt components (factors) after learning. **kwargs passed to the partial_fit/fit functions. Notes ----- Various algorithm parameters and their default values: ONMF: lambda1=1, kappa=1, robust=False, store_r=False batch_size=None ORPCA: fast=True, lambda1=None, lambda2=None, method=None, learning_rate=None, init=None, training_samples=None, momentum=None PCA: batch_size=None, copy=True, white=False """ if bounds: msg = ( "The `bounds` keyword is deprecated and will be removed " "in v2.0. Since version > 1.3 this has no effect.") warnings.warn(msg, VisibleDeprecationWarning) explained_variance = None explained_variance_ratio = None _al_data = self._data_aligned_with_axes nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension] sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:] num_chunks = 1 if num_chunks is None else num_chunks blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)]) nblocks = multiply([len(c) for c in nav_chunks]) if algorithm != "svd" and output_dimension is None: raise ValueError("With the %s the output_dimension " "must be specified" % algorithm) if output_dimension and blocksize / output_dimension < num_chunks: num_chunks = np.ceil(blocksize / output_dimension) blocksize *= num_chunks # LEARN if algorithm == 'PCA': from sklearn.decomposition import IncrementalPCA obj = IncrementalPCA(n_components=output_dimension) method = partial(obj.partial_fit, **kwargs) reproject = True elif algorithm == 'ORPCA': from hyperspy.learn.rpca import ORPCA kwg = {'fast': True} kwg.update(kwargs) obj = ORPCA(output_dimension, **kwg) method = partial(obj.fit, iterating=True) elif algorithm == 'ONMF': from hyperspy.learn.onmf import ONMF batch_size = kwargs.pop('batch_size', None) obj = ONMF(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) elif algorithm != "svd": raise ValueError('algorithm not known') original_data = self.data try: if normalize_poissonian_noise: data = self._data_aligned_with_axes ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension nm = da.logical_not( da.zeros( self.axes_manager.navigation_shape[::-1], chunks=nav_chunks) if navigation_mask is None else to_array( navigation_mask, chunks=nav_chunks)) sm = da.logical_not( da.zeros( self.axes_manager.signal_shape[::-1], 
chunks=sig_chunks) if signal_mask is None else to_array( signal_mask, chunks=sig_chunks)) ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension bH, aG = da.compute( data.sum(axis=tuple(range(ndim))), data.sum(axis=tuple(range(ndim, ndim + sdim)))) bH = da.where(sm, bH, 1) aG = da.where(nm, aG, 1) raG = da.sqrt(aG) rbH = da.sqrt(bH) coeff = raG[(..., ) + (None, ) * rbH.ndim] *\ rbH[(None, ) * raG.ndim + (...,)] coeff.map_blocks(np.nan_to_num) coeff = da.where(coeff == 0, 1, coeff) data = data / coeff self.data = data # LEARN if algorithm == "svd": reproject = False from dask.array.linalg import svd try: self._unfolded4decomposition = self.unfold() # TODO: implement masking if navigation_mask or signal_mask: raise NotImplemented( "Masking is not yet implemented for lazy SVD." ) U, S, V = svd(self.data) factors = V.T explained_variance = S ** 2 / self.data.shape[0] loadings = U * S finally: if self._unfolded4decomposition is True: self.fold() self._unfolded4decomposition is False else: this_data = [] try: for chunk in progressbar( self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask), total=nblocks, leave=True, desc='Learn'): this_data.append(chunk) if len(this_data) == num_chunks: thedata = np.concatenate(this_data, axis=0) method(thedata) this_data = [] if len(this_data): thedata = np.concatenate(this_data, axis=0) method(thedata) except KeyboardInterrupt: pass # GET ALREADY CALCULATED RESULTS if algorithm == 'PCA': explained_variance = obj.explained_variance_ explained_variance_ratio = obj.explained_variance_ratio_ factors = obj.components_.T elif algorithm == 'ORPCA': _, _, U, S, V = obj.finish() factors = U * S loadings = V explained_variance = S**2 / len(factors) elif algorithm == 'ONMF': factors, loadings = obj.finish() loadings = loadings.T # REPROJECT if reproject: if algorithm == 'PCA': method = obj.transform def post(a): return np.concatenate(a, axis=0) elif algorithm == 'ORPCA': method = obj.project obj.R = [] def post(a): return obj.finish()[4] elif algorithm == 'ONMF': method = obj.project def post(a): return np.concatenate(a, axis=1).T _map = map(lambda thing: method(thing), self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask)) H = [] try: for thing in progressbar( _map, total=nblocks, desc='Project'): H.append(thing) except KeyboardInterrupt: pass loadings = post(H) if explained_variance is not None and \ explained_variance_ratio is None: explained_variance_ratio = \ explained_variance / explained_variance.sum() # RESHUFFLE "blocked" LOADINGS ndim = self.axes_manager.navigation_dimension if algorithm != "svd": # Only needed for online algorithms try: loadings = _reshuffle_mixed_blocks( loadings, ndim, (output_dimension,), nav_chunks).reshape((-1, output_dimension)) except ValueError: # In case the projection step was not finished, it's left # as scrambled pass finally: self.data = original_data target = self.learning_results target.decomposition_algorithm = algorithm target.output_dimension = output_dimension if algorithm != "svd": target._object = obj target.factors = factors target.loadings = loadings target.explained_variance = explained_variance target.explained_variance_ratio = explained_variance_ratio # Rescale the results if the noise was normalized if normalize_poissonian_noise is True: target.factors = target.factors * rbH.ravel()[:, np.newaxis] target.loadings = target.loadings * raG.ravel()[:, np.newaxis]