def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs): """ Read dask Dataframe from bcolz.ctable Parameters ---------- x : bcolz.ctable Input data chunksize : int (optional) The size of blocks to pull out from ctable. Ideally as large as can comfortably fit in memory categorize : bool (defaults to True) Automatically categorize all string dtypes index : string (optional) Column to make the index See Also -------- from_array: more generic function not optimized for bcolz """ import dask.array as da import bcolz if isinstance(x, (str, unicode)): x = bcolz.ctable(rootdir=x) bc_chunklen = max(x[name].chunklen for name in x.names) if chunksize is None and bc_chunklen > 10000: chunksize = bc_chunklen categories = dict() if categorize: for name in x.names: if (np.issubdtype(x.dtype[name], np.string_) or np.issubdtype(x.dtype[name], np.unicode_) or np.issubdtype(x.dtype[name], np.object_)): a = da.from_array(x[name], chunks=(chunksize * len(x.names),)) categories[name] = da.unique(a) columns = tuple(x.dtype.names) divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:] if divisions[-1] != len(x) - 1: divisions = divisions + (len(x) - 1,) new_name = 'from_bcolz' + next(tokens) dsk = dict(((new_name, i), (dataframe_from_ctable, x, (slice(i * chunksize, (i + 1) * chunksize),), None, categories)) for i in range(0, int(ceil(len(x) / chunksize)))) result = DataFrame(dsk, new_name, columns, divisions) if index: assert index in x.names a = da.from_array(x[index], chunks=(chunksize * len(x.names),)) q = np.linspace(0, 100, len(x) // chunksize + 2) divisions = da.percentile(a, q).compute() return set_partition(result, index, divisions, **kwargs) else: return result
def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)

    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}

    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    for X, y, groups in product([np_X, da_X, del_X],
                                [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = dcv.GridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "The groups parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)
def test_apply_dask_multiple_inputs():
    import dask.array as da

    def covariance(x, y):
        return ((x - x.mean(axis=-1, keepdims=True)) *
                (y - y.mean(axis=-1, keepdims=True))).mean(axis=-1)

    rs = np.random.RandomState(42)
    array1 = da.from_array(rs.randn(4, 4), chunks=(2, 4))
    array2 = da.from_array(rs.randn(4, 4), chunks=(2, 4))
    data_array_1 = xr.DataArray(array1, dims=('x', 'z'))
    data_array_2 = xr.DataArray(array2, dims=('y', 'z'))

    expected = apply_ufunc(
        covariance, data_array_1.compute(), data_array_2.compute(),
        input_core_dims=[['z'], ['z']])

    allowed = apply_ufunc(
        covariance, data_array_1, data_array_2,
        input_core_dims=[['z'], ['z']], dask='allowed')
    assert isinstance(allowed.data, da.Array)
    xr.testing.assert_allclose(expected, allowed.compute())

    parallelized = apply_ufunc(
        covariance, data_array_1, data_array_2,
        input_core_dims=[['z'], ['z']], dask='parallelized',
        output_dtypes=[float])
    assert isinstance(parallelized.data, da.Array)
    xr.testing.assert_allclose(expected, parallelized.compute())
def test_solve(shape, chunk):
    np.random.seed(1)

    A = np.random.randint(1, 10, (shape, shape))
    dA = da.from_array(A, (chunk, chunk))

    # vector
    b = np.random.randint(1, 10, shape)
    db = da.from_array(b, chunk)

    res = da.linalg.solve(dA, db)
    assert_eq(res, scipy.linalg.solve(A, b))
    assert_eq(dA.dot(res), b.astype(float))

    # tall-and-skinny matrix
    b = np.random.randint(1, 10, (shape, 5))
    db = da.from_array(b, (chunk, 5))

    res = da.linalg.solve(dA, db)
    assert_eq(res, scipy.linalg.solve(A, b))
    assert_eq(dA.dot(res), b.astype(float))

    # matrix
    b = np.random.randint(1, 10, (shape, shape))
    db = da.from_array(b, (chunk, chunk))

    res = da.linalg.solve(dA, db)
    assert_eq(res, scipy.linalg.solve(A, b))
    assert_eq(dA.dot(res), b.astype(float))
def test_lu_1():
    A1 = np.array([[7, 3, -1, 2],
                   [3, 8, 1, -4],
                   [-1, 1, 4, -1],
                   [2, -4, -1, 6]])

    A2 = np.array([[7, 0, 0, 0, 0, 0],
                   [0, 8, 0, 0, 0, 0],
                   [0, 0, 4, 0, 0, 0],
                   [0, 0, 0, 6, 0, 0],
                   [0, 0, 0, 0, 3, 0],
                   [0, 0, 0, 0, 0, 5]])

    # without shuffle
    for A, chunk in zip([A1, A2], [2, 2]):
        dA = da.from_array(A, chunks=(chunk, chunk))
        p, l, u = scipy.linalg.lu(A)
        dp, dl, du = da.linalg.lu(dA)
        assert_eq(p, dp)
        assert_eq(l, dl)
        assert_eq(u, du)
        _check_lu_result(dp, dl, du, A)

    A3 = np.array([[7, 3, 2, 1, 4, 1],
                   [7, 11, 5, 2, 5, 2],
                   [21, 25, 16, 10, 16, 5],
                   [21, 41, 18, 13, 16, 11],
                   [14, 46, 23, 24, 21, 22],
                   [0, 56, 29, 17, 14, 8]])

    # with shuffle
    for A, chunk in zip([A3], [2]):
        dA = da.from_array(A, chunks=(chunk, chunk))
        p, l, u = scipy.linalg.lu(A)
        dp, dl, du = da.linalg.lu(dA)
        _check_lu_result(dp, dl, du, A)
def test_tsqr_zero_height_chunks():
    m_q = 10
    n_q = 5
    m_r = 5
    n_r = 5

    # certainty
    mat = np.random.rand(10, 5)
    x = da.from_array(mat, chunks=((4, 0, 1, 0, 5), (5,)))
    q, r = da.linalg.qr(x)
    assert_eq((m_q, n_q), q.shape)  # shape check
    assert_eq((m_r, n_r), r.shape)  # shape check
    assert_eq(mat, da.dot(q, r))  # accuracy check
    assert_eq(np.eye(n_q, n_q), da.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, da.triu(r.rechunk(r.shape[0])))  # r must be upper triangular

    # uncertainty
    mat2 = np.vstack([mat, -np.ones((10, 5))])
    v2 = mat2[:, 0]
    x2 = da.from_array(mat2, chunks=5)
    c = da.from_array(v2, chunks=5)
    x = x2[c >= 0, :]  # remove the ones added above to yield mat
    q, r = da.linalg.qr(x)
    q = q.compute()  # because uncertainty
    r = r.compute()
    assert_eq((m_q, n_q), q.shape)  # shape check
    assert_eq((m_r, n_r), r.shape)  # shape check
    assert_eq(mat, np.dot(q, r))  # accuracy check
    assert_eq(np.eye(n_q, n_q), np.dot(q.T, q))  # q must be orthonormal
    assert_eq(r, np.triu(r))  # r must be upper triangular
def get_reflectance_lut(filename):
    """Read the LUT with reflectances as a function of wavelength, satellite
    zenith secant, azimuth difference angle, and sun zenith secant.
    """
    h5f = h5py.File(filename, 'r')

    tab = h5f['reflectance']
    wvl = h5f['wavelengths']
    azidiff = h5f['azimuth_difference']
    satellite_zenith_secant = h5f['satellite_zenith_secant']
    sun_zenith_secant = h5f['sun_zenith_secant']

    if HAVE_DASK:
        tab = from_array(tab, chunks=(10, 10, 10, 10))
        wvl = wvl[:]  # no benefit to dask-ifying this
        azidiff = from_array(azidiff, chunks=(1000,))
        satellite_zenith_secant = from_array(satellite_zenith_secant,
                                             chunks=(1000,))
        sun_zenith_secant = from_array(sun_zenith_secant, chunks=(1000,))
    else:
        # load all of the data we are going to use in to memory
        tab = tab[:]
        wvl = wvl[:]
        azidiff = azidiff[:]
        satellite_zenith_secant = satellite_zenith_secant[:]
        sun_zenith_secant = sun_zenith_secant[:]
        h5f.close()

    return tab, wvl, azidiff, satellite_zenith_secant, sun_zenith_secant
def test_solve_sym_pos(shape, chunk):
    np.random.seed(1)

    A = _get_symmat(shape)
    dA = da.from_array(A, (chunk, chunk))

    # vector
    b = np.random.randint(1, 10, shape)
    db = da.from_array(b, chunk)

    res = da.linalg.solve(dA, db, sym_pos=True)
    assert_eq(res, scipy.linalg.solve(A, b, sym_pos=True))
    assert_eq(dA.dot(res), b.astype(float))

    # tall-and-skinny matrix
    b = np.random.randint(1, 10, (shape, 5))
    db = da.from_array(b, (chunk, 5))

    res = da.linalg.solve(dA, db, sym_pos=True)
    assert_eq(res, scipy.linalg.solve(A, b, sym_pos=True))
    assert_eq(dA.dot(res), b.astype(float))

    # matrix
    b = np.random.randint(1, 10, (shape, shape))
    db = da.from_array(b, (chunk, chunk))

    res = da.linalg.solve(dA, db, sym_pos=True)
    assert_eq(res, scipy.linalg.solve(A, b, sym_pos=True))
    assert_eq(dA.dot(res), b.astype(float))
def test_insert():
    x = np.random.randint(10, size=(10, 10))
    a = da.from_array(x, chunks=(5, 5))

    y = np.random.randint(10, size=(5, 10))
    b = da.from_array(y, chunks=(4, 4))

    assert_eq(np.insert(x, 0, -1, axis=0), da.insert(a, 0, -1, axis=0))
    assert_eq(np.insert(x, 3, -1, axis=-1), da.insert(a, 3, -1, axis=-1))
    assert_eq(np.insert(x, 5, -1, axis=1), da.insert(a, 5, -1, axis=1))
    assert_eq(np.insert(x, -1, -1, axis=-2), da.insert(a, -1, -1, axis=-2))
    assert_eq(np.insert(x, [2, 3, 3], -1, axis=1),
              da.insert(a, [2, 3, 3], -1, axis=1))
    assert_eq(np.insert(x, [2, 3, 8, 8, -2, -2], -1, axis=0),
              da.insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))
    assert_eq(np.insert(x, slice(1, 4), -1, axis=1),
              da.insert(a, slice(1, 4), -1, axis=1))
    assert_eq(np.insert(x, [2] * 3 + [5] * 2, y, axis=0),
              da.insert(a, [2] * 3 + [5] * 2, b, axis=0))
    assert_eq(np.insert(x, 0, y[0], axis=1), da.insert(a, 0, b[0], axis=1))

    assert same_keys(da.insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0),
                     da.insert(a, [2, 3, 8, 8, -2, -2], -1, axis=0))

    with pytest.raises(NotImplementedError):
        da.insert(a, [4, 2], -1, axis=0)
    with pytest.raises(IndexError):
        da.insert(a, [3], -1, axis=2)
    with pytest.raises(IndexError):
        da.insert(a, [3], -1, axis=-3)
def test_apply_gufunc_elemwise_01():
    def add(x, y):
        return x + y
    a = da.from_array(np.array([1, 2, 3]), chunks=2, name='a')
    b = da.from_array(np.array([1, 2, 3]), chunks=2, name='b')
    z = apply_gufunc(add, "(),()->()", a, b, output_dtypes=a.dtype)
    assert_eq(z, np.array([2, 4, 6]))
def test_isclose():
    x = np.array([0, np.nan, 1, 1.5])
    y = np.array([1e-9, np.nan, 1, 2])
    a = da.from_array(x, chunks=(2,))
    b = da.from_array(y, chunks=(2,))
    assert_eq(da.isclose(a, b, equal_nan=True),
              np.isclose(x, y, equal_nan=True))
def test_dot_method():
    x = np.arange(400).reshape((20, 20))
    a = da.from_array(x, chunks=(5, 5))
    y = np.arange(200).reshape((20, 10))
    b = da.from_array(y, chunks=(5, 5))
    assert_eq(a.dot(b), x.dot(y))
def test_tree_reduce_depth():
    # 2D
    x = da.from_array(np.arange(242).reshape((11, 22)), chunks=(3, 4))
    thresh = {0: 2, 1: 3}
    assert_max_deps(x.sum(split_every=thresh), 2 * 3)
    assert_max_deps(x.sum(axis=0, split_every=thresh), 2)
    assert_max_deps(x.sum(axis=1, split_every=thresh), 3)
    assert_max_deps(x.sum(split_every=20), 20, False)
    assert_max_deps(x.sum(axis=0, split_every=20), 4)
    assert_max_deps(x.sum(axis=1, split_every=20), 6)

    # 3D
    x = da.from_array(np.arange(11 * 22 * 29).reshape((11, 22, 29)),
                      chunks=(3, 4, 5))
    thresh = {0: 2, 1: 3, 2: 4}
    assert_max_deps(x.sum(split_every=thresh), 2 * 3 * 4)
    assert_max_deps(x.sum(axis=0, split_every=thresh), 2)
    assert_max_deps(x.sum(axis=1, split_every=thresh), 3)
    assert_max_deps(x.sum(axis=2, split_every=thresh), 4)
    assert_max_deps(x.sum(axis=(0, 1), split_every=thresh), 2 * 3)
    assert_max_deps(x.sum(axis=(0, 2), split_every=thresh), 2 * 4)
    assert_max_deps(x.sum(axis=(1, 2), split_every=thresh), 3 * 4)
    assert_max_deps(x.sum(split_every=20), 20, False)
    assert_max_deps(x.sum(axis=0, split_every=20), 4)
    assert_max_deps(x.sum(axis=1, split_every=20), 6)
    assert_max_deps(x.sum(axis=2, split_every=20), 6)
    assert_max_deps(x.sum(axis=(0, 1), split_every=20), 20, False)
    assert_max_deps(x.sum(axis=(0, 2), split_every=20), 20, False)
    assert_max_deps(x.sum(axis=(1, 2), split_every=20), 20, False)
    assert_max_deps(x.sum(axis=(0, 1), split_every=40), 4 * 6)
    assert_max_deps(x.sum(axis=(0, 2), split_every=40), 4 * 6)
    assert_max_deps(x.sum(axis=(1, 2), split_every=40), 6 * 6)
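# A quick way to see what the split_every argument above controls (a hedged
# sketch using only public dask.array APIs, not part of the test suite): a
# smaller fan-in per aggregation step means more layers of partial sums, and
# therefore more tasks in the task graph.
import numpy as np
import dask.array as da

x = da.from_array(np.arange(242).reshape((11, 22)), chunks=(3, 4))
wide = x.sum(split_every=20)              # wide fan-in, shallow reduction tree
narrow = x.sum(split_every={0: 2, 1: 3})  # narrow fan-in, deeper reduction tree
print(len(dict(wide.__dask_graph__())), len(dict(narrow.__dask_graph__())))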
def get_dataset(self, key, info):
    if self.reader is None:
        with open(self.filename) as fdes:
            data = fdes.read(3)
        if data in ["CMS", "NSS", "UKM", "DSS"]:
            reader = GACKLMReader
            self.chn_dict = AVHRR3_CHANNEL_NAMES
        else:
            reader = GACPODReader
            self.chn_dict = AVHRR_CHANNEL_NAMES
        self.reader = reader()
        self.reader.read(self.filename)

    if key.name in ['latitude', 'longitude']:
        if self.reader.lons is None or self.reader.lats is None:
            # self.reader.get_lonlat(clock_drift_adjust=False)
            self.reader.get_lonlat()
        if key.name == 'latitude':
            return xr.DataArray(da.from_array(self.reader.lats, chunks=1000),
                                dims=['y', 'x'], attrs=info)
        else:
            return xr.DataArray(da.from_array(self.reader.lons, chunks=1000),
                                dims=['y', 'x'], attrs=info)

    if self.channels is None:
        self.channels = self.reader.get_calibrated_channels()

    data = self.channels[:, :, self.chn_dict[key.name]]
    return xr.DataArray(da.from_array(data, chunks=1000),
                        dims=['y', 'x'], attrs=info)
def test_complex(ufunc):
    dafunc = getattr(da, ufunc)
    npfunc = getattr(np, ufunc)

    real = np.random.randint(1, 100, size=(20, 20))
    imag = np.random.randint(1, 100, size=(20, 20)) * 1j
    comp = real + imag

    dareal = da.from_array(real, 3)
    daimag = da.from_array(imag, 3)
    dacomp = da.from_array(comp, 3)

    assert_eq(dacomp.real, comp.real)
    assert_eq(dacomp.imag, comp.imag)
    assert_eq(dacomp.conj(), comp.conj())

    for darr, arr in [(dacomp, comp), (dareal, real), (daimag, imag)]:
        # applying Dask ufunc doesn't trigger computation
        assert isinstance(dafunc(darr), da.Array)
        assert_eq(dafunc(darr), npfunc(arr))

        assert_eq(npfunc(darr), npfunc(arr))

        # applying Dask ufunc to normal ndarray triggers computation
        assert isinstance(dafunc(arr), np.ndarray)
        assert_eq(dafunc(arr), npfunc(arr))
def coords_all_dtypes_and_lazynesses(self, coord_class):
    # Generate coords with all possible types of points and bounds, and all
    # of the given dtypes.
    points_types = ['real', 'lazy']
    bounds_types = ['no', 'real', 'lazy']
    # Test a few specific combinations of points+bounds dtypes, including
    # cases where they are different.
    dtype_pairs = [(np.float64, np.float64),
                   (np.int16, np.int16),
                   (np.int16, np.float32),
                   (np.float64, np.int32)]
    for pts_dtype, bds_dtype in dtype_pairs:
        for points_type_name in points_types:
            for bounds_type_name in bounds_types:
                pts = np.asarray(self.pts_real, dtype=pts_dtype)
                bds = np.asarray(self.bds_real, dtype=bds_dtype)
                if points_type_name == 'lazy':
                    pts = da.from_array(pts, pts.shape)
                if bounds_type_name == 'lazy':
                    bds = da.from_array(bds, bds.shape)
                elif bounds_type_name == 'no':
                    bds = None
                coord = coord_class(pts, bounds=bds)
                result = (coord, points_type_name, bounds_type_name)
                yield result
def test_index_with_int_dask_array_0d(chunks):
    # Slice by 0-dimensional array
    x = da.from_array([[10, 20, 30], [40, 50, 60]], chunks=chunks)
    idx0 = da.from_array(1, chunks=1)
    assert_eq(x[idx0, :], x[1, :])
    assert_eq(x[:, idx0], x[:, 1])
def test_apply_gufunc_elemwise_01b():
    def add(x, y):
        return x + y
    a = da.from_array(np.array([1, 2, 3]), chunks=2, name='a')
    b = da.from_array(np.array([1, 2, 3]), chunks=1, name='b')
    with pytest.raises(ValueError):
        apply_gufunc(add, "(),()->()", a, b, output_dtypes=a.dtype)
def test_lstsq(nrow, ncol, chunk):
    np.random.seed(1)
    A = np.random.randint(1, 20, (nrow, ncol))
    b = np.random.randint(1, 20, nrow)

    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)

    x, r, rank, s = np.linalg.lstsq(A, b)
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)

    assert_eq(dx, x)
    assert_eq(dr, r)
    assert drank.compute() == rank
    assert_eq(ds, s)

    # reduce rank causes multicollinearity, only compare rank
    A[:, 1] = A[:, 2]
    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)
    x, r, rank, s = np.linalg.lstsq(
        A, b, rcond=np.finfo(np.double).eps * max(nrow, ncol))
    assert rank == ncol - 1
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)
    assert drank.compute() == rank
def test_lstsq(nrow, ncol, chunk):
    import scipy.linalg
    np.random.seed(1)
    A = np.random.randint(1, 20, (nrow, ncol))
    b = np.random.randint(1, 20, nrow)

    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)

    x, r, rank, s = np.linalg.lstsq(A, b)
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)

    assert_eq(dx, x)
    assert_eq(dr, r)
    assert drank.compute() == rank
    assert_eq(ds, s)

    # reduce rank causes multicollinearity, only compare rank
    A[:, 1] = A[:, 2]
    dA = da.from_array(A, (chunk, ncol))
    db = da.from_array(b, chunk)
    x, r, rank, s = np.linalg.lstsq(A, b)
    assert rank == ncol - 1
    dx, dr, drank, ds = da.linalg.lstsq(dA, db)
    assert drank.compute() == rank
def test_index_with_int_dask_array_indexerror(chunks):
    a = da.arange(4, chunks=chunks)
    idx = da.from_array([4], chunks=1)
    with pytest.raises(IndexError):
        a[idx].compute()
    idx = da.from_array([-5], chunks=1)
    with pytest.raises(IndexError):
        a[idx].compute()
def test_elemwise_consistent_names():
    a = da.from_array(np.arange(5, dtype='f4'), chunks=(2,))
    b = da.from_array(np.arange(5, dtype='f4'), chunks=(2,))
    assert same_keys(a + b, a + b)
    assert same_keys(a + 2, a + 2)
    assert same_keys(da.exp(a), da.exp(a))
    assert same_keys(da.exp(a, dtype='f8'), da.exp(a, dtype='f8'))
    assert same_keys(da.maximum(a, b), da.maximum(a, b))
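# A small illustration of the determinism the test above relies on (a sketch,
# not part of the test suite): equivalent dask expressions are tokenized to the
# same graph key names, so repeated expressions can share work in one graph.
import numpy as np
import dask.array as da

a = da.from_array(np.arange(5, dtype='f4'), chunks=(2,))
b = da.from_array(np.arange(5, dtype='f4'), chunks=(2,))
print((a + b).name == (a + b).name)  # True: same expression -> same name
print((a + b).name == (a + 2).name)  # False: different expressions -> different names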
def test_tril_triu_errors():
    A = np.random.randint(0, 11, (10, 10, 10))
    dA = da.from_array(A, chunks=(5, 5, 5))
    pytest.raises(ValueError, lambda: da.triu(dA))

    A = np.random.randint(0, 11, (30, 35))
    dA = da.from_array(A, chunks=(5, 5))
    pytest.raises(NotImplementedError, lambda: da.triu(dA))
def test_index_with_dask_array():
    x = np.arange(36).reshape((6, 6))
    d = da.from_array(x, chunks=(3, 3))
    ind = np.asarray([True, True, False, True, False, False], dtype=bool)
    ind = da.from_array(ind, chunks=2)
    for index in [ind, (slice(1, 9, 2), ind), (ind, slice(2, 8, 1))]:
        x_index = dask.compute(index)[0]
        assert_eq(x[x_index], d[index])
def test_bincount_with_weights():
    x = np.array([2, 1, 5, 2, 1])
    d = da.from_array(x, chunks=2)
    weights = np.array([1, 2, 1, 0.5, 1])

    dweights = da.from_array(weights, chunks=2)
    assert eq(da.bincount(d, weights=dweights, minlength=6),
              np.bincount(x, weights=dweights, minlength=6))
def test_apply_gufunc_elemwise_02():
    def addmul(x, y):
        assert x.shape in ((2,), (1,))
        return x + y, x * y
    a = da.from_array(np.array([1, 2, 3]), chunks=2, name='a')
    b = da.from_array(np.array([1, 2, 3]), chunks=2, name='b')
    z1, z2 = apply_gufunc(addmul, "(),()->(),()", a, b,
                          output_dtypes=2 * (a.dtype,))
    assert_eq(z1, np.array([2, 4, 6]))
    assert_eq(z2, np.array([1, 4, 9]))
def test_bincount_with_weights():
    x = np.array([2, 1, 5, 2, 1])
    d = da.from_array(x, chunks=2)
    weights = np.array([1, 2, 1, 0.5, 1])

    dweights = da.from_array(weights, chunks=2)
    e = da.bincount(d, weights=dweights, minlength=6)
    assert_eq(e, np.bincount(x, weights=dweights.compute(), minlength=6))
    assert same_keys(da.bincount(d, weights=dweights, minlength=6), e)
def test_where_dispatching(self):
    a = np.arange(10)
    b = a > 3
    x = da.from_array(a, 5)
    y = da.from_array(b, 5)
    expected = DataArray(a).where(b)
    self.assertLazyAndIdentical(expected, DataArray(a).where(y))
    self.assertLazyAndIdentical(expected, DataArray(x).where(b))
    self.assertLazyAndIdentical(expected, DataArray(x).where(y))
def get_dataset(self, key, info, out=None, xslice=None, yslice=None): """Get the dataset designated by *key*.""" if key.name in ['solar_zenith_angle', 'solar_azimuth_angle', 'satellite_zenith_angle', 'satellite_azimuth_angle']: if key.name == 'solar_zenith_angle': var = self.sd.select('SolarZenith') if key.name == 'solar_azimuth_angle': var = self.sd.select('SolarAzimuth') if key.name == 'satellite_zenith_angle': var = self.sd.select('SensorZenith') if key.name == 'satellite_azimuth_angle': var = self.sd.select('SensorAzimuth') data = xr.DataArray(from_sds(var, chunks=CHUNK_SIZE), dims=['y', 'x']).astype(np.float32) data = data.where(data != var._FillValue) data = data * np.float32(var.scale_factor) data.attrs = info return data if key.name not in ['longitude', 'latitude']: return if (self.cache[key.resolution]['lons'] is None or self.cache[key.resolution]['lats'] is None): lons_id = DatasetID('longitude', resolution=key.resolution) lats_id = DatasetID('latitude', resolution=key.resolution) lons, lats = self.load( [lons_id, lats_id], interpolate=False, raw=True) if key.resolution != self.resolution: from geotiepoints.geointerpolator import GeoInterpolator lons, lats = self._interpolate([lons, lats], self.resolution, lons_id.resolution, GeoInterpolator) lons = np.ma.masked_invalid(np.ascontiguousarray(lons)) lats = np.ma.masked_invalid(np.ascontiguousarray(lats)) self.cache[key.resolution]['lons'] = lons self.cache[key.resolution]['lats'] = lats if key.name == 'latitude': data = self.cache[key.resolution]['lats'].filled(np.nan) data = xr.DataArray(da.from_array(data, chunks=(CHUNK_SIZE, CHUNK_SIZE)), dims=['y', 'x']) else: data = self.cache[key.resolution]['lons'].filled(np.nan) data = xr.DataArray(da.from_array(data, chunks=(CHUNK_SIZE, CHUNK_SIZE)), dims=['y', 'x']) data.attrs = info return data
def test_choice(): np_dtype = np.random.choice(1, size=()).dtype size = (10, 3) chunks = 4 x = da.random.choice(3, size=size, chunks=chunks) assert x.dtype == np_dtype assert x.shape == size res = x.compute() assert res.dtype == np_dtype assert res.shape == size np_a = np.array([1, 3, 5, 7, 9], dtype='f8') da_a = da.from_array(np_a, chunks=2) for a in [np_a, da_a]: x = da.random.choice(a, size=size, chunks=chunks) res = x.compute() assert x.dtype == np_a.dtype assert res.dtype == np_a.dtype assert set(np.unique(res)).issubset(np_a) np_p = np.array([0, 0.2, 0.2, 0.3, 0.3]) da_p = da.from_array(np_p, chunks=2) for a, p in [(da_a, np_p), (np_a, da_p)]: x = da.random.choice(a, size=size, chunks=chunks, p=p) res = x.compute() assert x.dtype == np_a.dtype assert res.dtype == np_a.dtype assert set(np.unique(res)).issubset(np_a[1:]) np_dtype = np.random.choice(1, size=(), p=np.array([1])).dtype x = da.random.choice(5, size=size, chunks=chunks, p=np_p) res = x.compute() assert x.dtype == np_dtype assert res.dtype == np_dtype errs = [(-1, None), # negative a (np_a[:, None], None), # a must be 1D (np_a, np_p[:, None]), # p must be 1D (np_a, np_p[:-2]), # a and p must match (3, np_p), # a and p must match (4, [0.2, 0.2, 0.3])] # p must sum to 1 for (a, p) in errs: with pytest.raises(ValueError): da.random.choice(a, size=size, chunks=chunks, p=p) with pytest.raises(NotImplementedError): da.random.choice(da_a, size=size, chunks=chunks, replace=False) # Want to make sure replace=False works for a single-partition output array x = da.random.choice(da_a, size=da_a.shape[0], chunks=-1, replace=False) res = x.compute() assert len(res) == len(np.unique(res))
def test_tile_array_reps(shape, chunks, reps):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=chunks)
    with pytest.raises(NotImplementedError):
        da.tile(d, reps)
and computations of a source dask array and display the results in napari.
When the computation takes one or more parameters, one can tie a UI to
them using magicgui.
"""
import numpy as np
import napari
import dask.array as da
from dask.array.lib.stride_tricks import sliding_window_view
from skimage import data

##############################################################################
# Part 1: using code to view a specific value.

blobs = data.binary_blobs(length=64, n_dim=3)
blobs_dask = da.from_array(blobs, chunks=(1, 64, 64))

# original shape [60, 1, 1, 5, 64, 64],
# use squeeze to remove singleton axes
blobs_dask_windows = np.squeeze(
    sliding_window_view(blobs_dask, window_shape=(5, 64, 64)),
    axis=(1, 2),
)
blobs_sum = np.sum(blobs_dask_windows, axis=1)
viewer = napari.view_image(blobs_sum)

if __name__ == '__main__':
    napari.run()

##############################################################################
# Part 2: using magicgui to vary the slice thickness.
def compute_capacity_factors(tech_points_dict: Dict[str, List[Tuple[float, float]]], spatial_res: float, timestamps: pd.DatetimeIndex, precision: int = 3, smooth_wind_power_curve: bool = True) -> pd.DataFrame: """ Compute capacity factors for a list of points associated to a list of technologies. Parameters ---------- tech_points_dict : Dict[str, List[Tuple[float, float]]] Dictionary associating to each tech a list of points. spatial_res: float Spatial resolution of coordinates timestamps: pd.DatetimeIndex Time stamps for which we want capacity factors precision: int (default: 3) Indicates at which decimal capacity factors should be rounded smooth_wind_power_curve : boolean (default True) If "True", the transfer function of wind assets replicates the one of a wind farm, rather than one of a wind turbine. Returns ------- cap_factor_df : pd.DataFrame DataFrame storing capacity factors for each technology and each point """ for tech, points in tech_points_dict.items(): assert len(points) != 0, f"Error: No points were defined for tech {tech}" assert len(timestamps) != 0, f"Error: No timestamps were defined." # Get the converters corresponding to the input technologies # Dictionary indicating for each technology which converter(s) to use. # For each technology in the dictionary: # - if it is pv-based, the name of the converter must be specified as a string # - if it is wind, a dictionary must be defined associated for the four wind regimes # defined below (I, II, III, IV), the name of the converter as a string converters_dict = get_config_dict(list(tech_points_dict.keys()), ["converter"]) vres_profiles_dir = f"{data_path}generation/vres/profiles/source/" transfer_function_dir = f"{vres_profiles_dir}transfer_functions/" data_converter_wind = pd.read_csv(f"{transfer_function_dir}data_wind_turbines.csv", sep=';', index_col=0) data_converter_pv = pd.read_csv(f"{transfer_function_dir}data_pv_modules.csv", sep=';', index_col=0) dataset = read_resource_database(spatial_res).sel(time=timestamps) # Create output dataframe with MultiIndex (tech, coords) tech_points_tuples = sorted([(tech, point[0], point[1]) for tech, points in tech_points_dict.items() for point in points]) cap_factor_df = pd.DataFrame(index=timestamps, columns=pd.MultiIndex.from_tuples(tech_points_tuples, names=['technologies', 'lon', 'lat']), dtype=float) for tech in tech_points_dict.keys(): resource = get_config_values(tech, ["plant"]) # Round points at the given resolution non_rounded_points = tech_points_dict[tech] rounded_points = [(round(point[0] / spatial_res) * spatial_res, round(point[1] / spatial_res) * spatial_res) for point in non_rounded_points] non_rounded_to_rounded_dict = dict(zip(non_rounded_points, rounded_points)) sub_dataset = dataset.sel(locations=sorted(list(set(rounded_points)))) if resource == 'Wind': wind_speed_reference_height = 100. 
roughness = sub_dataset.fsr # Compute wind speed for the all the coordinates wind = xu.sqrt(sub_dataset.u100 ** 2 + sub_dataset.v100 ** 2) wind_mean = wind.mean(dim='time') # Split according to the IEC 61400 WTG classes wind_classes = {'IV': [0., 6.5], 'III': [6.5, 8.], 'II': [8., 9.5], 'I': [9.5, 99.]} list_df_per_wind_class = [] for cls in wind_classes: filtered_wind_data = wind_mean.where((wind_mean.data >= wind_classes[cls][0]) & (wind_mean.data < wind_classes[cls][1]), 0) coords_classes = filtered_wind_data[da.nonzero(filtered_wind_data)].locations.values.tolist() if len(coords_classes) > 0: wind_filtered = wind.sel(locations=coords_classes) roughness_filtered = roughness.sel(locations=coords_classes) # Get the transfer function curve # literal_eval converts a string to an array (in this case) converter = converters_dict[tech]["converter"][cls] power_curve_array = literal_eval(data_converter_wind.loc['Power curve', converter]) wind_speed_references = np.asarray([i[0] for i in power_curve_array]) capacity_factor_references = np.asarray([i[1] for i in power_curve_array]) capacity_factor_references_pu = capacity_factor_references / max(capacity_factor_references) wind_log = windpowerlib.wind_speed.logarithmic_profile( wind_filtered.values, wind_speed_reference_height, float(data_converter_wind.loc['Hub height [m]', converter]), roughness_filtered.values) wind_data = da.from_array(wind_log, chunks='auto', asarray=True) # The transfer function of wind assets replicates the one of a # wind farm rather than one of a wind turbine. if smooth_wind_power_curve: turbulence_intensity = wind_filtered.std(dim='time') / wind_filtered.mean(dim='time') capacity_factor_farm = windpowerlib.power_curves.smooth_power_curve( pd.Series(wind_speed_references), pd.Series(capacity_factor_references_pu), standard_deviation_method='turbulence_intensity', turbulence_intensity=float(turbulence_intensity.min().values), wind_speed_range=10.0) power_output = da.map_blocks(np.interp, wind_data, capacity_factor_farm['wind_speed'].values, capacity_factor_farm['value'].values).compute() else: power_output = da.map_blocks(np.interp, wind_data, wind_speed_references, capacity_factor_references_pu).compute() # Convert rounded point back into non-rounded points power_output_df = pd.DataFrame(power_output, columns=coords_classes) coords_classes_rounded = [non_rounded_to_rounded_dict[point] for point in non_rounded_points] power_output_corrected = [power_output_df[point].values for point in coords_classes_rounded if point in power_output_df.columns] coords_classes_non_rounded = [point for point in non_rounded_to_rounded_dict if non_rounded_to_rounded_dict[point] in power_output_df.columns] tech_points_tuples = [(lon, lat) for lon, lat in coords_classes_non_rounded] df_per_wind_class = pd.DataFrame(np.array(power_output_corrected).T, index=timestamps, columns=tech_points_tuples) list_df_per_wind_class.append(df_per_wind_class) else: continue cap_factor_df_concat = pd.concat(list_df_per_wind_class, axis=1) cap_factor_df[tech] = cap_factor_df_concat.reindex(sorted(cap_factor_df_concat.columns), axis=1) elif resource == 'PV': converter = converters_dict[tech]["converter"] # Get irradiance in W from J irradiance = sub_dataset.ssrd / 3600. 
# Get temperature in C from K temperature = sub_dataset.t2m - 273.15 # Homer equation here: # https://www.homerenergy.com/products/pro/docs/latest/how_homer_calculates_the_pv_array_power_output.html # https://enphase.com/sites/default/files/Enphase_PVWatts_Derate_Guide_ModSolar_06-2014.pdf power_output = (float(data_converter_pv.loc['f', converter]) * (irradiance/float(data_converter_pv.loc['G_ref', converter])) * (1. + float(data_converter_pv.loc['k_P [%/C]', converter])/100. * (temperature - float(data_converter_pv.loc['t_ref', converter])))) power_output = np.array(power_output) # Convert rounded point back into non rounded points power_output_df = pd.DataFrame(power_output, columns=sub_dataset.locations.values.tolist()) coords_classes_rounded = [non_rounded_to_rounded_dict[point] for point in non_rounded_points] power_output_corrected = [power_output_df[point].values for point in coords_classes_rounded if point in power_output_df.columns] cap_factor_df[tech] = np.array(power_output_corrected).T else: raise ValueError(' Profiles for the specified resource is not available yet.') # Check that we do not have NANs assert cap_factor_df.isna().to_numpy().sum() == 0, "Some capacity factors are not available." # Decrease precision of capacity factors cap_factor_df = cap_factor_df.round(precision) return cap_factor_df
tmpa_hdf_file = os.path.join(cfun.tmpa_dir, 'data_tmpa_world_daily.hdf5') else: print('main_evd_maps ERROR:: must specify a valid domain!') # read dask array with all daily precipitation data f = h5py.File(tmpa_hdf_file, "r") print(list(f.keys())) tmpalat = f['lat'][:] tmpalon = f['lon'][:] nlat = np.size(tmpalat) nlon = np.size(tmpalon) dates_int = f['dates'][:] # hours_int = f['hours'][:] dset = f['prcp'] # print('dataset shape = {}'.format(dset.shape)) x = da.from_array(dset, chunks=(6, 6, 300)) # UTC time dates = [datetime.strptime(str(integd), '%Y%m%d') for integd in dates_int] xconus = xr.DataArray(x, coords={ 'lon': tmpalon, 'lat': tmpalat, 'time': dates }, dims=('lon', 'lat', 'time')) xconus = xconus.where(xconus >= -0.001) ### end reading prcp dataset ### # for each grid cell do the following: ntr = np.size(TR) Fi = 1 - 1 / TR
def hdfgroup2signaldict(group, lazy=False): global current_file_version global default_version if current_file_version < LooseVersion("1.2"): metadata = "mapped_parameters" original_metadata = "original_parameters" else: metadata = "metadata" original_metadata = "original_metadata" exp = { 'metadata': hdfgroup2dict(group[metadata], lazy=lazy), 'original_metadata': hdfgroup2dict(group[original_metadata], lazy=lazy), 'attributes': {} } data = group['data'] if lazy: data = da.from_array(data, chunks=data.chunks) exp['attributes']['_lazy'] = True else: data = np.asanyarray(data) exp['data'] = data axes = [] for i in range(len(exp['data'].shape)): try: axes.append(dict(group['axis-%i' % i].attrs)) axis = axes[-1] for key, item in axis.items(): if isinstance(item, np.bool_): axis[key] = bool(item) else: axis[key] = ensure_unicode(item) except KeyError: break if len(axes) != len(exp['data'].shape): # broke from the previous loop try: axes = [ i for k, i in sorted( iter( hdfgroup2dict(group['_list_' + str(len(exp['data'].shape)) + '_axes'], lazy=lazy).items())) ] except KeyError: raise IOError(not_valid_format) exp['axes'] = axes if 'learning_results' in group.keys(): exp['attributes']['learning_results'] = \ hdfgroup2dict( group['learning_results'], lazy=lazy) if 'peak_learning_results' in group.keys(): exp['attributes']['peak_learning_results'] = \ hdfgroup2dict( group['peak_learning_results'], lazy=lazy) # If the title was not defined on writing the Experiment is # then called __unnamed__. The next "if" simply sets the title # back to the empty string if "General" in exp["metadata"] and "title" in exp["metadata"]["General"]: if '__unnamed__' == exp['metadata']['General']['title']: exp['metadata']["General"]['title'] = '' if current_file_version < LooseVersion("1.1"): # Load the decomposition results written with the old name, # mva_results if 'mva_results' in group.keys(): exp['attributes']['learning_results'] = hdfgroup2dict( group['mva_results'], lazy=lazy) if 'peak_mva_results' in group.keys(): exp['attributes']['peak_learning_results'] = hdfgroup2dict( group['peak_mva_results'], lazy=lazy) # Replace the old signal and name keys with their current names if 'signal' in exp['metadata']: if "Signal" not in exp["metadata"]: exp["metadata"]["Signal"] = {} exp['metadata']["Signal"]['signal_type'] = \ exp['metadata']['signal'] del exp['metadata']['signal'] if 'name' in exp['metadata']: if "General" not in exp["metadata"]: exp["metadata"]["General"] = {} exp['metadata']['General']['title'] = \ exp['metadata']['name'] del exp['metadata']['name'] if current_file_version < LooseVersion("1.2"): if '_internal_parameters' in exp['metadata']: exp['metadata']['_HyperSpy'] = \ exp['metadata']['_internal_parameters'] del exp['metadata']['_internal_parameters'] if 'stacking_history' in exp['metadata']['_HyperSpy']: exp['metadata']['_HyperSpy']["Stacking_history"] = \ exp['metadata']['_HyperSpy']['stacking_history'] del exp['metadata']['_HyperSpy']["stacking_history"] if 'folding' in exp['metadata']['_HyperSpy']: exp['metadata']['_HyperSpy']["Folding"] = \ exp['metadata']['_HyperSpy']['folding'] del exp['metadata']['_HyperSpy']["folding"] if 'Variance_estimation' in exp['metadata']: if "Noise_properties" not in exp["metadata"]: exp["metadata"]["Noise_properties"] = {} exp['metadata']['Noise_properties']["Variance_linear_model"] = \ exp['metadata']['Variance_estimation'] del exp['metadata']['Variance_estimation'] if "TEM" in exp["metadata"]: if "Acquisition_instrument" not in exp["metadata"]: 
exp["metadata"]["Acquisition_instrument"] = {} exp["metadata"]["Acquisition_instrument"]["TEM"] = \ exp["metadata"]["TEM"] del exp["metadata"]["TEM"] tem = exp["metadata"]["Acquisition_instrument"]["TEM"] if "EELS" in tem: if "dwell_time" in tem: tem["EELS"]["dwell_time"] = tem["dwell_time"] del tem["dwell_time"] if "dwell_time_units" in tem: tem["EELS"]["dwell_time_units"] = tem["dwell_time_units"] del tem["dwell_time_units"] if "exposure" in tem: tem["EELS"]["exposure"] = tem["exposure"] del tem["exposure"] if "exposure_units" in tem: tem["EELS"]["exposure_units"] = tem["exposure_units"] del tem["exposure_units"] if "Detector" not in tem: tem["Detector"] = {} tem["Detector"] = tem["EELS"] del tem["EELS"] if "EDS" in tem: if "Detector" not in tem: tem["Detector"] = {} if "EDS" not in tem["Detector"]: tem["Detector"]["EDS"] = {} tem["Detector"]["EDS"] = tem["EDS"] del tem["EDS"] del tem if "SEM" in exp["metadata"]: if "Acquisition_instrument" not in exp["metadata"]: exp["metadata"]["Acquisition_instrument"] = {} exp["metadata"]["Acquisition_instrument"]["SEM"] = \ exp["metadata"]["SEM"] del exp["metadata"]["SEM"] sem = exp["metadata"]["Acquisition_instrument"]["SEM"] if "EDS" in sem: if "Detector" not in sem: sem["Detector"] = {} if "EDS" not in sem["Detector"]: sem["Detector"]["EDS"] = {} sem["Detector"]["EDS"] = sem["EDS"] del sem["EDS"] del sem if "Sample" in exp["metadata"] and "Xray_lines" in exp["metadata"][ "Sample"]: exp["metadata"]["Sample"]["xray_lines"] = exp["metadata"][ "Sample"]["Xray_lines"] del exp["metadata"]["Sample"]["Xray_lines"] for key in ["title", "date", "time", "original_filename"]: if key in exp["metadata"]: if "General" not in exp["metadata"]: exp["metadata"]["General"] = {} exp["metadata"]["General"][key] = exp["metadata"][key] del exp["metadata"][key] for key in ["record_by", "signal_origin", "signal_type"]: if key in exp["metadata"]: if "Signal" not in exp["metadata"]: exp["metadata"]["Signal"] = {} exp["metadata"]["Signal"][key] = exp["metadata"][key] del exp["metadata"][key] if current_file_version < LooseVersion("3.0"): if "Acquisition_instrument" in exp["metadata"]: # Move tilt_stage to Stage.tilt_a # Move exposure time to Detector.Camera.exposure_time if "TEM" in exp["metadata"]["Acquisition_instrument"]: tem = exp["metadata"]["Acquisition_instrument"]["TEM"] exposure = None if "tilt_stage" in tem: tem["Stage"] = {"tilt_a": tem["tilt_stage"]} del tem["tilt_stage"] if "exposure" in tem: exposure = "exposure" # Digital_micrograph plugin was parsing to 'exposure_time' # instead of 'exposure': need this to be compatible with # previous behaviour if "exposure_time" in tem: exposure = "exposure_time" if exposure is not None: if "Detector" not in tem: tem["Detector"] = { "Camera": { "exposure": tem[exposure] } } tem["Detector"]["Camera"] = {"exposure": tem[exposure]} del tem[exposure] # Move tilt_stage to Stage.tilt_a if "SEM" in exp["metadata"]["Acquisition_instrument"]: sem = exp["metadata"]["Acquisition_instrument"]["SEM"] if "tilt_stage" in sem: sem["Stage"] = {"tilt_a": sem["tilt_stage"]} del sem["tilt_stage"] return exp
def test_tile(shape, chunks, reps):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=chunks)
    assert_eq(np.tile(x, reps), da.tile(d, reps))
def test_tril_triu_non_square_arrays():
    A = np.random.randint(0, 11, (30, 35))
    dA = da.from_array(A, chunks=(5, 5))
    assert_eq(da.triu(dA), np.triu(A))
    assert_eq(da.tril(dA), np.tril(A))
def hdfgroup2dict(group, dictionary=None, lazy=False): if dictionary is None: dictionary = {} for key, value in group.attrs.items(): if isinstance(value, bytes): value = value.decode() if isinstance(value, (np.string_, str)): if value == '_None_': value = None elif isinstance(value, np.bool_): value = bool(value) elif isinstance(value, np.ndarray) and value.dtype.char == "S": # Convert strings to unicode value = value.astype("U") if value.dtype.str.endswith("U1"): value = value.tolist() # skip signals - these are handled below. if key.startswith('_sig_'): pass elif key.startswith('_list_empty_'): dictionary[key[len('_list_empty_'):]] = [] elif key.startswith('_tuple_empty_'): dictionary[key[len('_tuple_empty_'):]] = () elif key.startswith('_bs_'): dictionary[key[len('_bs_'):]] = value.tostring() # The following two elif stataments enable reading date and time from # v < 2 of HyperSpy's metadata specifications elif key.startswith('_datetime_date'): date_iso = datetime.date( *ast.literal_eval(value[value.index("("):])).isoformat() dictionary[key.replace("_datetime_", "")] = date_iso elif key.startswith('_datetime_time'): date_iso = datetime.time( *ast.literal_eval(value[value.index("("):])).isoformat() dictionary[key.replace("_datetime_", "")] = date_iso else: dictionary[key] = value if not isinstance(group, h5py.Dataset): for key in group.keys(): if key.startswith('_sig_'): from hyperspy.io import dict2signal dictionary[key[len('_sig_'):]] = (dict2signal( hdfgroup2signaldict(group[key], lazy=lazy))) elif isinstance(group[key], h5py.Dataset): dat = group[key] kn = key if key.startswith("_list_"): ans = np.array(dat) ans = ans.tolist() kn = key[6:] elif key.startswith("_tuple_"): ans = np.array(dat) ans = tuple(ans.tolist()) kn = key[7:] elif dat.dtype.char == "S": ans = np.array(dat) try: ans = ans.astype("U") except UnicodeDecodeError: # There are some strings that must stay in binary, # for example dill pickles. This will obviously also # let "wrong" binary string fail somewhere else... pass elif lazy: ans = da.from_array(dat, chunks=dat.chunks) else: ans = np.array(dat) dictionary[kn] = ans elif key.startswith('_hspy_AxesManager_'): dictionary[key[len('_hspy_AxesManager_'):]] = AxesManager([ i for k, i in sorted( iter(hdfgroup2dict(group[key], lazy=lazy).items())) ]) elif key.startswith('_list_'): dictionary[key[7 + key[6:].find('_'):]] = \ [i for k, i in sorted(iter( hdfgroup2dict( group[key], lazy=lazy).items() ))] elif key.startswith('_tuple_'): dictionary[key[8 + key[7:].find('_'):]] = tuple([ i for k, i in sorted( iter(hdfgroup2dict(group[key], lazy=lazy).items())) ]) else: dictionary[key] = {} hdfgroup2dict(group[key], dictionary[key], lazy=lazy) return dictionary
def test_reductions():
    x = np.random.random((20, 20))
    a = da.from_array(x, chunks=(7, 7))

    assert eq(a.argmin(axis=1), x.argmin(axis=1))
    assert eq(a.argmax(axis=0), x.argmax(axis=0))
def apply_multiplepoints(self, trav, dist=None, G0=None, nfft=None, rtm=False, greens=False, dottest=False, **kwargs_cgls): r"""Marchenko redatuming for multiple points Solve the Marchenko redatuming inverse problem for multiple points given their direct arrival traveltime curves (``trav``) and waveforms (``G0``). Parameters ---------- trav : :obj:`numpy.ndarray` Traveltime of first arrival from subsurface points to surface receivers of size :math:`[n_r \times n_{vs}]` dist: :obj:`numpy.ndarray`, optional Distance between subsurface point to surface receivers of size :math:`[n_r \times n_{vs}]` (if provided the analytical direct arrival will be computed using a 3d formulation) G0 : :obj:`numpy.ndarray`, optional Direct arrival in time domain of size :math:`[n_r \times n_{vs} \times n_t]` (if None, create arrival using ``trav``) nfft : :obj:`int`, optional Number of samples in fft when creating the analytical direct wave rtm : :obj:`bool`, optional Compute and return rtm redatuming greens : :obj:`bool`, optional Compute and return Green's functions dottest : :obj:`bool`, optional Apply dot-test **kwargs_cgls Arbitrary keyword arguments for :py:func:`pylops_distributed.optimization.cg.cgls` solver Returns ------- f1_inv_minus : :obj:`numpy.ndarray` Inverted upgoing focusing function of size :math:`[n_r \times n_{vs} \times n_t]` f1_inv_plus : :obj:`numpy.ndarray` Inverted downgoing focusing functionof size :math:`[n_r \times n_{vs} \times n_t]` p0_minus : :obj:`numpy.ndarray` Single-scattering standard redatuming upgoing Green's function of size :math:`[n_r \times n_{vs} \times n_t]` g_inv_minus : :obj:`numpy.ndarray` Inverted upgoing Green's function of size :math:`[n_r \times n_{vs} \times n_t]` g_inv_plus : :obj:`numpy.ndarray` Inverted downgoing Green's function of size :math:`[n_r \times n_{vs} \times n_t]` """ nvs = trav.shape[1] # Create window trav_off = trav - self.toff trav_off = np.round(trav_off / self.dt).astype(np.int) w = np.zeros((self.nr, nvs, self.nt), dtype=self.dtype) for ir in range(self.nr): for ivs in range(nvs): w[ir, ivs, :trav_off[ir, ivs]] = 1 w = np.concatenate((np.flip(w, axis=-1), w[:, :, 1:]), axis=-1) if self.nsmooth > 0: smooth = np.ones(self.nsmooth) / self.nsmooth w = filtfilt(smooth, 1, w) w = w.astype(self.dtype) # Create operators Rop = MDC(self.Rtwosided_fft, self.nt2, nv=nvs, dt=self.dt, dr=self.dr, twosided=True, conj=False, saveGt=self.saveRt, prescaled=self.prescaled) R1op = MDC(self.Rtwosided_fft, self.nt2, nv=nvs, dt=self.dt, dr=self.dr, twosided=True, conj=True, saveGt=self.saveRt, prescaled=self.prescaled) Rollop = Roll(self.ns * nvs * self.nt2, dims=(self.nt2, self.ns, nvs), dir=0, shift=-1, dtype=self.dtype) Wop = Diagonal(da.from_array(w.transpose(2, 0, 1).flatten()), dtype=self.dtype) Iop = Identity(self.nr * nvs * self.nt2, dtype=self.dtype) Mop = Block([[Iop, -1 * Wop * Rop], [-1 * Wop * Rollop * R1op, Iop]]) * BlockDiag([Wop, Wop]) Gop = Block([[Iop, -1 * Rop], [-1 * Rollop * R1op, Iop]]) if dottest: Dottest(Gop, 2 * self.nr * nvs * self.nt2, 2 * self.nr * nvs * self.nt2, chunks=(2 * self.ns * nvs * self.nt2, 2 * self.nr * nvs * self.nt2), raiseerror=True, verb=True) if dottest: Dottest(Mop, 2 * self.ns * nvs * self.nt2, 2 * self.nr * nvs * self.nt2, chunks=(2 * self.ns * nvs * self.nt2, 2 * self.nr * nvs * self.nt2), raiseerror=True, verb=True) # Create input focusing function if G0 is None: if self.wav is not None and nfft is not None: G0 = np.zeros((self.nr, nvs, self.nt), dtype=self.dtype) for ivs in range(nvs): G0[:, ivs] = 
(directwave(self.wav, trav[:, ivs], self.nt, self.dt, nfft=nfft, derivative=True, dist=dist, kind='2d' if dist is None else '3d')).T else: logging.error('wav and/or nfft are not provided. ' 'Provide either G0 or wav and nfft...') raise ValueError('wav and/or nfft are not provided. ' 'Provide either G0 or wav and nfft...') G0 = G0.astype(self.dtype) fd_plus = np.concatenate((np.flip(G0, axis=-1).transpose(2, 0, 1), np.zeros((self.nt - 1, self.nr, nvs), dtype=self.dtype))) fd_plus = da.from_array(fd_plus).rechunk(fd_plus.shape) # Run standard redatuming as benchmark if rtm: p0_minus = Rop * fd_plus.flatten() p0_minus = p0_minus.reshape(self.nt2, self.ns, nvs).transpose(1, 2, 0) # Create data and inverse focusing functions d = Wop * Rop * fd_plus.flatten() d = da.concatenate((d.reshape(self.nt2, self.ns, nvs), da.zeros((self.nt2, self.ns, nvs), dtype=self.dtype))) # Invert for focusing functions f1_inv = cgls(Mop, d.flatten(), **kwargs_cgls)[0] f1_inv = f1_inv.reshape(2 * self.nt2, self.nr, nvs) f1_inv_tot = \ f1_inv + da.concatenate((da.zeros((self.nt2, self.nr, nvs), dtype=self.dtype), fd_plus)) if greens: # Create Green's functions g_inv = Gop * f1_inv_tot.flatten() g_inv = g_inv.reshape(2 * self.nt2, self.ns, nvs) # Compute if rtm and greens: d, p0_minus, f1_inv_tot, g_inv = \ da.compute(d, p0_minus, f1_inv_tot, g_inv) elif rtm: d, p0_minus, f1_inv_tot = \ da.compute(d, p0_minus, f1_inv_tot) elif greens: d, f1_inv_tot, g_inv = \ da.compute(d, f1_inv_tot, g_inv) else: d, f1_inv_tot = \ da.compute(d, f1_inv_tot) # Separate focusing and Green's functions f1_inv_minus = f1_inv_tot[:self.nt2].transpose(1, 2, 0) f1_inv_plus = f1_inv_tot[self.nt2:].transpose(1, 2, 0) if greens: g_inv_minus = -g_inv[:self.nt2].transpose(1, 2, 0) g_inv_plus = np.flip(g_inv[self.nt2:], axis=0).transpose(1, 2, 0) if rtm and greens: return f1_inv_minus, f1_inv_plus, p0_minus, g_inv_minus, g_inv_plus elif rtm: return f1_inv_minus, f1_inv_plus, p0_minus elif greens: return f1_inv_minus, f1_inv_plus, g_inv_minus, g_inv_plus else: return f1_inv_minus, f1_inv_plus
def test_confusion_matrix_normalize(normalize, expected_results, client):
    y_test = da.from_array(cp.array([0, 1, 2] * 6))
    y_pred = da.from_array(cp.array(list(chain(*permutations([0, 1, 2])))))
    cm = confusion_matrix(y_test, y_pred, normalize=normalize)
    cp.testing.assert_allclose(cm, cp.array(expected_results))
## Kinematics #branches = ["pho_pT", "pho_E", "pho_eta", "pho_phi"] #X_p4 = da.concatenate([\ # da.from_delayed(\ # load_single(tree,i,i+chunk_size, branches),\ # shape=(chunk_size,len(branches)),\ # dtype=np.float32)\ # for i in range(0,neff,chunk_size)]) #print " >> Expected shape:", X_p4.shape # Class label label = j label = 0 print " >> Class label:", label y = da.from_array(\ np.full(X.shape[0], label, dtype=np.float32),\ chunks=(chunk_size,)) #file_out_str = "%s/%s_IMG_RH%d_n%dk_label%d.hdf5"%(eosDir,decay,int(scale),neff//1000.,label) file_out_str = "%s/%s_IMGcrop_RH%d_n%dkx2.hdf5" % ( eosDir, decay, int(scale), neff // 1000.) #file_out_str = "test.hdf5" print " >> Writing to:", file_out_str #da.to_hdf5(file_out_str, {'/X': X, '/y': y, 'eventId': eventId, 'X_crop0': X_crop0, 'X_crop1': X_crop1}, compression='lzf') da.to_hdf5( file_out_str, { #'/X': X, '/y': y, #'eventId': eventId, 'X_crop0': X_crop0,
def make_low_rank_matrix(n_samples=100, n_features=100, effective_rank=10, tail_strength=0.5, random_state=None, n_parts=1, n_samples_per_part=None, dtype='float32'): """ Generate a mostly low rank matrix with bell-shaped singular values Parameters ---------- n_samples : int, optional (default=100) The number of samples. n_features : int, optional (default=100) The number of features. effective_rank : int, optional (default=10) The approximate number of singular vectors required to explain most of the data by linear combinations. tail_strength : float between 0.0 and 1.0, optional (default=0.5) The relative importance of the fat noisy tail of the singular values profile. random_state : int, CuPy RandomState instance, Dask RandomState instance \ or None (default) Determines random number generation for dataset creation. Pass an int for reproducible output across multiple function calls. n_parts : int, optional (default=1) The number of parts of work. dtype: str, optional (default='float32') dtype of generated data Returns ------- X : Dask-CuPy array of shape [n_samples, n_features] The matrix. """ rs = _create_rs_generator(random_state) n = min(n_samples, n_features) # Random (ortho normal) vectors m1 = rs.standard_normal( (n_samples, n), chunks=(_generate_chunks_for_qr(n_samples, n, n_parts), -1), dtype=dtype) u, _ = da.linalg.qr(m1) m2 = rs.standard_normal( (n, n_features), chunks=(-1, _generate_chunks_for_qr(n_features, n, n_parts)), dtype=dtype) v, _ = da.linalg.qr(m2) # For final multiplication if n_samples_per_part is None: n_samples_per_part = max(1, int(n_samples / n_parts)) u = u.rechunk({0: n_samples_per_part, 1: -1}) v = v.rechunk({0: n_samples_per_part, 1: -1}) local_s = _generate_singular_values(n, effective_rank, tail_strength, n_samples_per_part) s = da.from_array(local_s, chunks=(int(n_samples_per_part), )) u *= s return da.dot(u, v)
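# Hypothetical usage sketch of the generator above; the parameter values are
# illustrative only and a Dask(-CUDA) scheduler is assumed to be available.
X = make_low_rank_matrix(n_samples=1000, n_features=200, effective_rank=10,
                         tail_strength=0.5, random_state=0, n_parts=4,
                         dtype='float32')
print(X.shape, X.chunks)  # lazy Dask array; materialize with X.compute() or X.persist()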
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs): """ Read dask Dataframe from bcolz.ctable Parameters ---------- x : bcolz.ctable Input data chunksize : int (optional) The size of blocks to pull out from ctable. Ideally as large as can comfortably fit in memory categorize : bool (defaults to True) Automatically categorize all string dtypes index : string (optional) Column to make the index See Also -------- from_array: more generic function not optimized for bcolz """ import dask.array as da import bcolz if isinstance(x, (str, unicode)): x = bcolz.ctable(rootdir=x) bc_chunklen = max(x[name].chunklen for name in x.names) if chunksize is None and bc_chunklen > 10000: chunksize = bc_chunklen categories = dict() if categorize: for name in x.names: if (np.issubdtype(x.dtype[name], np.string_) or np.issubdtype(x.dtype[name], np.unicode_) or np.issubdtype(x.dtype[name], np.object_)): a = da.from_array(x[name], chunks=(chunksize * len(x.names), )) categories[name] = da.unique(a) columns = tuple(x.dtype.names) divisions = (0, ) + tuple(range(-1, len(x), chunksize))[1:] if divisions[-1] != len(x) - 1: divisions = divisions + (len(x) - 1, ) if x.rootdir: token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize, categorize, index, kwargs) else: token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize, index, kwargs) new_name = 'from_bcolz-' + token dsk = dict(((new_name, i), (locked_df_from_ctable, x, (slice(i * chunksize, (i + 1) * chunksize), ), columns, categories)) for i in range(0, int(ceil(len(x) / chunksize)))) result = DataFrame(dsk, new_name, columns, divisions) if index: assert index in x.names a = da.from_array(x[index], chunks=(chunksize * len(x.names), )) q = np.linspace(0, 100, len(x) // chunksize + 2) divisions = da.percentile(a, q).compute() return set_partition(result, index, divisions, **kwargs) else: return result
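# A minimal, hedged usage sketch of from_bcolz; the column names and values are
# made up for illustration, and bcolz is assumed to be installed. A rootdir
# string pointing at an on-disk ctable could be passed instead of the ctable.
import numpy as np
import bcolz

ct = bcolz.ctable(columns=[np.arange(1000), np.random.random(1000)],
                  names=['id', 'value'])
ddf = from_bcolz(ct, chunksize=250, index='id')  # dask DataFrame indexed by 'id'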
def test_slicing_with_Nones(shape, slice):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=shape)
    assert_eq(x[slice], d[slice])
def add_ancillary_datasets(scene, lons, lats, sunz, satz, azidiff, chunks=(512, 3712)): """Add ancillary datasets to the scene. Args: lons: Longitude coordinates lats: Latitude coordinates sunz: Solar zenith angle satz: Satellite zenith angle azidiff: Absolute azimuth difference angle chunks: Chunksize """ start_time = scene['IR_108'].attrs['start_time'] end_time = scene['IR_108'].attrs['end_time'] angle_coords = scene['IR_108'].coords # Latitude scene['lat'] = xr.DataArray(da.from_array(lats, chunks=chunks), dims=['y', 'x'], coords={ 'y': scene['IR_108']['y'], 'x': scene['IR_108']['x'] }) scene['lat'].attrs['long_name'] = 'latitude coordinate' scene['lat'].attrs['standard_name'] = 'latitude' scene['lat'].attrs['units'] = 'degrees_north' scene['lat'].attrs['start_time'] = start_time scene['lat'].attrs['end_time'] = end_time # Longitude scene['lon'] = xr.DataArray(da.from_array(lons, chunks=chunks), dims=['y', 'x'], coords={ 'y': scene['IR_108']['y'], 'x': scene['IR_108']['x'] }) scene['lon'].attrs['long_name'] = 'longitude coordinate' scene['lon'].attrs['standard_name'] = 'longitude' scene['lon'].attrs['units'] = 'degrees_east' scene['lon'].attrs['start_time'] = start_time scene['lon'].attrs['end_time'] = end_time # Sunzenith scene['sunzenith'] = xr.DataArray(da.from_array(sunz[:, :], chunks=chunks), dims=['y', 'x'], coords=angle_coords) # Satzenith scene['satzenith'] = xr.DataArray(da.from_array(satz[:, :], chunks=chunks), dims=['y', 'x'], coords=angle_coords) # Azidiff scene['azimuthdiff'] = xr.DataArray(da.from_array(azidiff[:, :], chunks=chunks), dims=['y', 'x'], coords=angle_coords) # Update the attributes update_angle_attributes(scene, band=scene['IR_108'])
def test_tile_neg_reps(shape, chunks, reps):
    x = np.random.random(shape)
    d = da.from_array(x, chunks=chunks)
    with pytest.raises(ValueError):
        da.tile(d, reps)
def test_slicing_consistent_names():
    x = np.arange(100).reshape((10, 10))
    a = da.from_array(x, chunks=(5, 5))
    assert same_keys(a[0], a[0])
    assert same_keys(a[:, [1, 2, 3]], a[:, [1, 2, 3]])
    assert same_keys(a[:, 5:2:-1], a[:, 5:2:-1])
def test_diagonal(): v = np.arange(11) with pytest.raises(ValueError): da.diagonal(v) v = np.arange(4).reshape((2, 2)) with pytest.raises(ValueError): da.diagonal(v, axis1=0, axis2=0) with pytest.raises(AxisError): da.diagonal(v, axis1=-4) with pytest.raises(AxisError): da.diagonal(v, axis2=-4) v = np.arange(4 * 5 * 6).reshape((4, 5, 6)) v = da.from_array(v, chunks=2) assert_eq(da.diagonal(v), np.diagonal(v)) # Empty diagonal. assert_eq(da.diagonal(v, offset=10), np.diagonal(v, offset=10)) assert_eq(da.diagonal(v, offset=-10), np.diagonal(v, offset=-10)) with pytest.raises(ValueError): da.diagonal(v, axis1=-2) # Negative axis. assert_eq(da.diagonal(v, axis1=-1), np.diagonal(v, axis1=-1)) assert_eq(da.diagonal(v, offset=1, axis1=-1), np.diagonal(v, offset=1, axis1=-1)) # Heterogenous chunks. v = np.arange(2 * 3 * 4 * 5 * 6).reshape((2, 3, 4, 5, 6)) v = da.from_array(v, chunks=(1, (1, 2), (1, 2, 1), (2, 1, 2), (5, 1))) assert_eq(da.diagonal(v), np.diagonal(v)) assert_eq( da.diagonal(v, offset=2, axis1=3, axis2=1), np.diagonal(v, offset=2, axis1=3, axis2=1), ) assert_eq( da.diagonal(v, offset=-2, axis1=3, axis2=1), np.diagonal(v, offset=-2, axis1=3, axis2=1), ) assert_eq( da.diagonal(v, offset=-2, axis1=3, axis2=4), np.diagonal(v, offset=-2, axis1=3, axis2=4), ) assert_eq(da.diagonal(v, 1), np.diagonal(v, 1)) assert_eq(da.diagonal(v, -1), np.diagonal(v, -1)) # Positional arguments assert_eq(da.diagonal(v, 1, 2, 1), np.diagonal(v, 1, 2, 1)) v = np.arange(2 * 3 * 4 * 5 * 6).reshape((2, 3, 4, 5, 6)) assert_eq(da.diagonal(v, axis1=1, axis2=3), np.diagonal(v, axis1=1, axis2=3)) assert_eq( da.diagonal(v, offset=1, axis1=1, axis2=3), np.diagonal(v, offset=1, axis1=1, axis2=3), ) assert_eq( da.diagonal(v, offset=1, axis1=3, axis2=1), np.diagonal(v, offset=1, axis1=3, axis2=1), ) assert_eq( da.diagonal(v, offset=-5, axis1=3, axis2=1), np.diagonal(v, offset=-5, axis1=3, axis2=1), ) assert_eq( da.diagonal(v, offset=-6, axis1=3, axis2=1), np.diagonal(v, offset=-6, axis1=3, axis2=1), ) assert_eq( da.diagonal(v, offset=-6, axis1=-3, axis2=1), np.diagonal(v, offset=-6, axis1=-3, axis2=1), ) assert_eq( da.diagonal(v, offset=-6, axis1=-3, axis2=1), np.diagonal(v, offset=-6, axis1=-3, axis2=1), ) v = da.from_array(v, chunks=2) assert_eq( da.diagonal(v, offset=1, axis1=3, axis2=1), np.diagonal(v, offset=1, axis1=3, axis2=1), ) assert_eq( da.diagonal(v, offset=-1, axis1=3, axis2=1), np.diagonal(v, offset=-1, axis1=3, axis2=1), ) v = np.arange(384).reshape((8, 8, 6)) assert_eq(da.diagonal(v, offset=-1, axis1=2), np.diagonal(v, offset=-1, axis1=2)) v = da.from_array(v, chunks=(4, 4, 2)) assert_eq(da.diagonal(v, offset=-1, axis1=2), np.diagonal(v, offset=-1, axis1=2))
def _create_data(objective, n_samples=100, output='array', chunk_size=50, **kwargs): if objective.endswith('classification'): if objective == 'binary-classification': centers = [[-4, -4], [4, 4]] elif objective == 'multiclass-classification': centers = [[-4, -4], [4, 4], [-4, 4]] else: raise ValueError(f"Unknown classification task '{objective}'") X, y = make_blobs(n_samples=n_samples, centers=centers, random_state=42) elif objective == 'regression': X, y = make_regression(n_samples=n_samples, random_state=42) elif objective == 'ranking': return _create_ranking_data(n_samples=n_samples, output=output, chunk_size=chunk_size, **kwargs) else: raise ValueError("Unknown objective '%s'" % objective) rnd = np.random.RandomState(42) weights = rnd.random(X.shape[0]) * 0.01 if output == 'array': dX = da.from_array(X, (chunk_size, X.shape[1])) dy = da.from_array(y, chunk_size) dw = da.from_array(weights, chunk_size) elif output.startswith('dataframe'): X_df = pd.DataFrame( X, columns=['feature_%d' % i for i in range(X.shape[1])]) if output == 'dataframe-with-categorical': num_cat_cols = 5 for i in range(num_cat_cols): col_name = "cat_col" + str(i) cat_values = rnd.choice(['a', 'b'], X.shape[0]) cat_series = pd.Series(cat_values, dtype='category') X_df[col_name] = cat_series X = np.hstack((X, cat_series.cat.codes.values.reshape(-1, 1))) # for the small data sizes used in tests, it's hard to get LGBMRegressor to choose # categorical features for splits. So for regression tests with categorical features, # _create_data() returns a DataFrame with ONLY categorical features if objective == 'regression': cat_cols = [ col for col in X_df.columns if col.startswith('cat_col') ] X_df = X_df[cat_cols] X = X[:, -num_cat_cols:] y_df = pd.Series(y, name='target') dX = dd.from_pandas(X_df, chunksize=chunk_size) dy = dd.from_pandas(y_df, chunksize=chunk_size) dw = dd.from_array(weights, chunksize=chunk_size) elif output == 'scipy_csr_matrix': dX = da.from_array(X, chunks=(chunk_size, X.shape[1])).map_blocks(csr_matrix) dy = da.from_array(y, chunks=chunk_size) dw = da.from_array(weights, chunk_size) else: raise ValueError("Unknown output type '%s'" % output) return X, y, weights, None, dX, dy, dw, None
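# Hedged usage sketch: assumes the _create_data helper above and its imports
# (numpy as np, dask.array as da, dask.dataframe as dd, pandas as pd,
# sklearn.datasets.make_blobs/make_regression, scipy.sparse.csr_matrix) are in scope.
# It only checks the shapes and chunking the helper is expected to produce.
X, y, w, _, dX, dy, dw, _ = _create_data('binary-classification',
                                         n_samples=100,
                                         output='array',
                                         chunk_size=50)
assert dX.shape == (100, 2) and dX.chunks[0] == (50, 50)
assert dy.chunks == ((50, 50),)
assert dw.chunks == ((50, 50),)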
def test_tril_triu_errors(): A = np.random.randint(0, 11, (10, 10, 10)) dA = da.from_array(A, chunks=(5, 5, 5)) pytest.raises(ValueError, lambda: da.triu(dA))
def test_issignedinf(): arr = np.random.randint(-1, 2, size=(20, 20)).astype(float) / 0 darr = da.from_array(arr, 3) assert_eq(np.isneginf(arr), da.isneginf(darr)) assert_eq(np.isposinf(arr), da.isposinf(darr))
def fit(model, x, y, compute=True, **kwargs): """ Fit scikit learn model against dask arrays Model must support the ``partial_fit`` interface for online or batch learning. This method will be called on dask arrays in sequential order. Ideally your rows are independent and identically distributed. Parameters ---------- model: sklearn model Any model supporting partial_fit interface x: dask Array Two dimensional array, likely tall and skinny y: dask Array One dimensional array with same chunks as x's rows kwargs: options to pass to partial_fit Examples -------- >>> import dask.array as da >>> X = da.random.random((10, 3), chunks=(5, 3)) >>> y = da.random.randint(0, 2, 10, chunks=(5,)) >>> from sklearn.linear_model import SGDClassifier >>> sgd = SGDClassifier() >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0]) >>> sgd # doctest: +SKIP SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, shuffle=False, verbose=0, warm_start=False) This passes all of X and y through the classifier sequentially. We can use the classifier as normal on in-memory data >>> import numpy as np >>> sgd.predict(np.random.random((4, 3))) # doctest: +SKIP array([1, 0, 0, 1]) Or predict on a larger dataset >>> z = da.random.random((400, 3), chunks=(100, 3)) >>> da.learn.predict(sgd, z) # doctest: +SKIP dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64> """ assert x.ndim == 2 if isinstance(x, np.ndarray): x = da.from_array(x, chunks=x.shape) if isinstance(y, np.ndarray): y = da.from_array(y, chunks=y.shape) if y is not None: assert y.ndim == 1 assert x.chunks[0] == y.chunks[0] assert hasattr(model, 'partial_fit') if len(x.chunks[1]) > 1: x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1]))) nblocks = len(x.chunks[0]) name = 'fit-' + dask.base.tokenize(model, x, y, kwargs) dsk = {(name, -1): model} dsk.update({(name, i): (_partial_fit, (name, i - 1), (x.name, i, 0), (getattr(y, 'name', ''), i), kwargs) for i in range(nblocks)}) new_dsk = dask.sharedict.merge((name, dsk), x.dask, getattr(y, 'dask', {})) value = Delayed((name, nblocks - 1), new_dsk) if compute: return value.compute() else: return value
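# _partial_fit is referenced in the task graph above but not shown. A minimal sketch
# of what such a helper plausibly does (an assumption, not the verbatim implementation):
# it simply threads the model through successive partial_fit calls, so task (name, i)
# depends on the model produced by task (name, i - 1).
def _partial_fit(model, x, y, kwargs=None):
    kwargs = kwargs or {}
    model.partial_fit(x, y, **kwargs)
    return model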
def main(args): if args.psf_pars is None: print("Attempting to take psf_pars from residual fits header") try: rhdr = fits.getheader(args.residual) except KeyError: raise RuntimeError("Either provide a residual with beam " "information or pass them in using --psf_pars " "argument") if 'BMAJ1' in rhdr.keys(): emaj = rhdr['BMAJ1'] emin = rhdr['BMIN1'] pa = rhdr['BPA1'] gaussparf = (emaj, emin, pa) elif 'BMAJ' in rhdr.keys(): emaj = rhdr['BMAJ'] emin = rhdr['BMIN'] pa = rhdr['BPA'] gaussparf = (emaj, emin, pa) else: gaussparf = tuple(args.psf_pars) if args.circ_psf: e = (gaussparf[0] + gaussparf[1]) / 2.0 gaussparf[0] = e gaussparf[1] = e print("Using emaj = %3.2e, emin = %3.2e, PA = %3.2e \n" % gaussparf) # load model image model = load_fits(args.model, dtype=args.out_dtype) model = model.squeeze() orig_shape = model.shape mhdr = fits.getheader(args.model) l_coord, ref_l = data_from_header(mhdr, axis=1) l_coord -= ref_l m_coord, ref_m = data_from_header(mhdr, axis=2) m_coord -= ref_m if mhdr["CTYPE4"].lower() == 'freq': freq_axis = 4 elif mhdr["CTYPE3"].lower() == 'freq': freq_axis = 3 else: raise ValueError("Freq axis must be 3rd or 4th") mfs_shape = list(orig_shape) mfs_shape[0] = 1 mfs_shape = tuple(mfs_shape) freqs, ref_freq = data_from_header(mhdr, axis=freq_axis) nband = freqs.size if nband < 2: raise ValueError("Can't produce alpha map from a single band image") npix_l = l_coord.size npix_m = m_coord.size # update cube psf-pars for i in range(1, nband + 1): mhdr['BMAJ' + str(i)] = gaussparf[0] mhdr['BMIN' + str(i)] = gaussparf[1] mhdr['BPA' + str(i)] = gaussparf[2] if args.ref_freq is not None and args.ref_freq != ref_freq: ref_freq = args.ref_freq print( 'Provided reference frequency does not match that of fits file. Will overwrite.' ) print("Cube frequencies:") with np.printoptions(precision=2): print(freqs) print("Reference frequency is %3.2e Hz \n" % ref_freq) # LB - new header for cubes if ref_freqs differ new_hdr = set_header_info(mhdr, ref_freq, freq_axis, args, gaussparf) # save next to model if no outfile is provided if args.output_filename is None: # strip .fits from model filename tmp = args.model[::-1] idx = tmp.find('.') outfile = args.model[0:-(idx + 1)] else: outfile = args.output_filename xx, yy = np.meshgrid(l_coord, m_coord, indexing='ij') # load beam if args.beam_model is not None: bhdr = fits.getheader(args.beam_model) l_coord_beam, ref_lb = data_from_header(bhdr, axis=1) l_coord_beam -= ref_lb if not np.array_equal(l_coord_beam, l_coord): raise ValueError( "l coordinates of beam model do not match those of image. Use power_beam_maker to interpolate to fits header." ) m_coord_beam, ref_mb = data_from_header(bhdr, axis=2) m_coord_beam -= ref_mb if not np.array_equal(m_coord_beam, m_coord): raise ValueError( "m coordinates of beam model do not match those of image. Use power_beam_maker to interpolate to fits header." ) freqs_beam, _ = data_from_header(bhdr, axis=freq_axis) if not np.array_equal(freqs, freqs_beam): raise ValueError( "Freqs of beam model do not match those of image. Use power_beam_maker to interpolate to fits header." 
            )
        beam_image = load_fits(args.beam_model,
                               dtype=args.out_dtype).reshape(model.shape)
    else:
        beam_image = np.ones(model.shape, dtype=args.out_dtype)

    # do beam correction LB - TODO: use forward model instead
    beammin = np.amin(beam_image, axis=0)[None, :, :]
    model = np.where(beammin >= args.pb_min, model / beam_image, 0.0)

    if not args.dont_convolve:
        print("Computing clean beam")
        # convolve model to desired resolution
        model, gausskern = convolve2gaussres(model, xx, yy, gaussparf,
                                             args.ncpu, None,
                                             args.padding_frac)
        # save clean beam
        if 'c' in args.products:
            name = outfile + '.clean_psf.fits'
            save_fits(name, gausskern.reshape(mfs_shape), new_hdr,
                      dtype=args.out_dtype)
            print("Wrote clean psf to %s \n" % name)

        # save convolved model
        if 'm' in args.products:
            name = outfile + '.convolved_model.fits'
            save_fits(name, model.reshape(orig_shape), new_hdr,
                      dtype=args.out_dtype)
            print("Wrote convolved model to %s \n" % name)

    # add in residuals and set threshold
    if args.residual is not None:
        resid = load_fits(args.residual, dtype=args.out_dtype).squeeze()
        rhdr = fits.getheader(args.residual)
        l_res, ref_lb = data_from_header(rhdr, axis=1)
        l_res -= ref_lb
        if not np.array_equal(l_res, l_coord):
            raise ValueError(
                "l coordinates of residual do not match those of model")
        m_res, ref_mb = data_from_header(rhdr, axis=2)
        m_res -= ref_mb
        if not np.array_equal(m_res, m_coord):
            raise ValueError(
                "m coordinates of residual do not match those of model")
        freqs_res, _ = data_from_header(rhdr, axis=freq_axis)
        if not np.array_equal(freqs, freqs_res):
            raise ValueError("Freqs of residual do not match those of model")

        # convolve residual to same resolution as model
        gausspari = ()
        for i in range(1, nband + 1):
            key = 'BMAJ' + str(i)
            if key in rhdr.keys():
                emaj = rhdr[key]
                emin = rhdr['BMIN' + str(i)]
                pa = rhdr['BPA' + str(i)]
                gausspari += ((emaj, emin, pa), )
            else:
                print(
                    "Can't find Gausspars in residual header, unable to add residuals back in"
                )
                gausspari = None
                break

        if gausspari is not None and args.add_convolved_residuals:
            resid, _ = convolve2gaussres(resid, xx, yy, gaussparf, args.ncpu,
                                         gausspari, args.padding_frac,
                                         norm_kernel=True)
            model += resid
            print("Convolved residuals added to convolved model")

            if 'c' in args.products:
                name = outfile + '.convolved_residual.fits'
                save_fits(name, resid.reshape(orig_shape), rhdr)
                print("Wrote convolved residuals to %s" % name)

        counts = np.sum(resid != 0)
        rms = np.sqrt(np.sum(resid**2) / counts)
        rms_cube = np.std(resid.reshape(nband, npix_l * npix_m),
                          axis=1).ravel()
        threshold = args.threshold * rms
        print("Setting cutoff threshold as %i times the rms "
              "of the residual " % args.threshold)
        del resid
    else:
        print("No residual provided. Setting threshold i.t.o dynamic range. "
              "Max dynamic range is %i " % args.maxDR)
        threshold = model.max() / args.maxDR
        rms_cube = None

    print("Threshold set to %f Jy. \n" % threshold)

    # get pixels above threshold
    minimage = np.amin(model, axis=0)
    maskindices = np.argwhere(minimage > threshold)
    if not maskindices.size:
        raise ValueError("No components found above threshold. "
                         "Try lowering your threshold."
                         "Max of convolved model is %3.2e" % model.max())
    fitcube = model[:, maskindices[:, 0], maskindices[:, 1]].T

    # set weights for fit
    if rms_cube is not None:
        print("Using RMS in each imaging band to determine weights. \n")
        weights = np.where(rms_cube > 0, 1.0 / rms_cube**2, 0.0)
        # normalise
        weights /= weights.max()
    else:
        if args.channel_weights is not None:
            weights = np.array(args.channel_weights)
            print("Using provided channel weights \n")
        else:
            print(
                "No residual or channel weights provided. Using equal weights. \n"
            )
            weights = np.ones(nband, dtype=np.float64)

    ncomps, _ = fitcube.shape
    fitcube = da.from_array(fitcube.astype(np.float64),
                            chunks=(ncomps // args.ncpu, nband))
    weights = da.from_array(weights.astype(np.float64), chunks=(nband))
    freqsdask = da.from_array(freqs.astype(np.float64), chunks=(nband))

    print("Fitting %i components" % ncomps)
    alpha, alpha_err, Iref, i0_err = fit_spi_components(
        fitcube, weights, freqsdask, np.float64(ref_freq)).compute()
    print("Done. Writing output. \n")

    alphamap = np.zeros(model[0].shape, dtype=model.dtype)
    alpha_err_map = np.zeros(model[0].shape, dtype=model.dtype)
    i0map = np.zeros(model[0].shape, dtype=model.dtype)
    i0_err_map = np.zeros(model[0].shape, dtype=model.dtype)
    alphamap[maskindices[:, 0], maskindices[:, 1]] = alpha
    alpha_err_map[maskindices[:, 0], maskindices[:, 1]] = alpha_err
    i0map[maskindices[:, 0], maskindices[:, 1]] = Iref
    i0_err_map[maskindices[:, 0], maskindices[:, 1]] = i0_err

    if 'I' in args.products:
        # get the reconstructed cube
        Irec_cube = i0map[None, :, :] * \
            (freqs[:, None, None]/ref_freq)**alphamap[None, :, :]
        name = outfile + '.Irec_cube.fits'
        save_fits(name, Irec_cube.reshape(orig_shape), mhdr,
                  dtype=args.out_dtype)
        print("Wrote reconstructed cube to %s" % name)

    # save alpha map
    if 'a' in args.products:
        name = outfile + '.alpha.fits'
        save_fits(name, alphamap.reshape(mfs_shape), mhdr,
                  dtype=args.out_dtype)
        print("Wrote alpha map to %s" % name)

    # save alpha error map
    if 'e' in args.products:
        name = outfile + '.alpha_err.fits'
        save_fits(name, alpha_err_map.reshape(mfs_shape), mhdr,
                  dtype=args.out_dtype)
        print("Wrote alpha error map to %s" % name)

    # save I0 map
    if 'i' in args.products:
        name = outfile + '.I0.fits'
        save_fits(name, i0map.reshape(mfs_shape), mhdr,
                  dtype=args.out_dtype)
        print("Wrote I0 map to %s" % name)

    # save I0 error map
    if 'k' in args.products:
        name = outfile + '.I0_err.fits'
        save_fits(name, i0_err_map.reshape(mfs_shape), mhdr,
                  dtype=args.out_dtype)
        print("Wrote I0 error map to %s" % name)

    print(' \n ')
    print("All done here")
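# Standalone sketch (hypothetical data) of the masking pattern used in main() above:
# argwhere on the band-minimum image gives (row, col) index pairs, and fancy indexing
# pulls out one spectrum per selected pixel, shaped (ncomps, nband) after the .T.
import numpy as np

nband, ny, nx = 4, 8, 8
model = np.random.RandomState(0).random_sample((nband, ny, nx))
threshold = 0.5
minimage = np.amin(model, axis=0)
maskindices = np.argwhere(minimage > threshold)
fitcube = model[:, maskindices[:, 0], maskindices[:, 1]].T
print(fitcube.shape)  # (number of selected pixels, nband)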
assert hasattr(gs, "dask_graph_") with tmpdir() as d: gs.visualize(filename=os.path.join(d, "mydask")) assert os.path.exists(os.path.join(d, "mydask.png")) # Doesn't work if not fitted gs = dcv.GridSearchCV(clf, grid) with pytest.raises(NotFittedError): gs.visualize() np_X = np.random.normal(size=(20, 3)) np_y = np.random.randint(2, size=20) np_groups = np.random.permutation(list(range(5)) * 4) da_X = da.from_array(np_X, chunks=(3, 3)) da_y = da.from_array(np_y, chunks=3) da_groups = da.from_array(np_groups, chunks=3) del_X = delayed(np_X) del_y = delayed(np_y) del_groups = delayed(np_groups) @pytest.mark.parametrize( ["cls", "has_shuffle"], [ (KFold, True), (GroupKFold, False), (StratifiedKFold, True), (TimeSeriesSplit, False), ],
def test_multiple_list_slicing(): x = np.random.rand(6, 7, 8) a = da.from_array(x, chunks=(3, 3, 3)) assert_eq(x[:, [0, 1, 2]][[0, 1]], a[:, [0, 1, 2]][[0, 1]])
def test_confusion_matrix_binary(client, chunks): y_true = da.from_array(cp.array([0, 1, 0, 1]), chunks=chunks) y_pred = da.from_array(cp.array([1, 1, 1, 0]), chunks=chunks) tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() ref = cp.array([0, 2, 1, 1]) cp.testing.assert_array_equal(ref, cp.array([tn, fp, fn, tp]))
def test_confusion_matrix(client, chunks): y_true = da.from_array(cp.array([2, 0, 2, 2, 0, 1]), chunks=chunks) y_pred = da.from_array(cp.array([0, 0, 2, 2, 0, 2]), chunks=chunks) cm = confusion_matrix(y_true, y_pred) ref = cp.array([[2, 0, 0], [0, 0, 1], [1, 0, 2]]) cp.testing.assert_array_equal(cm, ref)
def read_band(self, key, info): """Read the data.""" tic = datetime.now() header = {} with open(self.filename, "rb") as fp_: header['block1'] = np.fromfile(fp_, dtype=_BASIC_INFO_TYPE, count=1) header["block2"] = np.fromfile(fp_, dtype=_DATA_INFO_TYPE, count=1) header["block3"] = np.fromfile(fp_, dtype=_PROJ_INFO_TYPE, count=1) header["block4"] = np.fromfile(fp_, dtype=_NAV_INFO_TYPE, count=1) header["block5"] = np.fromfile(fp_, dtype=_CAL_INFO_TYPE, count=1) logger.debug("Band number = " + str(header["block5"]['band_number'][0])) logger.debug('Time_interval: %s - %s', str(self.start_time), str(self.end_time)) band_number = header["block5"]['band_number'][0] if band_number < 7: cal = np.fromfile(fp_, dtype=_VISCAL_INFO_TYPE, count=1) else: cal = np.fromfile(fp_, dtype=_IRCAL_INFO_TYPE, count=1) header['calibration'] = cal header["block6"] = np.fromfile(fp_, dtype=_INTER_CALIBRATION_INFO_TYPE, count=1) header["block7"] = np.fromfile(fp_, dtype=_SEGMENT_INFO_TYPE, count=1) header["block8"] = np.fromfile( fp_, dtype=_NAVIGATION_CORRECTION_INFO_TYPE, count=1) # 8 The navigation corrections: ncorrs = header["block8"]['numof_correction_info_data'][0] dtype = np.dtype([ ("line_number_after_rotation", "<u2"), ("shift_amount_for_column_direction", "f4"), ("shift_amount_for_line_direction", "f4"), ]) corrections = [] for i in range(ncorrs): corrections.append(np.fromfile(fp_, dtype=dtype, count=1)) fp_.seek(40, 1) header['navigation_corrections'] = corrections header["block9"] = np.fromfile(fp_, dtype=_OBS_TIME_INFO_TYPE, count=1) numobstimes = header["block9"]['number_of_observation_times'][0] dtype = np.dtype([ ("line_number", "<u2"), ("observation_time", "f8"), ]) lines_and_times = [] for i in range(numobstimes): lines_and_times.append(np.fromfile(fp_, dtype=dtype, count=1)) header['observation_time_information'] = lines_and_times fp_.seek(40, 1) header["block10"] = np.fromfile(fp_, dtype=_ERROR_INFO_TYPE, count=1) dtype = np.dtype([ ("line_number", "<u2"), ("numof_error_pixels_per_line", "<u2"), ]) num_err_info_data = header["block10"]['number_of_error_info_data'][ 0] err_info_data = [] for i in range(num_err_info_data): err_info_data.append(np.fromfile(fp_, dtype=dtype, count=1)) header['error_information_data'] = err_info_data fp_.seek(40, 1) np.fromfile(fp_, dtype=_SPARE_TYPE, count=1) nlines = int(header["block2"]['number_of_lines'][0]) ncols = int(header["block2"]['number_of_columns'][0]) res = da.from_array(np.memmap(self.filename, offset=fp_.tell(), dtype='<u2', shape=(nlines, ncols), mode='r'), chunks=CHUNK_SIZE) res = da.where(res == 65535, np.float32(np.nan), res) self._header = header logger.debug("Reading time " + str(datetime.now() - tic)) res = self.calibrate(res, key.calibration) new_info = dict( units=info['units'], standard_name=info['standard_name'], wavelength=info['wavelength'], resolution='resolution', id=key, name=key.name, scheduled_time=self.scheduled_time, platform_name=self.platform_name, sensor=self.sensor, satellite_longitude=float(self.nav_info['SSP_longitude']), satellite_latitude=float(self.nav_info['SSP_latitude']), satellite_altitude=float( self.nav_info['distance_earth_center_to_satellite'] - self.proj_info['earth_equatorial_radius']) * 1000) res = xr.DataArray(res, attrs=new_info, dims=['y', 'x']) res = res.where( header['block5']["count_value_outside_scan_pixels"][0] != res) res = res.where(header['block5']["count_value_error_pixels"][0] != res) res = res.where(self.geo_mask()) return res
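# Hedged, standalone sketch of the memmap -> dask pattern used in read_band above to
# read the image counts lazily and mask the fill value. The scratch file, dtype and
# shape are hypothetical; an explicit chunk shape stands in for CHUNK_SIZE.
import numpy as np
import dask.array as da

np.arange(20, dtype='<u2').tofile('scratch_counts.bin')  # hypothetical scratch file
counts = np.memmap('scratch_counts.bin', dtype='<u2', mode='r', shape=(4, 5))
res = da.from_array(counts, chunks=(2, 5))
res = da.where(res == 65535, np.float32(np.nan), res)
print(res.compute())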
def from_bcolz(x, chunksize=None, categorize=True, index=None, lock=lock, **kwargs): """ Read BColz CTable into a Dask Dataframe BColz is a fast on-disk compressed column store with careful attention given to compression. https://bcolz.readthedocs.io/en/latest/ Parameters ---------- x : bcolz.ctable chunksize : int, optional The size of blocks to pull out from ctable. categorize : bool, defaults to True Automatically categorize all string dtypes index : string, optional Column to make the index lock: bool or Lock Lock to use when reading or False for no lock (not-thread-safe) See Also -------- from_array: more generic function not optimized for bcolz """ if lock is True: lock = Lock() import dask.array as da import bcolz if isinstance(x, (str, unicode)): x = bcolz.ctable(rootdir=x) bc_chunklen = max(x[name].chunklen for name in x.names) if chunksize is None and bc_chunklen > 10000: chunksize = bc_chunklen categories = dict() if categorize: for name in x.names: if (np.issubdtype(x.dtype[name], np.string_) or np.issubdtype(x.dtype[name], np.unicode_) or np.issubdtype(x.dtype[name], np.object_)): a = da.from_array(x[name], chunks=(chunksize * len(x.names), )) categories[name] = da.unique(a) columns = tuple(x.dtype.names) divisions = tuple(range(0, len(x), chunksize)) divisions = divisions + (len(x) - 1, ) if x.rootdir: token = tokenize((x.rootdir, os.path.getmtime(x.rootdir)), chunksize, categorize, index, kwargs) else: token = tokenize((id(x), x.shape, x.dtype), chunksize, categorize, index, kwargs) new_name = 'from_bcolz-' + token dsk = dict(((new_name, i), (dataframe_from_ctable, x, (slice(i * chunksize, (i + 1) * chunksize), ), columns, categories, lock)) for i in range(0, int(ceil(len(x) / chunksize)))) meta = dataframe_from_ctable(x, slice(0, 0), columns, categories, lock) result = DataFrame(dsk, new_name, meta, divisions) if index: assert index in x.names a = da.from_array(x[index], chunks=(chunksize * len(x.names), )) q = np.linspace(0, 100, len(x) // chunksize + 2) divisions = tuple(da.percentile(a, q).compute()) return set_partition(result, index, divisions, **kwargs) else: return result
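# Hedged usage sketch for the reader above. It assumes bcolz is installed and that
# from_bcolz (plus its module-level dependencies such as dataframe_from_ctable and
# lock) is importable; the column names and sizes are made up for illustration.
import numpy as np
import bcolz

ct = bcolz.ctable(columns=[np.arange(1000), np.random.random(1000)],
                  names=['a', 'b'])
ddf = from_bcolz(ct, chunksize=100)
print(ddf.npartitions, ddf.columns.tolist())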