def stadistics(self):
    """Build a text table with one row of summary statistics per group.

    One row is produced for every field of ``self.dtypes``. Groups whose
    dtype equals ``np.dtype(float)`` or ``np.dtype(int)`` get the mean,
    standard deviation, minimum and maximum (rounded to 3 decimals), the
    number of non-zero values and the number of non-null values; the
    25/50/75 percentiles are filled in only when ``self.shape[group]`` has
    length 1. Any other dtype gets only the non-null count and the number
    of unique values. Cells that are not computed hold ``"-"``.

    Side effects: ``self.chunksize`` is reassigned from
    ``Chunks.build_from_shape`` and ``self.shape[0]`` is printed.

    Returns
    -------
    The value returned by ``tabulate(table, headers)``.
    """
    headers = ["group", "mean", "std dev", "min", "25%", "50%", "75%", "max",
               "nonzero", "nonan", "unique", "dtype"]
    self.chunksize = Chunks.build_from_shape(self.shape, self.dtypes)

    table = []
    for group, (dtype, _) in self.dtypes.fields.items():
        # Start every cell as "-" and overwrite only what applies to the dtype.
        values = dict.fromkeys(headers, "-")
        values["group"] = group
        values["dtype"] = dtype
        darray = self.data[group].da

        is_numeric = dtype == np.dtype(float) or dtype == np.dtype(int)
        if is_numeric:
            lazy_stats = [
                da.around(darray.mean(), decimals=3),
                da.around(darray.std(), decimals=3),
                da.around(darray.min(), decimals=3),
                da.around(darray.max(), decimals=3),
            ]
            mean, std, lowest, highest = dask.compute(lazy_stats)[0]

            # A NaN mean is the single trigger for recomputing all four
            # statistics with their NaN-ignoring counterparts.
            if np.isnan(mean):
                mean = da.around(da.nanmean(darray), decimals=3).compute()
                std = da.around(da.nanstd(darray), decimals=3).compute()
                lowest = da.around(da.nanmin(darray), decimals=3).compute()
                highest = da.around(da.nanmax(darray), decimals=3).compute()

            values["mean"] = mean
            values["std dev"] = std
            values["min"] = lowest
            values["max"] = highest

            # Percentiles are only requested for groups with a 1-length shape.
            if len(self.shape[group]) == 1:
                quartiles = da.around(
                    da.percentile(darray, [25, 50, 75]), decimals=3
                ).compute()
                values["25%"] = quartiles[0]
                values["50%"] = quartiles[1]
                values["75%"] = quartiles[2]

            values["nonzero"] = da.count_nonzero(darray).compute()
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
        else:
            values["nonan"] = da.count_nonzero(da.notnull(darray)).compute()
            values["unique"] = (
                darray.to_dask_dataframe().fillna('').nunique().compute()
            )

        table.append([values[column] for column in headers])

    print("# rows {}".format(self.shape[0]))
    return tabulate(table, headers)
def _band_hist(band_data):
    """Interpolate ``band_data`` against its own percentile curve.

    ``nwidth`` evenly spaced levels starting at 0 with step ``1 / nwidth``
    are built, the percentiles of ``band_data`` at those levels are used as
    bin edges, and ``np.interp`` maps every input value onto the levels.
    With ``approximate`` truthy the edges come from ``da.percentile`` on the
    flattened non-null values; otherwise from a delayed
    ``np.nanpercentile`` over the whole array.

    NOTE(review): ``nwidth`` and ``approximate`` are free names that are not
    defined in this block - presumably supplied by an enclosing scope.

    Returns a dask array with the shape and dtype of ``band_data``.
    """
    levels = da.arange(0., 1., 1. / nwidth, chunks=nwidth)
    percents = levels * 100.

    if not approximate:
        delayed_edges = dask.delayed(np.nanpercentile)(band_data, percents)
        edges = da.from_delayed(delayed_edges, shape=(nwidth,),
                                dtype=levels.dtype)
    else:
        # need a 1D array
        flattened = band_data.ravel()
        # replace with nanpercentile in the future, if available
        # dask < 0.17 returns all NaNs for this
        edges = da.percentile(flattened[da.notnull(flattened)], percents)

    remapped = dask.delayed(np.interp)(band_data, edges, levels)
    return da.from_delayed(remapped, shape=band_data.shape,
                           dtype=band_data.dtype)
def _band_hist(band_data):
    """Map ``band_data`` onto CDF levels derived from its own percentiles.

    The levels are ``da.arange(0, 1, 1 / nwidth)``. The bin edges are the
    percentiles of ``band_data`` at ``levels * 100``, and the result is
    ``np.interp(band_data, edges, levels)`` wrapped as a dask array with the
    input's shape and dtype.

    NOTE(review): ``nwidth`` and ``approximate`` are not defined in this
    block; they appear to come from an enclosing scope - confirm.
    """
    cdf_levels = da.arange(0., 1., 1. / nwidth, chunks=nwidth)

    def _approximate_bins():
        # need a 1D array
        flat = band_data.ravel()
        # replace with nanpercentile in the future, if available
        # dask < 0.17 returns all NaNs for this
        return da.percentile(flat[da.notnull(flat)], cdf_levels * 100.)

    def _exact_bins():
        delayed_bins = dask.delayed(np.nanpercentile)(band_data,
                                                      cdf_levels * 100.)
        return da.from_delayed(delayed_bins, shape=(nwidth, ),
                               dtype=cdf_levels.dtype)

    bin_edges = _approximate_bins() if approximate else _exact_bins()

    interpolated = dask.delayed(np.interp)(band_data, bin_edges, cdf_levels)
    interpolated = da.from_delayed(interpolated, shape=band_data.shape,
                                   dtype=band_data.dtype)
    return interpolated
def test_dtype_complex():
    """Check that the ``_dtype`` recorded on dask arrays matches NumPy.

    Three NumPy arrays (``f4``, ``i8``, ``i2``) are wrapped as dask arrays
    and the ``_dtype`` of arithmetic, slicing, stacking, reductions and
    element-wise functions is compared with the dtype NumPy produces for the
    same operation. A structured array is checked for field access as well.
    """
    x = np.arange(24).reshape((4, 6)).astype('f4')
    y = np.arange(24).reshape((4, 6)).astype('i8')
    z = np.arange(24).reshape((4, 6)).astype('i2')

    a = da.from_array(x, chunks=(2, 3))
    b = da.from_array(y, chunks=(2, 3))
    c = da.from_array(z, chunks=(2, 3))

    def eq(a, b):
        # Both sides must be real np.dtype objects with the same string form.
        return (isinstance(a, np.dtype) and
                isinstance(b, np.dtype) and
                str(a) == str(b))

    assert eq(a._dtype, x.dtype)
    assert eq(b._dtype, y.dtype)

    assert eq((a + 1)._dtype, (x + 1).dtype)
    assert eq((a + b)._dtype, (x + y).dtype)
    assert eq(a.T._dtype, x.T.dtype)
    assert eq(a[:3]._dtype, x[:3].dtype)
    assert eq((a.dot(b.T))._dtype, (x.dot(y.T)).dtype)

    assert eq(stack([a, b])._dtype, np.vstack([x, y]).dtype)
    assert eq(concatenate([a, b])._dtype, np.concatenate([x, y]).dtype)

    assert eq(b.std()._dtype, y.std().dtype)
    assert eq(c.sum()._dtype, z.sum().dtype)
    # The reference side has to be the NumPy array: comparing a dask
    # expression with itself can never reveal a dtype mismatch.
    assert eq(a.min()._dtype, x.min().dtype)
    assert eq(a.argmin(axis=0)._dtype, x.argmin(axis=0).dtype)

    assert eq(da.sin(c)._dtype, np.sin(z).dtype)
    assert eq(da.exp(b)._dtype, np.exp(y).dtype)
    assert eq(da.floor(a)._dtype, np.floor(x).dtype)
    assert eq(da.isnan(b)._dtype, np.isnan(y).dtype)
    # isnull/notnull may raise ImportError; that case is deliberately skipped.
    with ignoring(ImportError):
        assert da.isnull(b)._dtype == 'bool'
        assert da.notnull(b)._dtype == 'bool'

    x = np.array([('a', 1)], dtype=[('text', 'S1'), ('numbers', 'i4')])
    d = da.from_array(x, chunks=(1,))

    assert eq(d['text']._dtype, x['text'].dtype)
    assert eq(d[['numbers', 'text']]._dtype, x[['numbers', 'text']].dtype)
def test_dtype_complex():
    """Verify dask's stored ``_dtype`` against the dtype NumPy computes.

    Covers plain arrays of ``f4`` / ``i8`` / ``i2``, arithmetic, transpose,
    slicing, ``dot``, ``stack`` / ``concatenate``, reductions, element-wise
    functions, ``isnull`` / ``notnull`` and field access on a structured
    array.
    """
    x = np.arange(24).reshape((4, 6)).astype('f4')
    y = np.arange(24).reshape((4, 6)).astype('i8')
    z = np.arange(24).reshape((4, 6)).astype('i2')

    a = da.from_array(x, chunks=(2, 3))
    b = da.from_array(y, chunks=(2, 3))
    c = da.from_array(z, chunks=(2, 3))

    def eq(a, b):
        # Require genuine np.dtype instances on both sides, equal by name.
        return (isinstance(a, np.dtype) and
                isinstance(b, np.dtype) and
                str(a) == str(b))

    assert eq(a._dtype, x.dtype)
    assert eq(b._dtype, y.dtype)

    assert eq((a + 1)._dtype, (x + 1).dtype)
    assert eq((a + b)._dtype, (x + y).dtype)
    assert eq(a.T._dtype, x.T.dtype)
    assert eq(a[:3]._dtype, x[:3].dtype)
    assert eq((a.dot(b.T))._dtype, (x.dot(y.T)).dtype)

    assert eq(stack([a, b])._dtype, np.vstack([x, y]).dtype)
    assert eq(concatenate([a, b])._dtype, np.concatenate([x, y]).dtype)

    assert eq(b.std()._dtype, y.std().dtype)
    assert eq(c.sum()._dtype, z.sum().dtype)
    # Compare against NumPy (x), not against the dask array itself; a
    # self-comparison passes regardless of what dtype dask inferred.
    assert eq(a.min()._dtype, x.min().dtype)
    assert eq(a.argmin(axis=0)._dtype, x.argmin(axis=0).dtype)

    assert eq(da.sin(c)._dtype, np.sin(z).dtype)
    assert eq(da.exp(b)._dtype, np.exp(y).dtype)
    assert eq(da.floor(a)._dtype, np.floor(x).dtype)
    assert eq(da.isnan(b)._dtype, np.isnan(y).dtype)
    # An ImportError raised by isnull/notnull is tolerated on purpose.
    with ignoring(ImportError):
        assert da.isnull(b)._dtype == 'bool'
        assert da.notnull(b)._dtype == 'bool'

    x = np.array([('a', 1)], dtype=[('text', 'S1'), ('numbers', 'i4')])
    d = da.from_array(x, chunks=(1, ))

    assert eq(d['text']._dtype, x['text'].dtype)
    assert eq(d[['numbers', 'text']]._dtype, x[['numbers', 'text']].dtype)
def push(array, n, axis):
    """
    Dask-aware bottleneck.push

    Forward-fills ``array`` along ``axis`` with ``bottleneck.push`` run
    through ``da.reductions.cumreduction``. When ``n`` is not None and
    ``0 < n < array.shape[axis] - 1``, filled values that lie more than
    ``n`` positions after the last non-null entry are replaced by NaN.
    For any other ``n`` no limit is applied.
    """
    import bottleneck
    import dask.array as da
    import numpy as np

    def _carry_previous(previous, current):
        # cumreduction has already applied push inside every block, so the
        # only thing left is to fill the still-missing values with the last
        # data of the previous chunk
        return np.where(np.isnan(current), previous, current)

    limit_applies = n is not None and 0 < n < array.shape[axis] - 1
    if not limit_applies:
        # The method parameter makes that the tests for python 3.7 fails.
        return da.reductions.cumreduction(
            func=bottleneck.push,
            binop=_carry_previous,
            ident=np.nan,
            x=array,
            axis=axis,
            dtype=array.dtype,
        )

    # Position index along `axis`, broadcast to the full array shape.
    positions_1d = da.arange(
        array.shape[axis], chunks=array.chunks[axis], dtype=array.dtype
    )
    broadcastable_shape = tuple(
        size if i == axis else 1 for i, size in enumerate(array.shape)
    )
    positions = da.broadcast_to(
        positions_1d.reshape(broadcastable_shape),
        array.shape,
        array.chunks,
    )

    # Forward-filling the positions of the non-null entries gives, for every
    # element, the position of the last valid value before or at it.
    valid_positions = da.where(da.notnull(array), positions, np.nan)
    last_valid_position = push(valid_positions, None, axis)
    within_limit = (positions - last_valid_position) <= n

    # omit the forward fill that violate the limit
    return da.where(within_limit, push(array, None, axis), np.nan)
def test_isnull():
    """``da.isnull`` / ``da.notnull`` must match NumPy's NaN mask."""
    values = np.array([1, np.nan])
    lazy = da.from_array(values, chunks=(2, ))
    nan_mask = np.isnan(values)
    # An ImportError raised by isnull/notnull is tolerated, not a failure.
    with ignoring(ImportError):
        assert_eq(da.isnull(lazy), nan_mask)
        assert_eq(da.notnull(lazy), ~nan_mask)
def test_isnull():
    """Compare dask's null / not-null masks with ``np.isnan`` on one chunk."""
    data = np.array([1, np.nan])
    wrapped = da.from_array(data, chunks=(2,))
    expected_null = np.isnan(data)
    expected_not_null = ~np.isnan(data)
    # ImportError from isnull/notnull is deliberately ignored.
    with ignoring(ImportError):
        assert_eq(da.isnull(wrapped), expected_null)
        assert_eq(da.notnull(wrapped), expected_not_null)
def boundary_weights(self, mode='reflect', mask=None, drop_dims=None,
                     compute=False):
    """
    Compute the boundary weights

    Parameters
    ----------
    mode : {'reflect', 'periodic', 'any-constant'}, optional
        The mode parameter determines how the array borders are handled.
        Default is 'reflect'.
    mask : array-like, optional
        Specify the mask, if None the mask is inferred from missing values
    drop_dims : list, optional
        Specify dimensions along which the weights do not need to be
        computed. The default (None) drops no dimension.
    compute : bool, optional
        If True, the computation is performed after the dask graph has
        been made. If False, only the dask graph is made and the
        computation will be performed later on.

    Returns
    -------
    weights : xarray.DataArray
        Return a DataArray containing the weights
    """
    # Normalise the default here instead of using a shared mutable default.
    if drop_dims is None:
        drop_dims = []

    # `im.convolve` is given 'wrap' where the caller says 'periodic'; the
    # string must be compared by value, not by identity.
    if mode == 'periodic':
        mode_conv = 'wrap'
    else:
        mode_conv = mode

    # Normalize coefficients
    coeffs = self.coefficients / self.coefficients.sum()
    if drop_dims:
        new_coeffs = da.squeeze(
            coeffs, axis=[self.obj.get_axis_num(di) for di in drop_dims])
    else:
        new_coeffs = coeffs
    new_obj = self.obj.isel(**{di: 0 for di in drop_dims}).squeeze()

    # The overlap boundary keeps the caller's `mode`, not `mode_conv`.
    # NOTE(review): this reads `self._obj` while the rest of the method uses
    # `self.obj` - confirm both attributes exist and refer to the same data.
    boundary = {self._obj.get_axis_num(di): mode for di in self.dims}

    if mask is None:
        mask = da.notnull(new_obj.data)

    weights = mask.astype(float).map_overlap(
        lambda x: im.convolve(x, new_coeffs, mode=mode_conv),
        depth=self._depth,
        boundary=boundary,
        trim=True)

    res = xr.DataArray(mask * weights, dims=new_obj.dims,
                       coords=new_obj.coords, name='boundary_weights')
    # Zero weights are turned into missing values.
    res = res.where(res != 0)
    if compute:
        with ProgressBar():
            out = res.compute()
    else:
        out = res
    return out