def func(a, axes=None):
    if axes is None:
        axes = tuple(range(a.ndim))
    else:
        if len(set(axes)) < len(axes):
            raise ValueError("Duplicate axes are not allowed.")

    if len(axes) > 3:
        raise ValueError("Detrending is only supported up to "
                         "3 dimensions.")

    for each_axis in axes:
        if len(a.chunks[each_axis]) != 1:
            raise ValueError('The axis along which detrending is performed '
                             'cannot be chunked.')

    if len(axes) == 1:
        return dsar.map_blocks(sps.detrend, a, axis=axes[0],
                               chunks=a.chunks, dtype=a.dtype)
    else:
        for each_axis in range(a.ndim):
            if each_axis not in axes:
                if len(a.chunks[each_axis]) != a.shape[each_axis]:
                    raise ValueError("The axes other than the ones to detrend "
                                     "over should have a chunk length of 1.")
        return dsar.map_blocks(detrend_func, a, axes,
                               chunks=a.chunks, dtype=a.dtype)
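# A minimal, self-contained sketch of the single-axis branch above, assuming
# `dsar` is dask.array and `sps` is scipy.signal (both imported explicitly
# here); the detrended axis must live in a single chunk for this to be valid.
import numpy as np
import dask.array as dsar
import scipy.signal as sps

a = dsar.from_array(np.arange(12, dtype=float).reshape(3, 4), chunks=(1, 4))
detrended = dsar.map_blocks(sps.detrend, a, axis=-1,
                            chunks=a.chunks, dtype=a.dtype)
print(detrended.compute())  # each row is a pure linear trend, so ~0 everywhere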
def align(self, target=None, weighted=False, procrustes=False, error=0.0001,
          maxcyc=10):
    '''
    Aligns the frames in a trajectory to some reference structure, with
    optional mass-weighting.

    Arguments:
        target: If given, a reference structure to fit to, as a [N,3] numpy
            array.
        weighted: If specified, mass-weighted fitting is done.
        procrustes: If specified, procrustes iterative fitting is done to
            convergence.
        error: Defines the target error for the procrustes fit.
        maxcyc: Defines the maximum number of iterations for the procrustes
            method.
    '''
    self.reset()
    if target is None:
        targ = self.x[0]
    else:
        targ = da.from_array(target, chunks=CHUNKS)
    if weighted:
        weights = self.masses
    else:
        weights = np.ones_like(self.masses)
    weights = da.from_array(np.stack([weights, ] * 3).T, chunks=CHUNKS)
    self.x = da.map_blocks(fastfitting.fitted_traj, self.x, targ, weights)
    if not procrustes:
        return
    avg = self.x.mean(axis=0)
    err = avg - targ
    err = (err * err).mean().compute()
    cycle = 1
    while err > error and cycle < maxcyc:
        target = avg
        self.reset()
        self.x = da.map_blocks(fastfitting.fitted_traj, self.x, target, weights)
        avg = self.x.mean(axis=0).compute()
        avg = da.from_array(avg, chunks=CHUNKS)
        err = avg - target
        err = (err * err).mean().compute()
        cycle += 1
    print('Procrustes converged in {} cycles with error {}'.format(cycle, err))
def lazy_elementwise(lazy_array, elementwise_op):
    """
    Apply a (numpy-style) elementwise array operation to a lazy array.

    Elementwise means that it performs an independent calculation at each point
    of the input, producing a result array of the same shape.

    Args:

    * lazy_array:
        The lazy array object to operate on.
    * elementwise_op:
        The elementwise operation, a function operating on numpy arrays.

    .. note::

        A single-point "dummy" call is made to the operation function, to
        determine the dtype of the result.
        This return dtype must be stable in actual operation (!)

    """
    # This is just a wrapper to provide an Iris-specific abstraction for a
    # lazy operation in Dask (map_blocks).

    # Explicitly determine the return type with a dummy call.
    # This makes good practical sense for unit conversions, as a Unit.convert
    # call may cast to float, or not, depending on unit equality: thus, it's
    # much safer to get udunits to decide that for us.
    dtype = elementwise_op(np.zeros(1, lazy_array.dtype)).dtype

    return da.map_blocks(elementwise_op, lazy_array, dtype=dtype)
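# A small usage sketch of the pattern above (not from the original code): the
# output dtype is decided up front by a one-element dummy call, so the lazy
# result can be built without computing anything.
import numpy as np
import dask.array as da

def to_float_offset(x):
    # an elementwise op that silently promotes integer inputs to float
    return x + 0.15

lazy = da.ones((4, 4), dtype=np.int32, chunks=2)
dtype = to_float_offset(np.zeros(1, lazy.dtype)).dtype   # float64, not int32
result = da.map_blocks(to_float_offset, lazy, dtype=dtype)
print(result.dtype, result.compute()[0, 0])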
def _get_date_field(values, name, dtype):
    """Indirectly access pandas' libts.get_date_field by wrapping data
    as a Series and calling through `.dt` attribute.

    Parameters
    ----------
    values : np.ndarray or dask.array-like
        Array-like container of datetime-like values
    name : str
        Name of datetime field to access
    dtype : dtype-like
        dtype for output date field values

    Returns
    -------
    datetime_fields : same type as values
        Array-like of datetime fields accessed for each element in values

    """
    if isinstance(values, dask_array_type):
        from dask.array import map_blocks
        return map_blocks(_access_through_series,
                          values, name, dtype=dtype)
    else:
        return _access_through_series(values, name)
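# A hypothetical stand-in for the _access_through_series helper referenced
# above (its body is not shown here): wrap the raw datetime64 values in a
# pandas Series and read the requested field off the .dt accessor.
import numpy as np
import pandas as pd

def access_through_series_sketch(values, name):
    values_as_series = pd.Series(values.ravel())
    field_values = getattr(values_as_series.dt, name).values
    return field_values.reshape(values.shape)

times = np.arange('2020-01', '2020-04', dtype='datetime64[M]').astype('datetime64[ns]')
print(access_through_series_sketch(times, 'month'))  # [1 2 3]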
def _method(self, method_name, chunks=None, drop_axis=None, **kwargs):
    if chunks is None:
        # no shape change
        chunks = self.chunks

    if self.mask is None:
        # simple case, no mask
        def f(block):
            g = _ndarray.GenotypeArray(block)
            method = getattr(g, method_name)
            return method(**kwargs)
        out = self.map_blocks(f, chunks=chunks, drop_axis=drop_axis)

    else:
        # map with mask
        def f(block, bmask):
            g = _ndarray.GenotypeArray(block)
            g.mask = bmask[:, :, 0]
            method = getattr(g, method_name)
            return method(**kwargs)
        m = self.mask[:, :, None]
        out = da.map_blocks(f, self, m, chunks=chunks, drop_axis=drop_axis)

    return out
def from_packed(packed, chunks=None):
    def f(block):
        return _ndarray.GenotypeArray.from_packed(block)
    packed = ensure_dask_array(packed, chunks)
    chunks = (packed.chunks[0], packed.chunks[1], (2,))
    out = da.map_blocks(f, packed, chunks=chunks, new_axis=2)
    return view_subclass(out, GenotypeDaskArray)
def interpolate_angles(self, angles, resolution):
    # FIXME: interpolate in cartesian coordinates if the lons or lats are
    # problematic
    from geotiepoints.multilinear import MultilinearInterpolator

    geocoding = self.root.find('.//Tile_Geocoding')
    rows = int(geocoding.find('Size[@resolution="' + str(resolution) + '"]/NROWS').text)
    cols = int(geocoding.find('Size[@resolution="' + str(resolution) + '"]/NCOLS').text)

    smin = [0, 0]
    smax = np.array(angles.shape) - 1
    orders = angles.shape
    minterp = MultilinearInterpolator(smin, smax, orders)
    minterp.set_values(da.atleast_2d(angles.ravel()))

    def _do_interp(minterp, xcoord, ycoord):
        interp_points2 = np.vstack((xcoord.ravel(), ycoord.ravel()))
        res = minterp(interp_points2)
        return res.reshape(xcoord.shape)

    x = da.arange(rows, dtype=angles.dtype, chunks=CHUNK_SIZE) / (rows - 1) * (angles.shape[0] - 1)
    y = da.arange(cols, dtype=angles.dtype, chunks=CHUNK_SIZE) / (cols - 1) * (angles.shape[1] - 1)
    xcoord, ycoord = da.meshgrid(x, y)
    return da.map_blocks(_do_interp, minterp, xcoord, ycoord,
                         dtype=angles.dtype, chunks=xcoord.chunks)
def char_to_bytes(arr):
    """Convert numpy/dask arrays from characters to fixed width bytes."""
    if arr.dtype != 'S1':
        raise ValueError("argument must have dtype='S1'")

    if not arr.ndim:
        # no dimension to concatenate along
        return arr

    size = arr.shape[-1]

    if not size:
        # can't make an S0 dtype
        return np.zeros(arr.shape[:-1], dtype=np.string_)

    if isinstance(arr, dask_array_type):
        import dask.array as da

        if len(arr.chunks[-1]) > 1:
            raise ValueError('cannot stack dask character array with '
                             'multiple chunks in the last dimension: {}'
                             .format(arr))

        dtype = np.dtype('S' + str(arr.shape[-1]))
        return da.map_blocks(_numpy_char_to_bytes, arr,
                             dtype=dtype,
                             chunks=arr.chunks[:-1],
                             drop_axis=[arr.ndim - 1])
    else:
        return StackedBytesArray(arr)
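# A hypothetical sketch of what the _numpy_char_to_bytes helper referenced
# above might do (its body is not shown here): collapse the trailing 'S1'
# axis of a contiguous array into one fixed-width bytes value per row.
import numpy as np

def numpy_char_to_bytes_sketch(arr):
    arr = np.ascontiguousarray(arr)
    size = arr.shape[-1]
    return arr.view('S' + str(size)).reshape(arr.shape[:-1])

chars = np.frombuffer(b'abcd', dtype='S1').reshape(2, 2)
print(numpy_char_to_bytes_sketch(chars))  # [b'ab' b'cd']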
def gufunc_idxmin(x, y, axis=None):
    import dask.array as da

    indx = x.argmin(axis=axis)
    func = functools.partial(_index_from_1d_array, y)

    if isinstance(x, da.Array):
        return da.map_blocks(func, indx, dtype=indx.dtype)
    else:
        return func(indx)
def _get_solar_flux(self, band):
    """Get the solar flux for the band."""
    solar_flux = self.cal['solar_flux'].isel(bands=band).values
    d_index = self.cal['detector_index'].fillna(0).astype(int)

    def get_items(idx, solar_flux):
        return solar_flux[idx]

    return da.map_blocks(get_items, d_index.data, solar_flux=solar_flux,
                         dtype=solar_flux.dtype)
def interp_func(var, x, new_x, method, kwargs):
    """
    Multi-dimensional interpolation for array-like data. Interpolated axes
    should be located in the last position.

    Parameters
    ----------
    var: np.ndarray or dask.array.Array
        Array to be interpolated. The final dimension is interpolated.
    x: a list of 1d arrays
        Original coordinates. Should not contain NaN.
    new_x: a list of 1d arrays
        New coordinates. Should not contain NaN.
    method: string
        {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'} for
        1-dimensional interpolation.
        {'linear', 'nearest'} for multidimensional interpolation.
    **kwargs:
        Optional keyword arguments to be passed to the scipy interpolator.

    Returns
    -------
    interpolated: array
        Interpolated array

    Note
    ----
    This requires scipy to be installed.

    See Also
    --------
    scipy.interpolate.interp1d
    """
    if not x:
        return var.copy()

    if len(x) == 1:
        func, kwargs = _get_interpolator(method, vectorizeable_only=True,
                                         **kwargs)
    else:
        func, kwargs = _get_interpolator_nd(method, **kwargs)

    if isinstance(var, dask_array_type):
        import dask.array as da

        _assert_single_chunk(var, range(var.ndim - len(x), var.ndim))
        chunks = var.chunks[:-len(x)] + new_x[0].shape
        drop_axis = range(var.ndim - len(x), var.ndim)
        new_axis = range(var.ndim - len(x), var.ndim - len(x) + new_x[0].ndim)
        return da.map_blocks(_interpnd, var, x, new_x, func, kwargs,
                             dtype=var.dtype, chunks=chunks,
                             new_axis=new_axis, drop_axis=drop_axis)

    return _interpnd(var, x, new_x, func, kwargs)
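# A minimal sketch of the single-axis dask branch above, with
# scipy.interpolate.interp1d standing in for the interpolator returned by
# _get_interpolator (the block function name and data are illustrative only).
import numpy as np
import dask.array as da
from scipy.interpolate import interp1d

x = np.linspace(0.0, 1.0, 11)              # original coordinate (last axis)
new_x = np.linspace(0.0, 1.0, 5)           # target coordinate
var = da.random.random((4, 11), chunks=(2, 11))   # last axis in a single chunk

def interp_block(block):
    return interp1d(x, block, kind='linear', axis=-1)(new_x)

out = da.map_blocks(interp_block, var, dtype=var.dtype,
                    chunks=(var.chunks[0], (len(new_x),)))
print(out.compute().shape)  # (4, 5)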
def map_alleles(self, mapping):

    def f(block, bmapping):
        ac = _ndarray.AlleleCountsArray(block)
        return ac.map_alleles(bmapping)

    # obtain dask array
    mapping = da.from_array(mapping, chunks=(self.chunks[0], None))

    # map blocks
    out = da.map_blocks(f, self, mapping, chunks=self.chunks)
    return view_subclass(out, AlleleCountsDaskArray)
def bytes_to_char(arr):
    """Convert numpy/dask arrays from fixed width bytes to characters."""
    if arr.dtype.kind != 'S':
        raise ValueError('argument must have a fixed-width bytes dtype')

    if isinstance(arr, dask_array_type):
        import dask.array as da
        return da.map_blocks(_numpy_bytes_to_char, arr, dtype='S1',
                             chunks=arr.chunks + ((arr.dtype.itemsize,)),
                             new_axis=[arr.ndim])
    else:
        return _numpy_bytes_to_char(arr)
def map_alleles(self, mapping, **kwargs):

    def f(block, bmapping):
        h = _ndarray.HaplotypeArray(block)
        return h.map_alleles(bmapping)

    # obtain dask array
    mapping = da.from_array(mapping, chunks=(self.chunks[0], None))

    # map blocks
    out = da.map_blocks(f, self, mapping, chunks=self.chunks)
    return view_subclass(out, HaplotypeDaskArray)
def map_alleles(self, mapping, **kwargs):

    def f(block, bmapping):
        g = _ndarray.GenotypeArray(block)
        m = bmapping[:, 0, :]
        return g.map_alleles(m)

    # obtain dask array
    mapping = da.from_array(mapping, chunks=(self.chunks[0], None))

    # map blocks
    out = da.map_blocks(f, self, mapping[:, None, :], chunks=self.chunks)
    return view_subclass(out, GenotypeDaskArray)
def HaloVelocityDispersion(mass, cosmo, redshift, mdef='vir'):
    """
    Compute the velocity dispersion of a halo from its mass.

    This is a simple model suggested by Martin White; see
    http://adsabs.harvard.edu/abs/2008ApJ...672..122E
    """
    mass, redshift = da.broadcast_arrays(mass, redshift)

    def compute_vdisp(mass, redshift):
        h = cosmo.efunc(redshift)
        return 1100. * (h * mass / 1e15) ** 0.33333

    return da.map_blocks(compute_vdisp, mass, redshift, dtype=mass.dtype)
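# A quick numerical sanity check of the scaling above (my own example, assuming
# E(z) ~ 1 at low redshift): sigma = 1100 * (E(z) * M / 1e15)**(1/3) km/s, so a
# 1e15 Msun/h halo gives ~1100 km/s and a 1e13 Msun/h halo about 237 km/s.
for m in (1e15, 1e13):
    print(m, 1100. * (1.0 * m / 1e15) ** 0.33333)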
def HaloConcentration(mass, cosmo, redshift, mdef='vir'): """ Return halo concentration from halo mass, based on the analytic fitting formulas presented in `Dutton and Maccio 2014 <https://arxiv.org/abs/1402.7073>`_. .. note:: The units of the input mass are assumed to be :math:`M_{\odot}/h` Parameters ---------- mass : array_like either a numpy or dask array specifying the halo mass; units assumed to be :math:`M_{\odot}/h` cosmo : :class:`~nbodykit.cosmology.cosmology.Cosmology` the cosmology instance used in the analytic formula redshift : float compute the c(M) relation at this redshift mdef : str, optional string specifying the halo mass definition to use; should be 'vir' or 'XXXc' or 'XXXm' where 'XXX' is an int specifying the overdensity Returns ------- concen : :class:`dask.array.Array` a dask array holding the analytic concentration values References ---------- Dutton and Maccio, "Cold dark matter haloes in the Planck era: evolution of structural parameters for Einasto and NFW profiles", 2014, arxiv:1402.7073 """ from halotools.empirical_models import NFWProfile mass, redshift = da.broadcast_arrays(mass, redshift) kws = {'cosmology':cosmo.to_astropy(), 'conc_mass_model':'dutton_maccio14', 'mdef':mdef} def get_nfw_conc(mass, redshift): kw1 = {} kw1.update(kws) kw1['redshift'] = redshift model = NFWProfile(**kw1) return model.conc_NFWmodel(prim_haloprop=mass) return da.map_blocks(get_nfw_conc, mass, redshift, dtype=mass.dtype)
def HaloRadius(mass, cosmo, redshift, mdef='vir'): r""" Return proper halo radius from halo mass, based on the specified mass definition. This is independent of halo profile, and simply returns .. math:: R = \left [ 3 M /(4\pi\Delta) \right]^{1/3} where :math:`\Delta` is the density threshold, which depends on cosmology, redshift, and mass definition .. note:: The units of the input mass are assumed to be :math:`M_{\odot}/h` Parameters ---------- mass : array_like either a numpy or dask array specifying the halo mass; units assumed to be :math:`M_{\odot}/h` cosmo : :class:`~nbodykit.cosmology.cosmology.Cosmology` the cosmology instance redshift : float compute the density threshold which determines the R(M) relation at this redshift mdef : str, optional string specifying the halo mass definition to use; should be 'vir' or 'XXXc' or 'XXXm' where 'XXX' is an int specifying the overdensity Returns ------- radius : :class:`dask.array.Array` a dask array holding the halo radius in 'physical Mpc/h [sic]'. This is proper Mpc/h, to convert to comoving, divide this by scaling factor. """ from halotools.empirical_models import halo_mass_to_halo_radius mass, redshift = da.broadcast_arrays(mass, redshift) kws = {'cosmology':cosmo.to_astropy(), 'mdef':mdef} def mass_to_radius(mass, redshift): return halo_mass_to_halo_radius(mass=mass, redshift=redshift, **kws) return da.map_blocks(mass_to_radius, mass, redshift, dtype=mass.dtype)
def interpolate_xarray_linear(xpoints, ypoints, values, shape):
    """Interpolate linearly, generating a dask array."""
    from scipy.interpolate.interpnd import (LinearNDInterpolator,
                                            _ndim_coords_from_arrays)

    points = _ndim_coords_from_arrays(np.vstack((np.asarray(ypoints),
                                                 np.asarray(xpoints))).T)

    interpolator = LinearNDInterpolator(points, values)

    def intp(grid_x, grid_y, interpolator):
        return interpolator((grid_y, grid_x))

    grid_x, grid_y = da.meshgrid(da.arange(shape[1], chunks=CHUNK_SIZE),
                                 da.arange(shape[0], chunks=CHUNK_SIZE))
    # workaround for non-thread-safe first call of the interpolator:
    interpolator((0, 0))
    res = da.map_blocks(intp, grid_x, grid_y, interpolator=interpolator)

    return DataArray(res, dims=('y', 'x'))
def precompute(self, mask=None, cache_dir=False, swath_usage=0, **kwargs): """Generate row and column arrays and store it for later use. :param swath_usage: minimum ratio of number of input pixels to number of pixels used in output Note: The `mask` keyword should be provided if geolocation may be valid where data points are invalid. This defaults to the `mask` attribute of the `data` numpy masked array passed to the `resample` method. """ del kwargs source_geo_def = self.source_geo_def target_geo_def = self.target_geo_def if cache_dir: LOG.warning("'cache_dir' is not used by EWA resampling") # SatPy/PyResample don't support dynamic grids out of the box yet lons, lats = source_geo_def.get_lonlats() # we are remapping to a static unchanging grid/area with all of # its parameters specified chunks = (2,) + lons.chunks res = da.map_blocks(self._call_ll2cr, lons.data, lats.data, target_geo_def, swath_usage, dtype=lons.dtype, chunks=chunks, new_axis=[0]) cols = res[0] rows = res[1] # save the dask arrays in the class instance cache # the on-disk cache will store the numpy arrays self.cache = { "rows": rows, "cols": cols, } return None
def count_alleles(self, max_allele=None, subpop=None):

    # if max_allele not specified, count all alleles
    if max_allele is None:
        max_allele = self.max().compute()[()]

    # deal with subpop
    if subpop:
        gd = self.take(subpop, axis=1)
    else:
        gd = self

    # determine output chunks - preserve axis0; change axis1, axis2
    chunks = (gd.chunks[0], (1,)*len(gd.chunks[1]), (max_allele+1,))

    if self.mask is None:

        # simple case, no mask
        def f(block):
            gb = _ndarray.GenotypeArray(block)
            return gb.count_alleles(max_allele=max_allele)[:, None, :]

        # map blocks and reduce
        out = gd.map_blocks(f, chunks=chunks).sum(axis=1)

    else:

        # map with mask
        def f(block, bmask):
            g = _ndarray.GenotypeArray(block)
            g.mask = bmask[:, :, 0]
            return g.count_alleles(max_allele=max_allele)[:, None, :]

        md = self.mask[:, :, None]
        out = da.map_blocks(f, gd, md, chunks=chunks).sum(axis=1)

    return view_subclass(out, AlleleCountsDaskArray)
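# A minimal sketch of the same "per-block partial result, then reduce over the
# block axis" pattern used above, with a plain numpy bincount standing in for
# GenotypeArray.count_alleles (names and data here are illustrative only).
import numpy as np
import dask.array as da

x = da.from_array(np.array([[0, 1, 1, 2],
                            [2, 2, 0, 1]]), chunks=(2, 2))
max_allele = 2

def block_counts(block):
    counts = np.stack([np.bincount(row, minlength=max_allele + 1)
                       for row in block])
    return counts[:, None, :]   # keep a length-1 axis per block to sum over

chunks = (x.chunks[0], (1,) * len(x.chunks[1]), (max_allele + 1,))
out = da.map_blocks(block_counts, x, chunks=chunks, new_axis=2).sum(axis=1)
print(out.compute())   # [[1 2 1]
                       #  [1 1 2]]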
def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1):
    """Wrapper to apply bottleneck moving window funcs on dask arrays"""
    import dask.array as da

    dtype, fill_value = dtypes.maybe_promote(a.dtype)
    a = a.astype(dtype)
    # inputs for overlap
    if axis < 0:
        axis = a.ndim + axis
    depth = {d: 0 for d in range(a.ndim)}
    depth[axis] = (window + 1) // 2
    boundary = {d: fill_value for d in range(a.ndim)}
    # Create overlap array.
    ag = da.overlap.overlap(a, depth=depth, boundary=boundary)
    # apply rolling func
    out = da.map_blocks(moving_func, ag, window, min_count=min_count,
                        axis=axis, dtype=a.dtype)
    # trim array
    result = da.overlap.trim_internal(out, depth)
    return result
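# A self-contained sketch of the same overlap -> map_blocks -> trim pattern,
# using a simple numpy moving sum in place of a bottleneck function (the
# helper name and window handling here are illustrative only).
import numpy as np
import dask.array as da

def moving_sum(block, window, axis=-1):
    csum = np.cumsum(block, axis=axis)
    out = csum.copy()
    out[..., window:] = csum[..., window:] - csum[..., :-window]
    return out

a = da.arange(20, chunks=5).astype(float)
window = 3
depth = {0: (window + 1) // 2}
ag = da.overlap.overlap(a, depth=depth, boundary={0: 0.0})
out = da.map_blocks(moving_sum, ag, window, dtype=a.dtype)
result = da.overlap.trim_internal(out, depth)
print(result.compute())   # running sum over the trailing window of 3 elements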
def to_dask(self, pad_values=None): num_channels = self.bounding_shape["c"] if self.needs_padding: if pad_values is None: raise ValueError( "Data must be padded but no pad values were supplied!") elif len(pad_values) != num_channels: raise ValueError( f"Length of pad values {pad_values} does not match the length of the channel axis ({num_channels})" ) chunks = ( (1, ) * self.bounding_shape["z"], *[self.bounding_shape[k] for k in ("c", "y", "x")], ) darr = da.map_blocks( _chunked_fibsem_loader, self.filenames, self.axes["c"], pad_values, chunks=chunks, dtype=self.dtypes[0], ) return darr
def inverse_transform(self, y: Union[ArrayLike, SeriesType]): check_is_fitted(self, "classes_") y = self._check_array(y) if isinstance(y, da.Array): if getattr(self, "dtype_", None): # -> Series[category] if self.dtype_ is not None: result = (dd.from_dask_array(y).astype( "category").cat.set_categories( np.arange(len( self.classes_))).cat.rename_categories( self.dtype_.categories)) if self.dtype_.ordered: result = result.cat.as_ordered() return result else: return da.map_blocks( getitem, self.classes_, y, dtype=self.classes_.dtype, chunks=y.chunks, ) else: y = np.asarray(y) if getattr(self, "dtype_", None): if self.dtype_ is not None: return pd.Series( pd.Categorical.from_codes( y, categories=self.dtype_.categories, ordered=self.dtype_.ordered, )) else: return self.classes_[y]
def _round_field(values, name, freq):
    """Indirectly access pandas rounding functions by wrapping data
    as a Series and calling through `.dt` attribute.

    Parameters
    ----------
    values : np.ndarray or dask.array-like
        Array-like container of datetime-like values
    name : str (ceil, floor, round)
        Name of rounding function
    freq : a freq string indicating the rounding resolution

    Returns
    -------
    rounded timestamps : same type as values
        Array-like of datetime fields accessed for each element in values

    """
    if isinstance(values, dask_array_type):
        from dask.array import map_blocks
        return map_blocks(_round_series,
                          values, name, freq=freq, dtype=np.datetime64)
    else:
        return _round_series(values, name, freq)
def gap_fill(x: xr.DataArray,
             fallback: xr.DataArray,
             nodata=None,
             attrs=None):
    """Fill missing values in `x` with values from `fallback`.

    x,fallback are expected to be xarray.DataArray with identical shape and dtype.

        out[pix] = x[pix] if x[pix] != x.nodata else fallback[pix]
    """
    if nodata is None:
        nodata = getattr(x, "nodata", None)

    if nodata is None:
        nodata = default_nodata(x.dtype)
    else:
        nodata = x.dtype.type(nodata)

    if attrs is None:
        attrs = x.attrs.copy()

    if dask.is_dask_collection(x):
        data = da.map_blocks(
            _gap_fill_np,
            x.data,
            fallback.data,
            nodata,
            name=randomize("gap_fill"),
            dtype=x.dtype,
        )
    else:
        data = _gap_fill_np(x.data, fallback.data, nodata)

    return xr.DataArray(data,
                        attrs=attrs,
                        dims=x.dims,
                        coords=x.coords,
                        name=x.name)
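# A hypothetical stand-in for the _gap_fill_np helper used above (its body is
# not shown here): keep x where it is valid, otherwise take fallback.
import numpy as np

def gap_fill_np_sketch(x, fallback, nodata):
    if isinstance(nodata, float) and np.isnan(nodata):
        missing = np.isnan(x)
    else:
        missing = x == nodata
    return np.where(missing, fallback, x)

x = np.array([1, -999, 3], dtype='int16')
fb = np.array([9, 2, 9], dtype='int16')
print(gap_fill_np_sketch(x, fb, -999))   # [1 2 3]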
def precompute(self, cache_dir=None, swath_usage=0, **kwargs):
    """Generate row and column arrays and store it for later use."""
    if kwargs.get('mask') is not None:
        LOG.warning("'mask' parameter has no effect during EWA "
                    "resampling")

    del kwargs
    source_geo_def = self.source_geo_def
    target_geo_def = self.target_geo_def

    if cache_dir:
        LOG.warning("'cache_dir' is not used by EWA resampling")

    # SatPy/PyResample don't support dynamic grids out of the box yet
    lons, lats = source_geo_def.get_lonlats()
    if isinstance(lons, xr.DataArray):
        # get dask arrays
        lons = lons.data
        lats = lats.data

    # we are remapping to a static unchanging grid/area with all of
    # its parameters specified
    chunks = (2,) + lons.chunks
    res = da.map_blocks(self._call_ll2cr, lons, lats,
                        target_geo_def, swath_usage,
                        dtype=lons.dtype, chunks=chunks, new_axis=[0])
    cols = res[0]
    rows = res[1]

    # save the dask arrays in the class instance cache
    # the on-disk cache will store the numpy arrays
    self.cache = {
        "rows": rows,
        "cols": cols,
    }

    return None
def get_slp(daskArray, omp_threads=1):
    t = fetch_variable(daskArray, "T")
    p = fetch_variable(daskArray, "P")
    pb = fetch_variable(daskArray, "PB")
    qvapor = fetch_variable(daskArray, "QVAPOR", include_meta=True)
    ph = fetch_variable(daskArray, "PH")
    phb = fetch_variable(daskArray, "PHB")
    dtype = p.dtype

    full_t = map_blocks(wrapped_add, t, Constants.T_BASE, dtype=dtype)
    full_p = map_blocks(wrapped_add, p, pb, dtype=dtype)
    qvapor = qvapor.where(qvapor >= 0, 0)
    del t
    del p
    del pb

    pre_full_ph = map_blocks(wrapped_add, ph, phb, dtype=dtype)
    full_ph = map_blocks(wrapped_div, pre_full_ph, Constants.G, dtype=dtype)
    destag_ph = wrapped_destagger(full_ph, -3)
    del full_ph
    del ph
    del phb

    tk = map_blocks(tk_wrap, full_p, full_t, omp_threads, dtype=dtype)
    slp = map_blocks(slp_wrap, destag_ph, tk, full_p, qvapor.data,
                     omp_threads, dtype=dtype)
    slp_calc = slp
    return slp_calc
import dask.array as da
from dask.distributed import Client
import numpy as np

client = Client(scheduler_file='/home/mm/scheduler.json')
client

ntime = 200
npool = 9
npatch = 10
nland = 1000
chunk_shape = (ntime, npool, npool, npatch, 1)
narr = np.ones(chunk_shape)
#print(narr)
arr1 = da.from_array(narr)
#print(arr1)
garr1 = da.stack([arr1 for i in range(nland)], axis=4)
#print(garr1)

def myfunc(chunk1, chunk2):
    return chunk1 + chunk2

res = da.map_blocks(myfunc, garr1, garr1)
res
# %time res.compute()
def local_affine_to_position_field(shape, spacing, local_affines, output, blocksize=[ 256, ] * 3): """ """ with distributed.distributedState() as ds: # get number of jobs needed block_grid = np.ceil(np.array(shape) / blocksize).astype(int) nblocks = np.prod(block_grid) # set up the cluster ds.initializeLSFCluster( job_extra=["-P multifish"], cores=4, memory="64GB", ncpus=4, threads_per_worker=8, mem=64000, ) ds.initializeClient() ds.scaleCluster(njobs=nblocks) # augment the blocksize by the fixed overlap size pads = [2 * int(round(x / 8)) for x in blocksize] blocksize_with_overlap = np.array(blocksize) + pads # get a grid used for each affine grid = position_grid_dask(blocksize_with_overlap, list(blocksize_with_overlap)) grid = grid * spacing.astype(np.float32) # wrap local_affines as dask array local_affines_da = da.from_array(local_affines, chunks=(1, 1, 1, 3, 4)) # compute affine transforms as position coordinates, lazy dask arrays coords = da.map_blocks( affine_to_grid_dask, local_affines_da, grid=grid, displacement=True, new_axis=[5, 6], chunks=( 1, 1, 1, ) + tuple(grid.shape), dtype=np.float32, ) # stitch affine position fields coords = stitch.stitch_fields(coords, blocksize) # crop to original shape coords = coords[:shape[0], :shape[1], :shape[2]] # convert to position field coords = coords + position_grid_dask( shape, blocksize) * spacing.astype(np.float32) coords = da.around(coords, decimals=2) # write in parallel as 3D array to zarr file compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE) coords_disk = zarr.open( output, 'w', shape=coords.shape, chunks=tuple(blocksize + [ 3, ]), dtype=coords.dtype, compressor=compressor, ) da.to_zarr(coords, coords_disk) # return pointer to zarr file return coords_disk
def preprocessing(dir_root, save_root, cameraNoiseMat=cameraNoiseMat, nsplit = (4, 4), num_t_chunks = 80,\ dask_tmp=None, memory_limit=0, is_bz2=False, is_singlePlane=False, down_sample_registration=1): from ..utils.getCameraInfo import getCameraInfo from tqdm import tqdm from ..utils.fileio import du # set worker cluster, client = fdask.setup_workers(is_local=True, dask_tmp=dask_tmp, memory_limit=memory_limit) print_client_links(cluster) if isinstance(save_root, list): save_root_ext = save_root[1] save_root = save_root[0] print(f'Tmp files will be saved to {save_root}') if 'save_root_ext' in locals(): print(f'With extended drive to {save_root_ext}') if not os.path.exists(f'{save_root}/denoised_data.zarr'): print('========================') print('Getting data infos') if not is_bz2: files = sorted(glob(dir_root+'/*.h5')) chunks = File(files[0],'r')['default'].shape if not is_singlePlane: data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files]) else: if len(chunks)==2: data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files]) else: data = da.concatenate([da.from_array(File(fn,'r')['default'], chunks=(1, chunks[1], chunks[2])) for fn in files], axis=0) cameraInfo = getCameraInfo(dir_root) else: import xml.etree.ElementTree as ET from utils import load_bz2file dims = ET.parse(dir_root+'/ch0.xml') root = dims.getroot() for info in root.findall('info'): if info.get('dimensions'): dims = info.get('dimensions') dims = dims.split('x') dims = [int(float(num)) for num in dims] files = sorted(glob(dir_root+'/*.stack.bz2')) imread = dask.delayed(lambda v: load_bz2file(v, dims), pure=True) lazy_data = [imread(fn) for fn in files] sample = lazy_data[0].compute() data = da.stack([da.from_delayed(fn, shape=sample.shape, dtype=sample.dtype) for fn in lazy_data]) cameraInfo = getCameraInfo(dir_root) pixel_x0, pixel_x1, pixel_y0, pixel_y1 = [int(_) for _ in cameraInfo['camera_roi'].split('_')] pixel_x0 = pixel_x0-1 pixel_y0 = pixel_y0-1 cameraInfo['camera_roi'] = '%d_%d_%d_%d'%(pixel_x0, pixel_x1, pixel_y0, pixel_y1) chunks = sample.shape # pixel denoise print('========================') print('Denoising camera noise') if not is_singlePlane: denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo)) else: denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo), new_axis=1) print('Denoising camera noise -- save data') denoised_data.to_zarr(f'{save_root}/denoised_data.zarr') num_t = denoised_data.shape[0] print('Denoising camera noise -- load saved data') denoised_data = da.from_zarr(f'{save_root}/denoised_data.zarr') chunks = denoised_data.shape[1:] num_t = denoised_data.shape[0] # save and compute reference image print('Compute reference image ---') if not os.path.exists(f'{save_root}/motion_fix_.h5'): med_win = len(denoised_data)//2 ref_img = denoised_data[med_win-50:med_win+50].mean(axis=0).compute() save_h5(f'{save_root}/motion_fix_.h5', ref_img, dtype='float16') print('--- Done computing reference image') # compute affine transform print('Registration to reference image ---') # create trans_affs file if not os.path.exists(f'{save_root}/trans_affs.npy'): ref_img = File(f'{save_root}/motion_fix_.h5', 'r')['default'].value ref_img = ref_img.max(axis=0, keepdims=True) if down_sample_registration==1: trans_affine = denoised_data.map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), 
chunks=(1,4,4)).compute() else: #### downsample trans_affine case trans_affine = denoised_data[0::down_sample_registration].map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), chunks=(1,4,4)).compute() len_dat = denoised_data.shape[0] trans_affine = rigid_interp(trans_affine, down_sample_registration, len_dat) # save trans_affs file np.save(f'{save_root}/trans_affs.npy', trans_affine) # load trans_affs file trans_affine_ = np.load(f'{save_root}/trans_affs.npy') trans_affine_ = da.from_array(trans_affine_, chunks=(1,4,4)) print('--- Done registration reference image') # apply affine transform if not os.path.exists(f'{save_root}/motion_corrected_data.zarr'): # fix memory issue to load data all together for transpose on local machine # load data # swap axes splits_ = np.array_split(np.arange(num_t).astype('int'), num_t_chunks) print(f'Processing total {num_t_chunks} chunks in time.......') # estimate size of data to store used_ = du(f'{save_root}/denoised_data.zarr/') est_data_size = int(used_.decode('utf-8'))//(2**20*num_t_chunks*2)+5 #kb to Gb for nz, n_split in enumerate(splits_): if not os.path.exists(save_root+'/motion_corrected_data_chunks_%03d.zarr'%(nz)): if 'save_root_ext' in locals(): if os.path.exists(save_root_ext+'/motion_corrected_data_chunks_%03d.zarr'%(nz)): continue print('Apply registration to rechunk layer %03d'%(nz)) trans_data_ = da.map_blocks(apply_transform3d, denoised_data[n_split], trans_affine_[n_split], chunks=(1, *denoised_data.shape[1:]), dtype='float16') print('Starting to rechunk layer %03d'%(nz)) trans_data_t_z = trans_data_.rechunk((-1, 1, chunks[1]//nsplit[0], chunks[2]//nsplit[1])).transpose((1, 2, 3, 0)) # check space availablity _, _, free_ = shutil.disk_usage(f'{save_root}/') if (free_//(2**30)) > est_data_size: print(f'Remaining space {free_//(2**30)} GB..... -- start to save at {save_root}') trans_data_t_z.to_zarr(save_root+'/motion_corrected_data_chunks_%03d.zarr'%(nz)) else: try: print(f'Remaining space {free_//(2**30)} GB..... -- start to save at {save_root_ext}') trans_data_t_z.to_zarr(save_root_ext+'/motion_corrected_data_chunks_%03d.zarr'%(nz)) except Exception as e: # if any error -- break the code print(e) fdask.terminate_workers(cluster, client) return None del trans_data_t_z gc.collect() print('finishing rechunking time chunk -- %03d of %03d'%(nz, num_t_chunks)) print('Remove temporal files of registration') if os.path.exists(f'{save_root}/denoised_data.zarr'): shutil.rmtree(f'{save_root}/denoised_data.zarr') for ext_files in tqdm(glob(save_root_ext+'/motion_corrected_data_chunks_*.zarr')): print(f'Moving file {ext_files} to Tmp-file folder.....') shutil.move(ext_files, save_root+'/') fdask.terminate_workers(cluster, client) return None
def preprocessing_cluster(dir_root, save_root, cameraNoiseMat=cameraNoiseMat, nsplit = (4, 4), num_t_chunks = 80,\ dask_tmp=None, memory_limit=0, is_bz2=False, is_singlePlane=False, down_sample_registration=1): from ..utils.getCameraInfo import getCameraInfo # set worker cluster, client = fdask.setup_workers(numCore=200, is_local=False, dask_tmp=dask_tmp, memory_limit=memory_limit) print_client_links(cluster) if not os.path.exists(f'{save_root}/denoised_data.zarr'): if not is_bz2: files = sorted(glob(dir_root+'/*.h5')) chunks = File(files[0],'r')['default'].shape if not is_singlePlane: data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files]) else: if len(chunks)==2: data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files]) else: data = da.concatenate([da.from_array(File(fn,'r')['default'], chunks=(1, chunks[1], chunks[2])) for fn in files], axis=0) cameraInfo = getCameraInfo(dir_root) else: import xml.etree.ElementTree as ET from utils import load_bz2file dims = ET.parse(dir_root+'/ch0.xml') root = dims.getroot() for info in root.findall('info'): if info.get('dimensions'): dims = info.get('dimensions') dims = dims.split('x') dims = [int(float(num)) for num in dims] files = sorted(glob(dir_root+'/*.stack.bz2')) imread = dask.delayed(lambda v: load_bz2file(v, dims), pure=True) lazy_data = [imread(fn) for fn in files] sample = lazy_data[0].compute() data = da.stack([da.from_delayed(fn, shape=sample.shape, dtype=sample.dtype) for fn in lazy_data]) cameraInfo = getCameraInfo(dir_root) pixel_x0, pixel_x1, pixel_y0, pixel_y1 = [int(_) for _ in cameraInfo['camera_roi'].split('_')] pixel_x0 = pixel_x0-1 pixel_y0 = pixel_y0-1 cameraInfo['camera_roi'] = '%d_%d_%d_%d'%(pixel_x0, pixel_x1, pixel_y0, pixel_y1) chunks = sample.shape # pixel denoise if not is_singlePlane: denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo)) else: denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo), new_axis=1) denoised_data.to_zarr(f'{save_root}/denoised_data.zarr') num_t = denoised_data.shape[0] else: denoised_data = da.from_zarr(f'{save_root}/denoised_data.zarr') chunks = denoised_data.shape[1:] num_t = denoised_data.shape[0] # save and compute reference image print('Compute reference image ---') if not os.path.exists(f'{save_root}/motion_fix_.h5'): med_win = len(denoised_data)//2 ref_img = denoised_data[med_win-50:med_win+50].mean(axis=0).compute() save_h5(f'{save_root}/motion_fix_.h5', ref_img, dtype='float16') print('--- Done computing reference image') # compute affine transform print('Registration to reference image ---') # create trans_affs file if not os.path.exists(f'{save_root}/trans_affs.npy'): ref_img = File(f'{save_root}/motion_fix_.h5', 'r')['default'].value ref_img = ref_img.max(axis=0, keepdims=True) if down_sample_registration==1: trans_affine = denoised_data.map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), chunks=(1,4,4)).compute() else: #### downsample trans_affine case trans_affine = denoised_data[0::down_sample_registration].map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), chunks=(1,4,4)).compute() len_dat = denoised_data.shape[0] trans_affine = rigid_interp(trans_affine, down_sample_registration, len_dat) # save trans_affs file np.save(f'{save_root}/trans_affs.npy', trans_affine) # load trans_affs file trans_affine_ = 
np.load(f'{save_root}/trans_affs.npy') trans_affine_ = da.from_array(trans_affine_, chunks=(1,4,4)) print('--- Done registration reference image') trans_data_ = da.map_blocks(apply_transform3d, denoised_data, trans_affine_, chunks=(1, *denoised_data.shape[1:]), dtype='float16') trans_data_t = trans_data_.rechunk((-1, 1, chunks[1]//nsplit[0], chunks[2]//nsplit[1])).transpose((1, 2, 3, 0)) trans_data_t.to_zarr(f'{save_root}/motion_corrected_data.zarr') fdask.terminate_workers(cluster, client) print('Remove temporal files of registration') if os.path.exists(f'{save_root}/denoised_data.zarr'): shutil.rmtree(f'{save_root}/denoised_data.zarr') return None
def xr_geomedian(ds, axis="time", where=None, **kw): """ :param ds: xr.Dataset|xr.DataArray|numpy array Other parameters: **kwargs -- passed on to pcm.gnmpcm maxiters : int 1000 eps : float 0.0001 num_threads: int| None None """ from hdstats import nangeomedian_pcm def norm_input(ds, axis): if isinstance(ds, xr.DataArray): xx = ds if len(xx.dims) != 4: raise ValueError("Expect 4 dimensions on input: y,x,band,time") if axis is not None and xx.dims[3] != axis: raise ValueError( f"Can only reduce last dimension, expect: y,x,band,{axis}") return None, xx, xx.data elif isinstance(ds, xr.Dataset): xx = reshape_for_geomedian(ds, axis) return ds, xx, xx.data else: # assume numpy or similar xx_data = ds if xx_data.ndim != 4: raise ValueError("Expect 4 dimensions on input: y,x,band,time") return None, None, xx_data kw.setdefault("nocheck", True) kw.setdefault("num_threads", 1) kw.setdefault("eps", 1e-6) ds, xx, xx_data = norm_input(ds, axis) is_dask = dask.is_dask_collection(xx_data) if where is not None: if is_dask: raise NotImplementedError( "Dask version doesn't support output masking currently") if where.shape != xx_data.shape[:2]: raise ValueError("Shape for `where` parameter doesn't match") set_nan = ~where else: set_nan = None if is_dask: if xx_data.shape[-2:] != xx_data.chunksize[-2:]: xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1)) data = da.map_blocks( lambda x: nangeomedian_pcm(x, **kw), xx_data, name=randomize("geomedian"), dtype=xx_data.dtype, drop_axis=3, ) else: data = nangeomedian_pcm(xx_data, **kw) if set_nan is not None: data[set_nan, :] = np.nan if xx is None: return data dims = xx.dims[:-1] cc = {k: xx.coords[k] for k in dims} xx_out = xr.DataArray(data, dims=dims, coords=cc) if ds is None: xx_out.attrs.update(xx.attrs) return xx_out ds_out = xx_out.to_dataset(dim="band") for b in ds.data_vars.keys(): src, dst = ds[b], ds_out[b] dst.attrs.update(src.attrs) return ds_out
def get_reflectance(self, sun_zenith, sat_zenith, azidiff, bandname, redband=None): """Get the reflectance from the three sun-sat angles""" # Get wavelength in nm for band: if isinstance(bandname, float): LOG.warning('A wavelength is provided instead of band name - ' + 'disregard the relative spectral responses and assume ' + 'it is the effective wavelength: %f (micro meter)', bandname) wvl = bandname * 1000.0 else: wvl = self.get_effective_wavelength(bandname) wvl = wvl * 1000.0 rayl, wvl_coord, azid_coord, satz_sec_coord, sunz_sec_coord = self.get_reflectance_lut() # force dask arrays compute = False if HAVE_DASK and not isinstance(sun_zenith, Array): compute = True sun_zenith = from_array(sun_zenith, chunks=sun_zenith.shape) sat_zenith = from_array(sat_zenith, chunks=sat_zenith.shape) azidiff = from_array(azidiff, chunks=azidiff.shape) if redband is not None: redband = from_array(redband, chunks=redband.shape) clip_angle = rad2deg(arccos(1. / sunz_sec_coord.max())) sun_zenith = clip(sun_zenith, 0, clip_angle) sunzsec = 1. / cos(deg2rad(sun_zenith)) clip_angle = rad2deg(arccos(1. / satz_sec_coord.max())) sat_zenith = clip(sat_zenith, 0, clip_angle) satzsec = 1. / cos(deg2rad(sat_zenith)) shape = sun_zenith.shape if not(wvl_coord.min() < wvl < wvl_coord.max()): LOG.warning( "Effective wavelength for band %s outside 400-800 nm range!", str(bandname)) LOG.info( "Set the rayleigh/aerosol reflectance contribution to zero!") if HAVE_DASK: chunks = sun_zenith.chunks if redband is None else redband.chunks res = zeros(shape, chunks=chunks) return res.compute() if compute else res else: return zeros(shape) idx = np.searchsorted(wvl_coord, wvl) wvl1 = wvl_coord[idx - 1] wvl2 = wvl_coord[idx] fac = (wvl2 - wvl) / (wvl2 - wvl1) raylwvl = fac * rayl[idx - 1, :, :, :] + (1 - fac) * rayl[idx, :, :, :] tic = time.time() smin = [sunz_sec_coord[0], azid_coord[0], satz_sec_coord[0]] smax = [sunz_sec_coord[-1], azid_coord[-1], satz_sec_coord[-1]] orders = [ len(sunz_sec_coord), len(azid_coord), len(satz_sec_coord)] f_3d_grid = atleast_2d(raylwvl.ravel()) if HAVE_DASK and isinstance(smin[0], Array): # compute all of these at the same time before passing to the interpolator # otherwise they are computed separately smin, smax, orders, f_3d_grid = da.compute(smin, smax, orders, f_3d_grid) minterp = MultilinearInterpolator(smin, smax, orders) minterp.set_values(f_3d_grid) if HAVE_DASK: ipn = map_blocks(self._do_interp, minterp, sunzsec, azidiff, satzsec, dtype=raylwvl.dtype, chunks=azidiff.chunks) else: ipn = self._do_interp(minterp, sunzsec, azidiff, satzsec) LOG.debug("Time - Interpolation: {0:f}".format(time.time() - tic)) ipn *= 100 res = ipn if redband is not None: res = where(redband < 20., res, (1 - (redband - 20) / 80) * res) res = clip(res, 0, 100) if compute: res = res.compute() return res
def Fst( ds: Dataset, *, estimator: Optional[str] = None, stat_divergence: Hashable = variables.stat_divergence, merge: bool = True, ) -> Dataset: """Compute Fst between pairs of cohorts. By default, values of this statistic are calculated per variant. To compute values in windows, call :func:`window` before calling this function. Parameters ---------- ds Genotype call dataset. estimator Determines the formula to use for computing Fst. If None (the default), or ``Hudson``, Fst is calculated using the method of Hudson (1992) elaborated by Bhatia et al. (2013), (the same estimator as scikit-allel). Other supported estimators include ``Nei`` (1986), (the same estimator as tskit). stat_divergence Divergence variable to use or calculate. Defined by :data:`sgkit.variables.stat_divergence_spec`. If the variable is not present in ``ds``, it will be computed using :func:`divergence`. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing the Fst value between pairs of cohorts, as defined by :data:`sgkit.variables.stat_Fst_spec`. Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing information is available. Warnings -------- This method does not currently support datasets that are chunked along the samples dimension. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4) >>> # Divide samples into two cohorts >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE array([[[ nan, -0.16666667], [-0.16666667, nan]], <BLANKLINE> [[ nan, -0.16666667], [-0.16666667, nan]], <BLANKLINE> [[ nan, -0.33333333], [-0.33333333, nan]], <BLANKLINE> [[ nan, -0.33333333], [-0.33333333, nan]], <BLANKLINE> [[ nan, 0.2 ], [ 0.2 , nan]]]) >>> # Divide into windows of size three (variants) >>> ds = sg.window(ds, size=3) >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE array([[[ nan, -0.22222222], [-0.22222222, nan]], <BLANKLINE> [[ nan, 0. ], [ 0. , nan]]]) """ known_estimators = {"Hudson": _Fst_Hudson, "Nei": _Fst_Nei} if estimator is not None and estimator not in known_estimators: raise ValueError( f"Estimator '{estimator}' is not a known estimator: {known_estimators.keys()}" ) estimator = estimator or "Hudson" ds = define_variable_if_absent(ds, variables.stat_divergence, stat_divergence, divergence) variables.validate(ds, {stat_divergence: variables.stat_divergence_spec}) n_cohorts = ds.dims["cohorts"] gs = da.asarray(ds.stat_divergence) shape = (gs.chunks[0], n_cohorts, n_cohorts) fst = da.map_blocks(known_estimators[estimator], gs, chunks=shape, dtype=np.float64) # TODO: reinstate assert (first dim could be either variants or windows) # assert_array_shape(fst, n_windows, n_cohorts, n_cohorts) new_ds = create_dataset( {variables.stat_Fst: (("windows", "cohorts_0", "cohorts_1"), fst)}) return conditional_merge_datasets(ds, new_ds, merge)
def xr_geomedian_tmad(ds, axis='time', where=None, **kw): """ :param ds: xr.Dataset|xr.DataArray|numpy array Other parameters: **kwargs -- passed on to pcm.gnmpcm maxiters : int 1000 eps : float 0.0001 num_threads: int| None None """ import hdstats def gm_tmad(arr, **kw): """ arr: a high dimensional numpy array where the last dimension will be reduced. returns: a numpy array with one less dimension than input. """ gm = hdstats.nangeomedian_pcm(arr, **kw) nt = kw.pop('num_threads', None) emad = hdstats.emad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis] smad = hdstats.smad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis] bcmad = hdstats.bcmad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis] return np.concatenate([gm, emad, smad, bcmad], axis=-1) def norm_input(ds, axis): if isinstance(ds, xr.DataArray): xx = ds if len(xx.dims) != 4: raise ValueError("Expect 4 dimensions on input: y,x,band,time") if axis is not None and xx.dims[3] != axis: raise ValueError(f"Can only reduce last dimension, expect: y,x,band,{axis}") return None, xx, xx.data elif isinstance(ds, xr.Dataset): xx = reshape_for_geomedian(ds, axis) return ds, xx, xx.data else: # assume numpy or similar xx_data = ds if xx_data.ndim != 4: raise ValueError("Expect 4 dimensions on input: y,x,band,time") return None, None, xx_data kw.setdefault('nocheck', False) kw.setdefault('num_threads', 1) kw.setdefault('eps', 1e-6) ds, xx, xx_data = norm_input(ds, axis) is_dask = dask.is_dask_collection(xx_data) if where is not None: if is_dask: raise NotImplementedError("Dask version doesn't support output masking currently") if where.shape != xx_data.shape[:2]: raise ValueError("Shape for `where` parameter doesn't match") set_nan = ~where else: set_nan = None if is_dask: if xx_data.shape[-2:] != xx_data.chunksize[-2:]: xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1)) data = da.map_blocks(lambda x: gm_tmad(x, **kw), xx_data, name=randomize('geomedian'), dtype=xx_data.dtype, chunks=xx_data.chunks[:-2] + (xx_data.chunks[-2][0]+3,), drop_axis=3) else: data = gm_tmad(xx_data, **kw) if set_nan is not None: data[set_nan, :] = np.nan if xx is None: return data dims = xx.dims[:-1] cc = {k: xx.coords[k] for k in dims} cc[dims[-1]] = np.hstack([xx.coords[dims[-1]].values,['edev', 'sdev', 'bcdev']]) xx_out = xr.DataArray(data, dims=dims, coords=cc) if ds is None: xx_out.attrs.update(xx.attrs) return xx_out ds_out = xx_out.to_dataset(dim='band') for b in ds.data_vars.keys(): src, dst = ds[b], ds_out[b] dst.attrs.update(src.attrs) return assign_crs(ds_out, crs=ds.geobox.crs)
def corrcoef(b1, b2, block_shape):   # signature inferred from the map_blocks call below
    # subtract mean
    axes = tuple(np.arange(b1.ndim, dtype=int)[b1.ndim//2:])
    b1 -= b1.mean(axis=axes, keepdims=True)
    b2 -= b2.mean(axis=axes, keepdims=True)
    # numerator of corrcoef
    numerator = np.multiply(b1, b2).mean(axis=axes, keepdims=False)
    # denominator of corrcoef
    dof = np.prod(b1.shape[slice(axes[0], axes[-1]+1)])
    b1_std = np.sqrt((b1**2).mean(axis=axes, keepdims=False) / dof)
    b2_std = np.sqrt((b2**2).mean(axis=axes, keepdims=False) / dof)
    denominator = np.multiply(b1_std, b2_std)
    # divide
    out = np.divide(numerator, denominator)
    return out


if __name__ == '__main__':
    f1 = h5py.File("test.h5", "r")
    f2 = h5py.File("test2.h5", "r")

    arr1 = da.from_array(f1["arr"])
    arr2 = da.from_array(f2["arr"])

    block_shape = (10, 10)

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof,\
            ProgressBar():
        out = da.map_blocks(corrcoef, arr1, arr2, block_shape,
                            chunks=(400, 400))
        da.to_hdf5("out.h5", "/arr", out)

    visualize([prof, rprof])
os.chdir("/home/ubuntu/observations") new = xr.open_dataset("observations.nc") anomalies_obs = new.air[1320:1632, :, :] anomalies_obs = anomalies_obs.reindex(lat=new['lat'], lon=new['lon'], method='nearest') obs = anomalies_obs anomaly_obs_slopes = np.zeros( [anomalies_obs.lat.shape[0], anomalies_obs.lon.shape[0]]) anomalies_obs_da = da.from_array(anomalies_obs.data, chunks=[312, 45, 45]) anomaly_obs_slopes = da.map_blocks(my_linregress, anomalies_obs_da, dtype=np.ndarray, drop_axis=[0]) anomaly_obs_slopes = anomaly_obs_slopes.compute(num_workers=num_workers) print('Linear regression slopes:') print(anomaly_obs_slopes) print('Units are temperature change for each data point in degrees/year.') end1 = time.time() section1 = end1 - start1 print(section1) ###################################################################################################### ''' SECTION 2 - COMPUTING MODEL MEAN (1950-1980)
def map_overlap(func, *args, depth=None, boundary=None, trim=True, align_arrays=True, **kwargs): """ Map a function over blocks of arrays with some overlap We share neighboring zones between blocks of the array, map a function, and then trim away the neighboring strips. Parameters ---------- func: function The function to apply to each extended block args : dask arrays depth: int, tuple, dict or list The number of elements that each block should share with its neighbors If a tuple or dict then this can be different per axis. If a list then each element of that list must be an int, tuple or dict defining depth for the corresponding array in `args`. Asymmetric depths may be specified using a dict value of (-/+) tuples. Note that asymmetric depths are currently only supported when ``boundary`` is 'none'. The default value is 0. boundary: str, tuple, dict or list How to handle the boundaries. Values include 'reflect', 'periodic', 'nearest', 'none', or any constant value like 0 or np.nan. If a list then each element must be a str, tuple or dict defining the boundary for the corresponding array in `args`. The default value is 'reflect'. trim: bool Whether or not to trim ``depth`` elements from each block after calling the map function. Set this to False if your mapping function already does this for you align_arrays: bool Whether or not to align chunks along equally sized dimensions when multiple arrays are provided. This allows for larger chunks in some arrays to be broken into smaller ones that match chunk sizes in other arrays such that they are compatible for block function mapping. If this is false, then an error will be thrown if arrays do not already have the same number of blocks in each dimensions. **kwargs: Other keyword arguments valid in ``map_blocks`` Examples -------- >>> import numpy as np >>> import dask.array as da >>> x = np.array([1, 1, 2, 3, 3, 3, 2, 1, 1]) >>> x = da.from_array(x, chunks=5) >>> def derivative(x): ... return x - np.roll(x, 1) >>> y = x.map_overlap(derivative, depth=1, boundary=0) >>> y.compute() array([ 1, 0, 1, 1, 0, 0, -1, -1, 0]) >>> x = np.arange(16).reshape((4, 4)) >>> d = da.from_array(x, chunks=(2, 2)) >>> d.map_overlap(lambda x: x + x.size, depth=1).compute() array([[16, 17, 18, 19], [20, 21, 22, 23], [24, 25, 26, 27], [28, 29, 30, 31]]) >>> func = lambda x: x + x.size >>> depth = {0: 1, 1: 1} >>> boundary = {0: 'reflect', 1: 'none'} >>> d.map_overlap(func, depth, boundary).compute() # doctest: +NORMALIZE_WHITESPACE array([[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23], [24, 25, 26, 27]]) """ # Look for invocation using deprecated single-array signature # map_overlap(x, func, depth, boundary=None, trim=True, **kwargs) if isinstance(func, Array) and callable(args[0]): warnings.warn( "Detected use of signature map_overlap(x, func) rather than " "map_overlap(func, *args) for multi-array support. 
Arguments " "will be swapped in this case but such an exception will not " "be made in a future release.", FutureWarning, ) sig = ["func", "depth", "boundary", "trim"] depth = get(sig.index("depth"), args, depth) boundary = get(sig.index("boundary"), args, boundary) trim = get(sig.index("trim"), args, trim) func, args = args[0], [func] if not callable(func): raise TypeError("First argument must be callable function, not {}\n" "Usage: da.map_overlap(function, x)\n" " or: da.map_overlap(function, x, y, z)".format( type(func).__name__)) if not all(isinstance(x, Array) for x in args): raise TypeError("All variadic arguments must be arrays, not {}\n" "Usage: da.map_overlap(function, x)\n" " or: da.map_overlap(function, x, y, z)".format( [type(x).__name__ for x in args])) # Coerce depth and boundary arguments to lists of individual # specifications for each array argument def coerce(xs, arg, fn): if not isinstance(arg, list): arg = [arg] * len(xs) return [fn(x.ndim, a) for x, a in zip(xs, arg)] depth = coerce(args, depth, coerce_depth) boundary = coerce(args, boundary, coerce_boundary) # Align chunks in each array to a common size if align_arrays: # Reverse unification order to allow block broadcasting inds = [list(reversed(range(x.ndim))) for x in args] _, args = da.core.unify_chunks(*list(concat(zip(args, inds))), warn=False) for i, x in enumerate(args): for j in range(x.ndim): if isinstance(depth[i][j], tuple) and boundary[i][j] != "none": raise NotImplementedError( "Asymmetric overlap is currently only implemented " "for boundary='none', however boundary for dimension " "{} in array argument {} is {}".format( j, i, boundary[i][j])) def assert_int_chunksize(xs): assert all(type(c) is int for x in xs for cc in x.chunks for c in cc) assert_int_chunksize(args) args = [ overlap(x, depth=d, boundary=b) for x, d, b in zip(args, depth, boundary) ] assert_int_chunksize(args) x = da.map_blocks(func, *args, **kwargs) assert_int_chunksize([x]) if trim: # Find index of array argument with maximum rank and break ties by choosing first provided i = sorted(enumerate(args), key=lambda v: (v[1].ndim, -v[0]))[-1][0] # Trim using depth/boundary setting for array of highest rank return trim_internal(x, depth[i], boundary[i]) else: return x
def run( self, pars, sims, sim_status, indices, collect_in_memory: bool = True, batch_size: Optional[int] = None, ): """Run the simulator on the input parameters. Args: pars: array with all the input parameters. Should have shape (num. samples, num. parameters) sims: dictionary of arrays where to store the simulation output. All arrays should have the number of samples as the size of the first dimension sim_status: array where to store the simulation status (size should be equal to the number of samples) indices: indices of the samples that need to be run by the simulator collect_in_memory: if True, collect the simulation output in memory; if False, instruct Dask workers to save the output to the corresponding arrays. The latter option is asynchronous, thus this method immediately returns. batch_size: simulations will be submitted in batches of the specified size """ self.set_dask_cluster(self.cluster) # open parameter array as Dask array chunks = getattr(pars, "chunks", "auto") z = da.from_array(pars, chunks=chunks) idx = da.from_array(indices, chunks=(batch_size or -1, )) z = z[idx] z = z.persist() # load the parameters in the distributed memory # block-wise run the model function on the parameter array out = da.map_blocks( _run_model_chunk, z, model=self.model, sim_shapes=self.sim_shapes, fail_on_non_finite=self.fail_on_non_finite, drop_axis=1, dtype=np.object, ) # FIXME: Deprecated? # print("Simulator: Running...") # bag = db.from_sequence(z, npartitions=npartitions) # bag = bag.map(_run_one_sample, self.model, self.fail_on_non_finite) # result = bag.compute(scheduler=self.client or "processes") # print("Simulator: ...done.") # return result # split result dictionary and simulation status array results = out.map_blocks(getitem, 0, dtype=np.object) status = out.map_blocks(getitem, 1, meta=np.array(()), dtype=np.int) # unpack array of dictionaries to dictionary of arrays result_dict = {} for obs, shape in self.sim_shapes.items(): result_dict[obs] = results.map_blocks( getitem, obs, new_axis=[i + 1 for i in range(len(shape))], chunks=(z.chunks[0], *shape), meta=np.array(()), dtype=np.float, ) sources = [result_dict[k] for k in self.sim_shapes.keys()] targets = [sims[k] for k in self.sim_shapes.keys()] if collect_in_memory: # submit computation and collect results *sources, status = self.client.compute([*sources, status], sync=True) # update simulation results for source, target in zip(sources, targets): target[indices.tolist()] = source # finally, update the simulation status sim_status[indices.tolist()] = status else: sources = da.store( sources=sources, targets=targets, regions=(indices.tolist(), ), lock=False, compute=False, return_stored=True, ) # submit computation *sources, status = self.client.persist([*sources, status]) # the following dummy array is generated after results are stored. zeros_when_done = [ source.map_blocks( lambda x: np.zeros(x.shape[0], dtype=np.int), chunks=(source.chunks[0], ), drop_axis=[i for i in range(1, source.ndim)], meta=np.array((), dtype=np.int), dtype=np.int, ) for source in sources ] status = sum([*zeros_when_done, status]) status = status.store( target=sim_status, regions=(indices.tolist(), ), lock=False, compute=False, return_stored=True, ) # when the simulation results are stored, we can update the status status = self.client.persist(status) fire_and_forget(status)
def int_geomedian(ds, scale=1, offset=0, wk_rows=-1, as_array=False, **kw): """ds -- xr.Dataset (possibly dask) with dims: (time, y, x) for each band on output time dimension is removed :param ds: Dataset with int data variables :param scale: Normalize data for running computation (output is scaled back to original values) :param offset: ``(x*scale + offset)`` :param wk_rows: reduce memory requirements by processing that many rows of a chunk at a time :param as_array: If set to True return DataArray with band dimension instead of Dataset :param kw: Passed on to hdstats (eps=1e-4, num_threads=1, maxiters=10_000, nocheck=True) """ band_names = [dv.name for dv in ds.data_vars.values()] xx, *_ = ds.data_vars.values() nodata = getattr(xx, "nodata", None) is_dask = dask.is_dask_collection(xx) if is_dask: if xx.data.chunksize[0] != xx.shape[0]: ds = ds.chunk(chunks={xx.dims[0]: -1}) xx, *_ = ds.data_vars.values() nt, ny, nx = xx.shape bands = [dv.data for dv in ds.data_vars.values()] band = bands[0] nb = len(bands) dtype = band.dtype kw.setdefault("nocheck", True) kw.setdefault("num_threads", 1) kw.setdefault("eps", 1e-4) kw.setdefault("maxiters", 10_000) if is_dask: chunks = ((nb, ), *xx.chunks[1:]) data = da.map_blocks( int_geomedian_np, *bands, nodata=nodata, scale=scale, offset=offset, wk_rows=wk_rows, **kw, name=randomize("geomedian"), dtype=dtype, chunks=chunks, drop_axis=[0], # time is dropped new_axis=[0], ) # band is added on the left else: data = int_geomedian_np(*bands, nodata=nodata, scale=scale, offset=offset, wk_rows=wk_rows, **kw) dims = ("band", *xx.dims[1:]) cc = {k: xx.coords[k] for k in dims[1:]} cc["band"] = band_names da_out = xr.DataArray(data, dims=dims, coords=cc) if as_array: if nodata is not None: da_out.attrs["nodata"] = nodata return da_out ds_out = da_out.to_dataset(dim="band") ds_out.attrs.update(ds.attrs) for b in ds.data_vars.keys(): src, dst = ds[b], ds_out[b] dst.attrs.update(src.attrs) return ds_out
def geomedian_with_mads(
    src: Union[xr.Dataset, xr.DataArray],
    compute_mads: bool = True,
    compute_count: bool = True,
    out_chunks: Optional[Tuple[int, int, int]] = None,
    reshape_strategy: str = "mem",
    scale: float = 1.0,
    offset: float = 0.0,
    eps: Optional[float] = None,
    maxiters: int = 1000,
    num_threads: int = 1,
    **kw,
) -> xr.Dataset:
    """
    Compute the Geomedian on a Dask-backed Dataset.

    NOTE: The default configuration of this code assumes that the entire input can
    be loaded into RAM on the Dask worker. It also assumes that there is only one
    worker in the cluster, or that the entire task will be scheduled on one single
    worker only. See the ``reshape_strategy`` parameter.

    :param src: xr.Dataset or a single array in YXBT order; bands can be either
       float or integer with `nodata` values to indicate gaps in data.

    :param compute_mads: Whether to compute smad, emad, bcmad statistics.

    :param compute_count: Whether to compute the count statistic (number of
       contributing observations per output pixel).

    :param out_chunks: Advanced option, allows rechunking the output internally;
       order is ``(ny, nx, nband)``.

    :param reshape_strategy: One of ``mem`` (default) or ``yxbt``. This is only
       applicable when supplying a Dataset object. It controls how the Dataset is
       reshaped into the DataArray format expected by the Geomedian code. If you
       have enough RAM and use a single-worker Dask cluster, use ``mem``; it should
       be the most efficient. If there is not enough RAM to load the entire input
       you can try ``yxbt`` mode, but you might still run out of RAM anyway. If
       using a multi-worker Dask cluster you have to use the ``yxbt`` strategy.

    :param scale, offset: Only used when the input contains integer values; the
       actual Geomedian will run on scaled values ``scale*X+offset``. This only
       affects the internal computation; the final result is scaled back to the
       original value range.

    :param eps: Termination criterion passed on to the geomedian algorithm.

    :param maxiters: Maximum number of iterations done per output pixel.

    :param num_threads: Configure internal concurrency of the Geomedian
       computation. Default is 1 as we assume that Dask will run a bunch of those
       concurrently.

    :param work_chunks: Default is ``(100, 100)``, only applicable when the input
       is a Dataset.
""" if not dask.is_dask_collection(src): raise ValueError("This method only works on Dask inputs") if isinstance(src, xr.DataArray): yxbt = src else: # TODO: better automatic defaults for work_chunks ny, nx = kw.get("work_chunks", (100, 100)) if reshape_strategy == "mem": yxbt = yxbt_sink(src, (ny, nx, -1, -1)) elif reshape_strategy == "yxbt": yxbt = reshape_yxbt(src, yx_chunks=(ny, nx)) else: raise ValueError( f"Reshape strategy '{reshape_strategy}' not understood use one of: mem or yxbt" ) ny, nx, nb, nt = yxbt.shape nodata = yxbt.attrs.get("nodata", None) assert yxbt.chunks is not None if yxbt.data.numblocks[2:4] != (1, 1): raise ValueError( "There should be one dask block along time and band dimension") n_extras = (3 if compute_mads else 0) + (1 if compute_count else 0) chunks = (*yxbt.chunks[:2], (nb + n_extras, )) is_float = yxbt.dtype.kind == "f" if eps is None: eps = 1e-4 if is_float else 0.1 * scale op = functools.partial( _gm_mads_compute_f32, compute_mads=compute_mads, compute_count=compute_count, nodata=nodata, scale=scale, offset=offset, eps=eps, maxiters=maxiters, num_threads=num_threads, ) _gm = da.map_blocks(op, yxbt.data, dtype="float32", drop_axis=3, chunks=chunks, name="geomedian") if out_chunks is not None: _gm = _gm.rechunk(out_chunks) gm_data = _gm[:, :, :nb] if not is_float: gm_data = da.map_blocks( lambda x: from_float_np( x, yxbt.dtype, nodata, scale=1 / scale, offset=offset / scale), gm_data, dtype=yxbt.dtype, ) dims = yxbt.dims[:3] coords = {k: yxbt.coords[k] for k in dims} result = xr.DataArray(data=gm_data, dims=dims, coords=coords, attrs=yxbt.attrs).to_dataset("band") for dv in result.data_vars.values(): dv.attrs.update(yxbt.attrs) next_stat = nb if compute_mads: smad = _gm[:, :, next_stat + 0] emad = _gm[:, :, next_stat + 1] bcmad = _gm[:, :, next_stat + 2] next_stat += 3 if not is_float: emad = emad * (1 / scale) result["smad"] = xr.DataArray(data=smad, dims=dims[:2], coords=result.coords) result["emad"] = xr.DataArray(data=emad, dims=dims[:2], coords=result.coords) result["bcmad"] = xr.DataArray(data=bcmad, dims=dims[:2], coords=result.coords) if compute_count: count = _gm[:, :, next_stat].astype("uint16") next_stat += 1 result["count"] = xr.DataArray(data=count, dims=dims[:2], coords=result.coords) return result
def run_crefl(refl, coeffs,
              lon, lat,
              sensor_azimuth, sensor_zenith,
              solar_azimuth, solar_zenith,
              avg_elevation=None,
              percent=False, use_abi=False):
    """Run the main CREFL algorithm.

    All input parameters are per-pixel values, meaning they are the same size
    and shape as the input reflectance data, unless otherwise stated.

    :param refl: tuple of reflectance band arrays
    :param coeffs: tuple of coefficients for each band (see `get_coefficients`)
    :param lon: input swath longitude array
    :param lat: input swath latitude array
    :param sensor_azimuth: input swath sensor azimuth angle array
    :param sensor_zenith: input swath sensor zenith angle array
    :param solar_azimuth: input swath solar azimuth angle array
    :param solar_zenith: input swath solar zenith angle array
    :param avg_elevation: average elevation (usually pre-calculated and stored
                          in CMGDEM.hdf)
    :param percent: True if input reflectances are on a 0-100 scale instead of
                    a 0-1 scale (default: False)

    """
    # FUTURE: Find a way to compute the average elevation beforehand
    # Get digital elevation map data for our granule, set ocean fill value to 0
    if avg_elevation is None:
        LOG.debug("No average elevation information provided in CREFL")
        # height = np.zeros(lon.shape, dtype=np.float)
        height = 0.
    else:
        LOG.debug("Using average elevation information provided to CREFL")
        lat[(lat <= -90) | (lat >= 90)] = np.nan
        lon[(lon <= -180) | (lon >= 180)] = np.nan
        row = ((90.0 - lat) * avg_elevation.shape[0] / 180.0).astype(np.int32)
        col = ((lon + 180.0) * avg_elevation.shape[1] / 360.0).astype(np.int32)
        space_mask = da.isnull(lon) | da.isnull(lat)
        row[space_mask] = 0
        col[space_mask] = 0

        def _avg_elevation_index(avg_elevation, row, col):
            return avg_elevation[row, col]

        height = da.map_blocks(_avg_elevation_index, avg_elevation, row, col,
                               dtype=avg_elevation.dtype)
        height = xr.DataArray(height, dims=['y', 'x'])
        # negative heights aren't allowed, clip to 0
        height = height.where((height >= 0.) & ~space_mask, 0.0)
        del lat, lon, row, col

    mus = da.cos(da.deg2rad(solar_zenith))
    mus = mus.where(mus >= 0)
    muv = da.cos(da.deg2rad(sensor_zenith))
    phi = solar_azimuth - sensor_azimuth

    if use_abi:
        LOG.debug("Using ABI CREFL algorithm")
        a_O3 = [268.45, 0.5, 115.42, -3.2922]
        a_H2O = [0.0311, 0.1, 92.471, -1.3814]
        a_O2 = [0.4567, 0.007, 96.4884, -1.6970]
        G_O3 = G_calc(solar_zenith, a_O3) + G_calc(sensor_zenith, a_O3)
        G_H2O = G_calc(solar_zenith, a_H2O) + G_calc(sensor_zenith, a_H2O)
        G_O2 = G_calc(solar_zenith, a_O2) + G_calc(sensor_zenith, a_O2)
        # Note: bh2o values are actually ao2 values for abi
        sphalb, rhoray, TtotraytH2O, tOG = get_atm_variables_abi(
            mus, muv, phi, height, G_O3, G_H2O, G_O2, *coeffs)
    else:
        LOG.debug("Using original VIIRS CREFL algorithm")
        sphalb, rhoray, TtotraytH2O, tOG = get_atm_variables(
            mus, muv, phi, height, *coeffs)

    del solar_azimuth, solar_zenith, sensor_zenith, sensor_azimuth

    # Note: Assume that fill/invalid values are either NaN or we are dealing
    #       with masked arrays
    if percent:
        corr_refl = ((refl / 100.) / tOG - rhoray) / TtotraytH2O
    else:
        corr_refl = (refl / tOG - rhoray) / TtotraytH2O
    corr_refl /= (1.0 + corr_refl * sphalb)
    return corr_refl.clip(REFLMIN, REFLMAX)
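# The DEM lookup above computes integer row/column indices from lat/lon and
# gathers elevations per block with map_blocks. A simplified sketch follows;
# the DEM here is a toy 10-degree grid and is captured in a closure rather
# than passed positionally as in the original.
import numpy as np
import dask.array as da

avg_elevation = np.arange(18 * 36, dtype="float32").reshape(18, 36)  # toy DEM

lat = da.from_array(np.array([[45.0, -30.0]]), chunks=1)
lon = da.from_array(np.array([[10.0, 120.0]]), chunks=1)

row = ((90.0 - lat) * avg_elevation.shape[0] / 180.0).astype(np.int32)
col = ((lon + 180.0) * avg_elevation.shape[1] / 360.0).astype(np.int32)

height = da.map_blocks(lambda r, c: avg_elevation[r, c], row, col,
                       dtype=avg_elevation.dtype)
print(height.compute())  # per-pixel elevations gathered from the DEM grid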
def convert_to_pfts(category_cube, conversion, min_category, max_category): """Convert landcover categories to PFT fractions using a given conversion table. Args: category_cube (iris.cube.Cube): Cube containing the landcover categories. conversion (dict): Conversion factors from categories to PFT fractions. min_category (int): Minimum possible land cover category index (inclusive). max_category (int): Maximum possible land cover category index (inclusive). Returns: iris.cube.CubeList: Cubes containing the PFTs on the same grid as `category_cube`. """ if not category_cube.has_lazy_data(): raise ValueError("Source cube needs to have lazy data.") pft_names = get_mapping_pfts(conversion) array_mapping = get_mapping_arrays(pft_names, conversion) n_pfts = next(iter(array_mapping.values()))["pfts"].size if not all(values["pfts"].size == n_pfts for values in array_mapping.values()): raise ValueError( "All categories need to map on to the same number of PFT fractions." ) # Simple array structure containing the mapping from landcover categories to PFTs in a # way that is easier to accelerate. structured_mapping = np.zeros((max_category - min_category + 1, n_pfts), dtype=np.uint8) for landcover_index in range(min_category, max_category + 1): if landcover_index in array_mapping: structured_mapping[landcover_index] = array_mapping[ landcover_index]["pfts"] else: structured_mapping[landcover_index] = np.zeros(n_pfts, dtype=np.uint8) @parallel_njit def _execute_mapping(category, structured_mapping, n_pfts): """Carry out conversion to PFT fractions.""" pfts = np.zeros((*category.shape, *(n_pfts, ))) for index in np.ndindex(category.shape): pfts[index] = structured_mapping[category[index]] return pfts pft_data = da.map_blocks( _execute_mapping, category_cube.core_data(), structured_mapping=structured_mapping, n_pfts=n_pfts, meta=np.array([], dtype=np.uint8), # We are only adding a dimension with size `n_pfts`. All other chunks remain. chunks=(*category_cube.core_data().chunks, (n_pfts, )), new_axis=category_cube.ndim, dtype=np.uint8, ) cubes = iris.cube.CubeList() for i, pft_name in enumerate(pft_names): pft_cube = category_cube.copy(data=pft_data[..., i]) pft_cube.var_name = None pft_cube.standard_name = None pft_cube.long_name = pft_name pft_cube.units = "1" cubes.append(pft_cube) return cubes
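# The conversion above relies on map_blocks appending a trailing axis via
# ``new_axis`` plus an explicit ``chunks`` specification. A toy sketch of that
# pattern with an illustrative lookup table (not the project's conversion
# data):
import numpy as np
import dask.array as da

# category -> two hypothetical fractions, one row per category index
lookup = np.array([[100, 0], [60, 40], [0, 100]], dtype=np.uint8)

def to_fractions(block, lookup):
    # fancy indexing adds a trailing axis of size lookup.shape[1]
    return lookup[block]

cats = da.random.randint(0, 3, size=(4, 6), chunks=(2, 3))
frac = da.map_blocks(
    to_fractions,
    cats,
    lookup=lookup,
    new_axis=cats.ndim,                        # append the fractions axis
    chunks=(*cats.chunks, (lookup.shape[1],)),
    dtype=np.uint8,
)
assert frac.compute().shape == (4, 6, 2)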
def linear_regression(XL: ArrayLike, XC: ArrayLike, Y: ArrayLike) -> LinearRegressionResult: """Efficient linear regression estimation for multiple covariate sets Parameters ---------- XL [array-like, shape: (M, N)] "Loop" covariates for which N separate regressions will be run XC [array-like, shape: (M, P)] "Core" covariates included in the regressions for each loop covariate. All P core covariates are used in each of the N loop covariate regressions. Y [array-like, shape: (M, O)] Continuous outcomes Returns ------- Dataclass containing: beta : [array-like, shape: (N, O)] Beta values associated with each loop covariate and outcome t_value : [array-like, shape: (N, O)] T statistics for each beta p_value : [array-like, shape: (N, O)] P values as float in [0, 1] """ XL, XC = da.asarray(XL), da.asarray(XC) # Coerce for `lstsq` if set([x.ndim for x in [XL, XC, Y]]) != {2}: raise ValueError("All arguments must be 2D") n_core_covar, n_loop_covar, n_obs, n_outcome = ( XC.shape[1], XL.shape[1], Y.shape[0], Y.shape[1], ) dof = n_obs - n_core_covar - 1 if dof < 1: raise ValueError( "Number of observations (N) too small to calculate sampling statistics. " "N must be greater than number of core covariates (C) plus one. " f"Arguments provided: N={n_obs}, C={n_core_covar}.") # Apply orthogonal projection to eliminate core covariates # Note: QR factorization or SVD should be used here to find # what are effectively OLS residuals rather than matrix inverse # to avoid need for MxM array; additionally, dask.lstsq fails # with numpy arrays XLP = XL - da.dot(XC, da.linalg.lstsq(XC, XL)[0]) assert XLP.shape == (n_obs, n_loop_covar) YP = Y - da.dot(XC, da.linalg.lstsq(XC, Y)[0]) assert YP.shape == (n_obs, n_outcome) # Estimate coefficients for each loop covariate # Note: A key assumption here is that 0-mean residuals # from projection require no extra terms in variance # estimate for loop covariates (columns of G), which is # only true when an intercept is present. XLPS = (XLP**2).sum(axis=0, keepdims=True).T assert XLPS.shape == (n_loop_covar, 1) B = da.dot(XLP.T, YP) / XLPS assert B.shape == (n_loop_covar, n_outcome) # Compute residuals for each loop covariate and outcome separately YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...] assert YR.shape == (n_obs, n_loop_covar, n_outcome) RSS = (YR**2).sum(axis=0) assert RSS.shape == (n_loop_covar, n_outcome) # Get t-statistics for coefficient estimates T = B / np.sqrt(RSS / dof / XLPS) assert T.shape == (n_loop_covar, n_outcome) # Match to p-values # Note: t dist not implemented in Dask so this must be delayed, # see https://github.com/dask/dask/issues/6857 P = da.map_blocks(lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64") assert P.shape == (n_loop_covar, n_outcome) return LinearRegressionResult(beta=B, t_value=T, p_value=P)
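# Because scipy's t distribution is not available as a lazy Dask operation,
# the p-value step above wraps it in map_blocks. A minimal standalone sketch
# of that step with toy t-statistics:
import numpy as np
import dask.array as da
from scipy import stats

dof = 10
T = da.from_array(np.array([[-2.5, 0.0, 2.5]]), chunks=1)
P = da.map_blocks(lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof),
                  T, dtype="float64")
print(P.compute())  # two-sided p-values, roughly [0.031, 1.0, 0.031]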
def map_blocks(func, *args, **kwargs):
    """Dispatch to ``da.map_blocks`` for Dask arrays, else call ``func`` eagerly."""
    array = args[0]
    if isinstance(array, da.Array):
        return da.map_blocks(func, *args, **kwargs)
    else:
        # Note: keyword arguments (e.g. Dask-specific ones such as ``dtype``
        # or ``chunks``) are not forwarded on the eager NumPy path.
        return func(*args)
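# Usage sketch for the wrapper above, assuming the wrapper and
# ``dask.array as da`` are in scope in the same module:
import numpy as np
import dask.array as da

x_np = np.arange(6.0)
x_da = da.arange(6.0, chunks=3)

print(map_blocks(np.sqrt, x_np))            # eager NumPy path
print(map_blocks(np.sqrt, x_da).compute())  # lazy Dask path, same values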
def _ridge_regression_cv( X: Array, Y: Array, alphas: NDArray, n_zero_reg: Optional[int] = None ) -> Tuple[Array, Array, Array, Array]: assert alphas.ndim == 1 assert X.ndim == 2 assert Y.ndim == 2 assert X.numblocks[1] == 1 assert Y.numblocks[1] == 1 assert X.chunks[0] == Y.chunks[0] n_block, n_obs, n_covar, n_outcome, n_alpha = ( X.numblocks[0], X.shape[0], X.shape[1], Y.shape[1], alphas.shape[0], ) obs_chunks = X.chunks[0] # Project samples and outcomes noting that resulting chunks are # of fixed size even if the chunks along the observation dim # are not uniform (i.e. |X.chunks[0]| != 1) XtX = stack(da.map_blocks(lambda x: x.T @ x, X, chunks=(X.shape[1],) * 2)) assert_block_shape(XtX, n_block, 1, 1) assert_chunk_shape(XtX, 1, n_covar, n_covar) XtY = stack(da.map_blocks(lambda x, y: x.T @ y, X, Y, chunks=(n_covar, n_outcome))) assert_block_shape(XtY, n_block, 1, 1) assert_chunk_shape(XtY, 1, n_covar, n_outcome) # Invert the projections in each block so that each # contains data from all other blocks *except* itself XtX = unstack(XtX.sum(axis=0) - XtX) assert_block_shape(XtX, n_block, 1) assert_chunk_shape(XtX, n_covar, n_covar) XtY = unstack(XtY.sum(axis=0) - XtY) assert_block_shape(XtY, n_block, 1) assert_chunk_shape(XtY, n_covar, n_outcome) assert XtX.numblocks == XtY.numblocks # Regress for all outcomes/alphas and add new axis for ridge parameters B = da.map_blocks( ridge_regression, XtX, XtY, chunks=(n_alpha, n_covar, n_outcome), new_axis=[0], alphas=alphas, n_zero_reg=n_zero_reg, meta=da.utils.meta_from_array(XtX), ) assert_block_shape(B, 1, n_block, 1) assert_chunk_shape(B, n_alpha, n_covar, n_outcome) assert_array_shape(B, n_alpha, n_block * n_covar, n_outcome) # Generate predictions for all outcomes/alphas assert B.numblocks == (1,) + X.numblocks YP = da.map_blocks( lambda x, b: x @ b, X, B, chunks=(alphas.size, obs_chunks, n_outcome) ) assert_block_shape(YP, 1, n_block, 1) assert_chunk_shape(YP, n_alpha, obs_chunks[0], n_outcome) assert_array_shape(YP, n_alpha, n_obs, n_outcome) return XtX, XtY, B, YP
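# The ``XtX.sum(axis=0) - XtX`` step above implements a leave-one-block-out
# scheme: each block ends up with the Gram matrix computed from all *other*
# sample blocks. A minimal NumPy sketch of the same identity:
import numpy as np

rng = np.random.default_rng(0)
blocks = [rng.normal(size=(5, 3)) for _ in range(4)]   # 4 sample blocks, 3 covariates

XtX = np.stack([b.T @ b for b in blocks])              # per-block X^T X, shape (4, 3, 3)
XtX_loo = XtX.sum(axis=0) - XtX                        # leave-one-out per block

expected = []
for i in range(4):
    others = np.concatenate([b for j, b in enumerate(blocks) if j != i])
    expected.append(others.T @ others)
assert np.allclose(XtX_loo, np.stack(expected))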
def divergence( ds: Dataset, *, cohort_allele_count: Hashable = variables.cohort_allele_count, merge: bool = True, ) -> Dataset: """Compute divergence between pairs of cohorts. The entry at (i, j) is the divergence between for cohort i and cohort j, except for the case where i and j are the same, in which case the entry is the diversity for cohort i. By default, values of this statistic are calculated per variant. To compute values in windows, call :func:`window` before calling this function. Parameters ---------- ds Genotype call dataset. cohort_allele_count Cohort allele count variable to use or calculate. Defined by :data:`sgkit.variables.cohort_allele_count_spec`. If the variable is not present in ``ds``, it will be computed using :func:`count_cohort_alleles`. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing the divergence value between pairs of cohorts, as defined by :data:`sgkit.variables.stat_divergence_spec`. Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing information is available. Warnings -------- This method does not currently support datasets that are chunked along the samples dimension. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4) >>> # Divide samples into two cohorts >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE array([[[0.5 , 0.5 ], [0.5 , 0.66666667]], <BLANKLINE> [[0.66666667, 0.5 ], [0.5 , 0.5 ]], <BLANKLINE> [[0.66666667, 0.5 ], [0.5 , 0.66666667]], <BLANKLINE> [[0.5 , 0.375 ], [0.375 , 0.5 ]], <BLANKLINE> [[0.5 , 0.625 ], [0.625 , 0.5 ]]]) >>> # Divide into windows of size three (variants) >>> ds = sg.window(ds, size=3) >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE array([[[1.83333333, 1.5 ], [1.5 , 1.83333333]], <BLANKLINE> [[1. , 1. ], [1. , 1. ]]]) """ ds = define_variable_if_absent(ds, variables.cohort_allele_count, cohort_allele_count, count_cohort_alleles) variables.validate( ds, {cohort_allele_count: variables.cohort_allele_count_spec}) ac = ds[cohort_allele_count] n_variants = ds.dims["variants"] n_cohorts = ds.dims["cohorts"] ac = da.asarray(ac) shape = (ac.chunks[0], n_cohorts, n_cohorts) d = da.map_blocks(_divergence, ac, chunks=shape, dtype=np.float64) assert_array_shape(d, n_variants, n_cohorts, n_cohorts) if has_windows(ds): div = window_statistic( d, np.sum, ds.window_start.values, ds.window_stop.values, dtype=d.dtype, axis=0, ) new_ds = create_dataset({ variables.stat_divergence: ( ("windows", "cohorts_0", "cohorts_1"), div, ) }) else: new_ds = create_dataset({ variables.stat_divergence: ( ("variants", "cohorts_0", "cohorts_1"), d, ) }) return conditional_merge_datasets(ds, new_ds, merge)
def _stage_2( YP: Array, X: Array, Y: Array, alphas: Optional[NDArray] = None, normalize: bool = True, _glow_adj_alpha: bool = False, _glow_adj_scaling: bool = False, ) -> Tuple[Array, Array]: """Stage 2 - WGR Meta Regression This stage will train separate ridge regression models for each outcome using the predictions from stage 1 for that same outcome as features. These predictions are then evaluated based on R2 score to determine an optimal "meta" estimator (see `_stage_1` for the "base" estimator description). Results then include only predictions and coefficients from this optimal model. For more details, see the level 1 regression model described in step 1 of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2). """ assert YP.ndim == 4 assert X.ndim == 2 assert Y.ndim == 2 # Check that chunking across samples is the same for all arrays assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0] assert YP.chunks[2] == X.chunks[0] == Y.chunks[0] # Assert single chunks for covariates and outcomes assert X.numblocks[1] == Y.numblocks[1] == 1 # Extract shape statistics n_variant_block, n_alpha_1 = YP.shape[:2] n_sample_block = Y.numblocks[0] n_sample, n_outcome = Y.shape n_covar = X.shape[1] n_indvar = n_covar + n_variant_block * n_alpha_1 sample_chunks = Y.chunks[0] if normalize: assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1) assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome) # See: https://github.com/projectglow/glow/issues/260 if _glow_adj_scaling: YP = da.map_blocks( lambda x: (x - x.mean(axis=2, keepdims=True)) / x.std(axis=2, keepdims=True), YP, ) else: YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True) # Tranpose for refit on level 1 predictions YP = YP.transpose((3, 2, 0, 1)) assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1) if alphas is None: # See: https://github.com/projectglow/glow/issues/255 if _glow_adj_alpha: alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome) else: alphas = get_alphas(n_variant_block * n_alpha_1) n_alpha_2 = alphas.size YR = [] BR = [] for i in range(n_outcome): # Slice and reshape to new 2D covariate matrix; # The order of raveling in trailing dimensions is important # and later reshapes will assume variants, alphas order XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1)) # Prepend covariates and chunk along first dim only XPB = da.concatenate((X, XPB), axis=1) XPB = XPB.rechunk(chunks=(None, -1)) assert_array_shape(XPB, n_sample, n_indvar) assert XPB.numblocks == (n_sample_block, 1) # Extract outcome vector YB = Y[:, [i]] assert XPB.ndim == YB.ndim == 2 # Fit and predict folds for each parameter BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:] assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1) assert_array_shape(YPB, n_alpha_2, n_sample, 1) BR.append(BB) YR.append(YPB) # Concatenate predictions along outcome dimension YR = da.concatenate(YR, axis=2) assert_block_shape(YR, 1, n_sample_block, n_outcome) assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1) assert_array_shape(YR, n_alpha_2, n_sample, n_outcome) # Move samples to last dim so all others are batch # dims for R2 calculations YR = da.transpose(YR, (0, 2, 1)) assert_array_shape(YR, n_alpha_2, n_outcome, n_sample) YR = YR.rechunk((-1, -1, None)) assert_block_shape(YR, 1, 1, n_sample_block) assert YR.shape[1:] == Y.T.shape # Concatenate betas along outcome dimension BR = da.concatenate(BR, axis=2) assert_block_shape(BR, 1, n_sample_block, 
n_outcome) assert_chunk_shape(BR, n_alpha_2, n_indvar, 1) assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome) # Compute R2 scores within each sample block for each outcome + alpha R2 = da.stack( [ r2_score(YR.blocks[..., i], Y.T.blocks[..., i]) # Avoid warnings on R2 calculations for blocks with single rows if YR.chunks[-1][i] > 1 else da.full(YR.shape[:-1], np.nan) for i in range(n_sample_block) ] ) assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome) # Coerce to finite or nan before nan-aware mean R2 = da.where(da.isfinite(R2), R2, np.nan) # Find highest mean alpha score for each outcome across blocks R2M = da.nanmean(R2, axis=0) assert_array_shape(R2M, n_alpha_2, n_outcome) # Identify index for the alpha value with the highest mean score R2I = da.argmax(R2M, axis=0) assert_array_shape(R2I, n_outcome) # Choose the predictions corresponding to the model with best score YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1) YRM = YRM.rechunk((None, -1)) assert_block_shape(YRM, n_sample_block, 1) assert_chunk_shape(YRM, sample_chunks[0], n_outcome) assert_array_shape(YRM, n_sample, n_outcome) # Choose the betas corresponding to the model with the best score BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1) BRM = BRM.rechunk((None, -1)) assert_block_shape(BRM, n_sample_block, 1) assert_chunk_shape(BRM, n_indvar, n_outcome) assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome) return BRM, YRM
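# The model-selection step above picks, for each outcome, the alpha with the
# highest mean R2 and gathers the matching prediction slice. A toy sketch of
# that selection, mirroring the lazy indexing with the 0-d arrays ``R2I[i]``
# used in the code (random data stands in for real scores and predictions):
import numpy as np
import dask.array as da

n_alpha, n_outcome, n_sample = 3, 2, 5
YR = da.random.random((n_alpha, n_outcome, n_sample),
                      chunks=(n_alpha, n_outcome, n_sample))
R2M = da.random.random((n_alpha, n_outcome), chunks=(n_alpha, n_outcome))

R2I = da.argmax(R2M, axis=0)  # best alpha index per outcome
YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1)
print(YRM.compute().shape)    # (n_sample, n_outcome)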
def pbs( ds: Dataset, *, stat_Fst: Hashable = variables.stat_Fst, cohorts: Optional[Sequence[Union[Tuple[int, int, int], Tuple[str, str, str]]]] = None, merge: bool = True, ) -> Dataset: """Compute the population branching statistic (PBS) between cohort triples. By default, values of this statistic are calculated per variant. To compute values in windows, call :func:`window` before calling this function. Parameters ---------- ds Genotype call dataset. stat_Fst Fst variable to use or calculate. Defined by :data:`sgkit.variables.stat_Fst_spec`. If the variable is not present in ``ds``, it will be computed using :func:`Fst`. cohorts The cohort triples to compute statistics for, specified as a sequence of tuples of cohort indexes or IDs. None (the default) means compute statistics for all cohorts. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing the PBS value between cohort triples, as defined by :data:`sgkit.variables.stat_pbs_spec`. Shape (variants, cohorts, cohorts, cohorts), or (windows, cohorts, cohorts, cohorts) if windowing information is available. Warnings -------- This method does not currently support datasets that are chunked along the samples dimension. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=6) >>> # Divide samples into three named cohorts >>> n_cohorts = 3 >>> sample_cohort = np.repeat(range(n_cohorts), ds.dims["samples"] // n_cohorts) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> cohort_names = [f"co_{i}" for i in range(n_cohorts)] >>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names}) >>> # Divide into two windows of size three (variants) >>> ds = sg.window(ds, size=3) >>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE array([ 0. , -0.160898]) """ ds = define_variable_if_absent(ds, variables.stat_Fst, stat_Fst, Fst) variables.validate(ds, {stat_Fst: variables.stat_Fst_spec}) fst = ds[variables.stat_Fst] fst = fst.clip(min=0, max=(1 - np.finfo(float).epsneg)) t = -np.log(1 - fst) n_cohorts = ds.dims["cohorts"] n_windows = ds.dims["windows"] assert_array_shape(t, n_windows, n_cohorts, n_cohorts) # calculate PBS triples t = da.asarray(t) shape = (t.chunks[0], n_cohorts, n_cohorts, n_cohorts) cohorts = cohorts or list(itertools.combinations(range(n_cohorts), 3)) # type: ignore ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts_0", None)) p = da.map_blocks(lambda t: _pbs_cohorts(t, ct), t, chunks=shape, new_axis=3, dtype=np.float64) assert_array_shape(p, n_windows, n_cohorts, n_cohorts, n_cohorts) new_ds = create_dataset({ variables.stat_pbs: (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p) }) return conditional_merge_datasets(ds, new_ds, merge)
def hardy_weinberg_test(ds: Dataset, *, genotype_counts: Optional[Hashable] = None, ploidy: Optional[int] = None, alleles: Optional[int] = None, merge: bool = True) -> Dataset: """Exact test for HWE as described in Wigginton et al. 2005 [1]. Parameters ---------- ds Dataset containing genotype calls or precomputed genotype counts. genotype_counts Name of variable containing precomputed genotype counts, by default None. If not provided, these counts will be computed automatically from genotype calls. If present, must correspond to an (`N`, 3) array where `N` is equal to the number of variants and the 3 columns contain heterozygous, homozygous reference, and homozygous alternate counts (in that order) across all samples for a variant. ploidy Genotype ploidy, defaults to ``ploidy`` dimension of provided dataset. If the `ploidy` dimension is not present, then this value must be set explicitly. Currently HWE calculations are only supported for diploid datasets, i.e. ``ploidy`` must equal 2. alleles Genotype allele count, defaults to ``alleles`` dimension of provided dataset. If the `alleles` dimension is not present, then this value must be set explicitly. Currently HWE calculations are only supported for biallelic datasets, i.e. ``alleles`` must equal 2. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Warnings -------- This function is only applicable to diploid, biallelic datasets. Returns ------- Dataset containing (N = num variants): variant_hwe_p_value : [array-like, shape: (N, O)] P values from HWE test for each variant as float in [0, 1]. References ---------- - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005. “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of Human Genetics 76 (5): 887–93. Raises ------ NotImplementedError If ploidy of provided dataset != 2 NotImplementedError If maximum number of alleles in provided dataset != 2 """ ploidy = ploidy or ds.dims.get("ploidy") if not ploidy: raise ValueError( "`ploidy` parameter must be set when not present as dataset dimension." ) if ploidy != 2: raise NotImplementedError( "HWE test only implemented for diploid genotypes") alleles = alleles or ds.dims.get("alleles") if not alleles: raise ValueError( "`alleles` parameter must be set when not present as dataset dimension." ) if alleles != 2: raise NotImplementedError( "HWE test only implemented for biallelic genotypes") # Use precomputed genotype counts if provided if genotype_counts is not None: variables.validate(ds, {genotype_counts: variables.genotype_counts_spec}) obs = list(da.asarray(ds[genotype_counts]).T) # Otherwise compute genotype counts from calls else: ds = count_genotypes(ds, dim="samples") obs = [ da.asarray(ds[v]) for v in ["variant_n_het", "variant_n_hom_ref", "variant_n_hom_alt"] ] p = da.map_blocks(hardy_weinberg_p_value_vec_jit, *obs) new_ds = create_dataset({variables.variant_hwe_p_value: ("variants", p)}) return conditional_merge_datasets(ds, new_ds, merge)
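# Usage sketch, assuming this function is exposed as sgkit.hardy_weinberg_test
# (as the sgkit-style examples earlier in this document suggest):
import sgkit as sg

ds = sg.simulate_genotype_call_dataset(n_variant=10, n_sample=50, seed=0)
hwe = sg.hardy_weinberg_test(ds)
print(hwe["variant_hwe_p_value"].values)  # one p-value per variant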