Example #1
    def func(a, axes=None):
        if axes is None:
            axes = tuple(range(a.ndim))
        elif len(set(axes)) < len(axes):
            raise ValueError("Duplicate axes are not allowed.")

        if len(axes) > 3:
            raise ValueError("Detrending is only supported up to "
                             "3 dimensions.")

        for each_axis in axes:
            if len(a.chunks[each_axis]) != 1:
                raise ValueError('The axis along which the detrending is done '
                                 'cannot be chunked.')

        if len(axes) == 1:
            return dsar.map_blocks(sps.detrend, a, axis=axes[0],
                                   chunks=a.chunks, dtype=a.dtype
                                  )
        else:
            for each_axis in range(a.ndim):
                if each_axis not in axes:
                    if len(a.chunks[each_axis]) != a.shape[each_axis]:
                        raise ValueError("The axes other than ones to detrend "
                                        "over should have a chunk length of 1.")
            return dsar.map_blocks(detrend_func, a, axes,
                                   chunks=a.chunks, dtype=a.dtype
                                  )
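
A minimal usage sketch for the wrapper above, assuming `dsar` aliases dask.array and `sps` aliases scipy.signal (as the snippet's names suggest); only the single-axis branch is exercised here:

import numpy as np
import dask.array as dsar   # alias assumed from the snippet
import scipy.signal as sps  # alias assumed from the snippet

# chunk only along axis 0; the detrended axis (1) stays in a single chunk
a = dsar.from_array(np.random.rand(8, 1024), chunks=(2, 1024))
detrended = func(a, axes=(1,))   # lazily detrends each block along axis 1
result = detrended.compute()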
Example #2
    def align(self, target=None, weighted=False, procrustes=False,
              error=0.0001, maxcyc=10):
        '''
        Aligns the frames in a trajectory to some reference structure, with
        optional mass-weighting.

        Arguments:
            target:
            If given, a reference structure to fit to, as a [N,3] numpy array.

            weighted:
            If specified, mass-weighted fitting is done.

            procrustes:
            If specified, Procrustes iterative fitting is done to convergence.

            error:
            Defines the target error for the procrustes fit.

            maxcyc:
            Defines the maximum number of iterations for the procrustes method.
        '''

        self.reset()
        if target is None:
            targ = self.x[0]
        else:
            targ = da.from_array(target, chunks = CHUNKS)

        if weighted:
            weights = self.masses
        else:
            weights = np.ones_like(self.masses)
        weights = da.from_array(np.stack([weights,] * 3).T, chunks=CHUNKS)

        self.x = da.map_blocks(fastfitting.fitted_traj, self.x, targ, weights)
        if not procrustes:
            return

        avg = self.x.mean(axis=0)
        err = avg - targ
        err = (err*err).mean().compute()
        cycle = 1
        while err > error and cycle < maxcyc:
            target = avg
            self.reset()
            self.x = da.map_blocks(fastfitting.fitted_traj, self.x, target, weights)
            avg = self.x.mean(axis=0).compute()
            avg = da.from_array(avg, chunks=CHUNKS)
            err = avg - target
            err = (err*err).mean().compute()
            cycle += 1
        print('Procrustes converged in {} cycles with error {}'.format(cycle, err))
Example #3
def lazy_elementwise(lazy_array, elementwise_op):
    """
    Apply a (numpy-style) elementwise array operation to a lazy array.

    Elementwise means that it performs an independent calculation at each point
    of the input, producing a result array of the same shape.

    Args:

    * lazy_array:
        The lazy array object to operate on.
    * elementwise_op:
        The elementwise operation, a function operating on numpy arrays.

    .. note::

        A single-point "dummy" call is made to the operation function, to
        determine dtype of the result.
        This return dtype must be stable in actual operation (!)

    """
    # This is just a wrapper to provide an Iris-specific abstraction for a
    # lazy operation in Dask (map_blocks).

    # Explicitly determine the return type with a dummy call.
    # This makes good practical sense for unit conversions, as a Unit.convert
    # call may cast to float, or not, depending on unit equality: thus, it's
    # much safer to get udunits to decide that for us.
    dtype = elementwise_op(np.zeros(1, lazy_array.dtype)).dtype

    return da.map_blocks(elementwise_op, lazy_array, dtype=dtype)
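
A small usage sketch of `lazy_elementwise`, assuming only numpy and dask.array:

import numpy as np
import dask.array as da

lazy = da.from_array(np.arange(10, dtype=np.int64), chunks=4)
# np.sqrt promotes int64 to float64; the dummy call above detects that
result = lazy_elementwise(lazy, np.sqrt)
print(result.dtype)       # float64
print(result.compute())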
Example #4
def _get_date_field(values, name, dtype):
    """Indirectly access pandas' libts.get_date_field by wrapping data
    as a Series and calling through `.dt` attribute.

    Parameters
    ----------
    values : np.ndarray or dask.array-like
        Array-like container of datetime-like values
    name : str
        Name of datetime field to access
    dtype : dtype-like
        dtype for output date field values

    Returns
    -------
    datetime_fields : same type as values
        Array-like of datetime fields accessed for each element in values

    """
    if isinstance(values, dask_array_type):
        from dask.array import map_blocks
        return map_blocks(_access_through_series,
                          values, name, dtype=dtype)
    else:
        return _access_through_series(values, name)
Example #5
    def _method(self, method_name, chunks=None, drop_axis=None, **kwargs):
        if chunks is None:
            # no shape change
            chunks = self.chunks

        if self.mask is None:
            # simple case, no mask
            def f(block):
                g = _ndarray.GenotypeArray(block)
                method = getattr(g, method_name)
                return method(**kwargs)
            out = self.map_blocks(f, chunks=chunks, drop_axis=drop_axis)

        else:
            # map with mask
            def f(block, bmask):
                g = _ndarray.GenotypeArray(block)
                g.mask = bmask[:, :, 0]
                method = getattr(g, method_name)
                return method(**kwargs)
            m = self.mask[:, :, None]
            out = da.map_blocks(f, self, m, chunks=chunks,
                                drop_axis=drop_axis)

        return out
Example #6
    def from_packed(packed, chunks=None):
        def f(block):
            return _ndarray.GenotypeArray.from_packed(block)
        packed = ensure_dask_array(packed, chunks)
        chunks = (packed.chunks[0], packed.chunks[1], (2,))
        out = da.map_blocks(f, packed, chunks=chunks, new_axis=2)
        return view_subclass(out, GenotypeDaskArray)
Example #7
    def interpolate_angles(self, angles, resolution):
        # FIXME: interpolate in cartesian coordinates if the lons or lats are
        # problematic
        from geotiepoints.multilinear import MultilinearInterpolator

        geocoding = self.root.find('.//Tile_Geocoding')
        rows = int(geocoding.find('Size[@resolution="' + str(resolution) + '"]/NROWS').text)
        cols = int(geocoding.find('Size[@resolution="' + str(resolution) + '"]/NCOLS').text)

        smin = [0, 0]
        smax = np.array(angles.shape) - 1
        orders = angles.shape
        minterp = MultilinearInterpolator(smin, smax, orders)
        minterp.set_values(da.atleast_2d(angles.ravel()))

        def _do_interp(minterp, xcoord, ycoord):
            interp_points2 = np.vstack((xcoord.ravel(),
                                        ycoord.ravel()))
            res = minterp(interp_points2)
            return res.reshape(xcoord.shape)

        x = da.arange(rows, dtype=angles.dtype, chunks=CHUNK_SIZE) / (rows-1) * (angles.shape[0] - 1)
        y = da.arange(cols, dtype=angles.dtype, chunks=CHUNK_SIZE) / (cols-1) * (angles.shape[1] - 1)
        xcoord, ycoord = da.meshgrid(x, y)
        return da.map_blocks(_do_interp, minterp, xcoord, ycoord, dtype=angles.dtype,
                             chunks=xcoord.chunks)
Example #8
def char_to_bytes(arr):
    """Convert numpy/dask arrays from characters to fixed width bytes."""
    if arr.dtype != 'S1':
        raise ValueError("argument must have dtype='S1'")

    if not arr.ndim:
        # no dimension to concatenate along
        return arr

    size = arr.shape[-1]

    if not size:
        # can't make an S0 dtype
        return np.zeros(arr.shape[:-1], dtype=np.string_)

    if isinstance(arr, dask_array_type):
        import dask.array as da

        if len(arr.chunks[-1]) > 1:
            raise ValueError('cannot stack a dask character array with '
                             'multiple chunks in the last dimension: {}'
                             .format(arr))

        dtype = np.dtype('S' + str(arr.shape[-1]))
        return da.map_blocks(_numpy_char_to_bytes, arr,
                             dtype=dtype,
                             chunks=arr.chunks[:-1],
                             drop_axis=[arr.ndim - 1])
    else:
        return StackedBytesArray(arr)
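
For context, a hypothetical block function with the behaviour `_numpy_char_to_bytes` needs here (collapse the trailing 'S1' axis into fixed-width bytes); this is a sketch, not the library's actual implementation:

import numpy as np

def _char_to_bytes_block(block):
    # view an (..., n) array of 'S1' characters as a (...) array of 'S<n>' bytes
    block = np.ascontiguousarray(block)
    width = block.shape[-1]
    return block.view('S' + str(width)).reshape(block.shape[:-1])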
Example #9
def gufunc_idxmin(x, y, axis=None):
    import dask.array as da
    indx = x.argmin(axis=axis)
    func = functools.partial(_index_from_1d_array, y)

    if isinstance(x, da.Array):
        return da.map_blocks(func, indx, dtype=indx.dtype)
    else:
        return func(indx)
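
The helper `_index_from_1d_array` is not shown; a plausible (hypothetical) sketch of it, together with a small usage of `gufunc_idxmin`:

import numpy as np
import dask.array as da

def _index_from_1d_array(array, indices):
    # hypothetical: pick coordinate values at the argmin positions
    return array[indices]

x = da.from_array(np.array([[3., 1., 2.], [0., 5., 4.]]), chunks=(1, 3))
coord = np.array([10, 20, 30])
print(gufunc_idxmin(x, coord, axis=1).compute())   # [20 10]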
Example #10
    def _get_solar_flux(self, band):
        """Get the solar flux for the band."""
        solar_flux = self.cal['solar_flux'].isel(bands=band).values
        d_index = self.cal['detector_index'].fillna(0).astype(int)

        def get_items(idx, solar_flux):
            return solar_flux[idx]

        return da.map_blocks(get_items, d_index.data, solar_flux=solar_flux,
                             dtype=solar_flux.dtype)
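
The pattern above relies on map_blocks forwarding non-dask keyword arguments (here `solar_flux`) unchanged to every block; a generic sketch of the same idea:

import numpy as np
import dask.array as da

lut = np.array([10., 20., 30.])                        # plain numpy lookup table
idx = da.from_array(np.array([0, 2, 1, 1]), chunks=2)

def get_items(block, table=None):
    return table[block]

vals = da.map_blocks(get_items, idx, table=lut, dtype=lut.dtype)
print(vals.compute())   # [10. 30. 20. 20.]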
Example #11
def interp_func(var, x, new_x, method, kwargs):
    """
    multi-dimensional interpolation for array-like. Interpolated axes should be
    located in the last position.

    Parameters
    ----------
    var: np.ndarray or dask.array.Array
        Array to be interpolated. The final dimension is interpolated.
    x: a list of 1d arrays
        Original coordinates. Should not contain NaN.
    new_x: a list of 1d arrays
        New coordinates. Should not contain NaN.
    method: string
        {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'} for
        1-dimensional interpolation.
        {'linear', 'nearest'} for multidimensional interpolation
    **kwargs:
        Optional keyword arguments to be passed to scipy.interpolator

    Returns
    -------
    interpolated: array
        Interpolated array

    Note
    ----
    This requires scipy to be installed.

    See Also
    --------
    scipy.interpolate.interp1d
    """
    if not x:
        return var.copy()

    if len(x) == 1:
        func, kwargs = _get_interpolator(method, vectorizeable_only=True,
                                         **kwargs)
    else:
        func, kwargs = _get_interpolator_nd(method, **kwargs)

    if isinstance(var, dask_array_type):
        import dask.array as da

        _assert_single_chunk(var, range(var.ndim - len(x), var.ndim))
        chunks = var.chunks[:-len(x)] + new_x[0].shape
        drop_axis = range(var.ndim - len(x), var.ndim)
        new_axis = range(var.ndim - len(x), var.ndim - len(x) + new_x[0].ndim)
        return da.map_blocks(_interpnd, var, x, new_x, func, kwargs,
                             dtype=var.dtype, chunks=chunks,
                             new_axis=new_axis, drop_axis=drop_axis)

    return _interpnd(var, x, new_x, func, kwargs)
Example #12
    def map_alleles(self, mapping):

        def f(block, bmapping):
            ac = _ndarray.AlleleCountsArray(block)
            return ac.map_alleles(bmapping)

        # obtain dask array
        mapping = da.from_array(mapping, chunks=(self.chunks[0], None))

        # map blocks
        out = da.map_blocks(f, self, mapping, chunks=self.chunks)
        return view_subclass(out, AlleleCountsDaskArray)
Example #13
def bytes_to_char(arr):
    """Convert numpy/dask arrays from fixed width bytes to characters."""
    if arr.dtype.kind != 'S':
        raise ValueError('argument must have a fixed-width bytes dtype')

    if isinstance(arr, dask_array_type):
        import dask.array as da
        return da.map_blocks(_numpy_bytes_to_char, arr,
                             dtype='S1',
                             chunks=arr.chunks + ((arr.dtype.itemsize,)),
                             new_axis=[arr.ndim])
    else:
        return _numpy_bytes_to_char(arr)
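
A hypothetical sketch of a block function matching what `_numpy_bytes_to_char` must do here (expand fixed-width bytes into a trailing 'S1' axis, consistent with the `new_axis` and `chunks` arguments above); not the library's actual code:

import numpy as np

def _bytes_to_char_block(block):
    # view 'S<n>' bytes as an extra trailing axis of n single characters
    block = np.ascontiguousarray(block)
    return block.reshape(block.shape + (1,)).view('S1')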
Example #14
    def map_alleles(self, mapping, **kwargs):

        def f(block, bmapping):
            h = _ndarray.HaplotypeArray(block)
            return h.map_alleles(bmapping)

        # obtain dask array
        mapping = da.from_array(mapping, chunks=(self.chunks[0], None))

        # map blocks
        out = da.map_blocks(f, self, mapping,
                            chunks=self.chunks)
        return view_subclass(out, HaplotypeDaskArray)
Example #15
    def map_alleles(self, mapping, **kwargs):

        def f(block, bmapping):
            g = _ndarray.GenotypeArray(block)
            m = bmapping[:, 0, :]
            return g.map_alleles(m)

        # obtain dask array
        mapping = da.from_array(mapping, chunks=(self.chunks[0], None))

        # map blocks
        out = da.map_blocks(f, self, mapping[:, None, :],
                            chunks=self.chunks)
        return view_subclass(out, GenotypeDaskArray)
Example #16
def HaloVelocityDispersion(mass, cosmo, redshift, mdef='vir'):
    """ Compute the velocity dispersion of halo from Mass.

        This is a simple model suggested by Martin White.

        See http://adsabs.harvard.edu/abs/2008ApJ...672..122E
    """

    mass, redshift = da.broadcast_arrays(mass, redshift)
    def compute_vdisp(mass, redshift):
        h = cosmo.efunc(redshift)
        return 1100. * (h * mass / 1e15) ** 0.33333

    return da.map_blocks(compute_vdisp, mass, redshift, dtype=mass.dtype)
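
A usage sketch with a stand-in cosmology object; only the `efunc(z)` interface used above is assumed:

import numpy as np
import dask.array as da

class _FlatCosmo:
    # hypothetical stand-in exposing the efunc(z) interface used above
    def efunc(self, z):
        return np.sqrt(0.3 * (1 + z) ** 3 + 0.7)

masses = da.from_array(np.logspace(12, 15, 1000), chunks=100)   # Msun/h
vdisp = HaloVelocityDispersion(masses, _FlatCosmo(), redshift=0.5)
print(vdisp[:5].compute())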
Example #17
def HaloConcentration(mass, cosmo, redshift, mdef='vir'):
    """
    Return halo concentration from halo mass, based on the analytic fitting
    formulas presented in
    `Dutton and Maccio 2014 <https://arxiv.org/abs/1402.7073>`_.

    .. note::
        The units of the input mass are assumed to be :math:`M_{\odot}/h`

    Parameters
    ----------
    mass : array_like
        either a numpy or dask array specifying the halo mass; units
        assumed to be :math:`M_{\odot}/h`
    cosmo : :class:`~nbodykit.cosmology.cosmology.Cosmology`
        the cosmology instance used in the analytic formula
    redshift : float
        compute the c(M) relation at this redshift
    mdef : str, optional
        string specifying the halo mass definition to use; should be
        'vir' or 'XXXc' or 'XXXm' where 'XXX' is an int specifying the
        overdensity

    Returns
    -------
    concen : :class:`dask.array.Array`
        a dask array holding the analytic concentration values

    References
    ----------
    Dutton and Maccio, "Cold dark matter haloes in the Planck era: evolution
    of structural parameters for Einasto and NFW profiles", 2014, arxiv:1402.7073

    """
    from halotools.empirical_models import NFWProfile

    mass, redshift = da.broadcast_arrays(mass, redshift)

    kws = {'cosmology':cosmo.to_astropy(), 'conc_mass_model':'dutton_maccio14', 'mdef':mdef}

    def get_nfw_conc(mass, redshift):
        kw1 = {}
        kw1.update(kws)
        kw1['redshift'] = redshift
        model = NFWProfile(**kw1)
        return model.conc_NFWmodel(prim_haloprop=mass)

    return da.map_blocks(get_nfw_conc, mass, redshift, dtype=mass.dtype)
Example #18
def HaloRadius(mass, cosmo, redshift, mdef='vir'):
    r"""
    Return proper halo radius from halo mass, based on the specified mass definition.
    This is independent of halo profile, and simply returns

    .. math::

        R = \left [ 3 M /(4\pi\Delta) \right]^{1/3}

    where :math:`\Delta` is the density threshold, which depends on cosmology,
    redshift, and mass definition

    .. note::
        The units of the input mass are assumed to be :math:`M_{\odot}/h`

    Parameters
    ----------
    mass : array_like
        either a numpy or dask array specifying the halo mass; units
        assumed to be :math:`M_{\odot}/h`
    cosmo : :class:`~nbodykit.cosmology.cosmology.Cosmology`
        the cosmology instance
    redshift : float
        compute the density threshold which determines the R(M) relation
        at this redshift
    mdef : str, optional
        string specifying the halo mass definition to use; should be
        'vir' or 'XXXc' or 'XXXm' where 'XXX' is an int specifying the
        overdensity

    Returns
    -------
    radius : :class:`dask.array.Array`
        a dask array holding the halo radius in 'physical Mpc/h [sic]'.
        This is proper Mpc/h, to convert to comoving, divide this by scaling factor.

    """
    from halotools.empirical_models import halo_mass_to_halo_radius

    mass, redshift = da.broadcast_arrays(mass, redshift)

    kws = {'cosmology':cosmo.to_astropy(), 'mdef':mdef}

    def mass_to_radius(mass, redshift):
        return halo_mass_to_halo_radius(mass=mass, redshift=redshift, **kws)

    return da.map_blocks(mass_to_radius, mass, redshift, dtype=mass.dtype)
Example #19
def interpolate_xarray_linear(xpoints, ypoints, values, shape):
    """Interpolate linearly, generating a dask array."""
    from scipy.interpolate.interpnd import (LinearNDInterpolator,
                                            _ndim_coords_from_arrays)
    points = _ndim_coords_from_arrays(np.vstack((np.asarray(ypoints),
                                                 np.asarray(xpoints))).T)

    interpolator = LinearNDInterpolator(points, values)

    def intp(grid_x, grid_y, interpolator):
        return interpolator((grid_y, grid_x))

    grid_x, grid_y = da.meshgrid(da.arange(shape[1], chunks=CHUNK_SIZE),
                                 da.arange(shape[0], chunks=CHUNK_SIZE))
    # workaround for non-thread-safe first call of the interpolator:
    interpolator((0, 0))
    res = da.map_blocks(intp, grid_x, grid_y, interpolator=interpolator)

    return DataArray(res, dims=('y', 'x'))
Example #20
    def precompute(self, mask=None, cache_dir=False, swath_usage=0,
                   **kwargs):
        """Generate row and column arrays and store it for later use.

        :param swath_usage: minimum ratio of number of input pixels to
                            number of pixels used in output

        Note: The `mask` keyword should be provided if geolocation may be
              valid where data points are invalid. This defaults to the
              `mask` attribute of the `data` numpy masked array passed to
              the `resample` method.

        """

        del kwargs
        source_geo_def = self.source_geo_def
        target_geo_def = self.target_geo_def

        if cache_dir:
            LOG.warning("'cache_dir' is not used by EWA resampling")

        # SatPy/PyResample don't support dynamic grids out of the box yet
        lons, lats = source_geo_def.get_lonlats()
        # we are remapping to a static unchanging grid/area with all of
        # its parameters specified
        chunks = (2,) + lons.chunks
        res = da.map_blocks(self._call_ll2cr, lons.data, lats.data,
                            target_geo_def, swath_usage,
                            dtype=lons.dtype, chunks=chunks, new_axis=[0])
        cols = res[0]
        rows = res[1]

        # save the dask arrays in the class instance cache
        # the on-disk cache will store the numpy arrays
        self.cache = {
            "rows": rows,
            "cols": cols,
        }

        return None
Example #21
    def count_alleles(self, max_allele=None, subpop=None):

        # if max_allele not specified, count all alleles
        if max_allele is None:
            max_allele = self.max().compute()[()]

        # deal with subpop
        if subpop:
            gd = self.take(subpop, axis=1)
        else:
            gd = self

        # determine output chunks - preserve axis0; change axis1, axis2
        chunks = (gd.chunks[0], (1,)*len(gd.chunks[1]), (max_allele+1,))

        if self.mask is None:

            # simple case, no mask
            def f(block):
                gb = _ndarray.GenotypeArray(block)
                return gb.count_alleles(max_allele=max_allele)[:, None, :]

            # map blocks and reduce
            out = gd.map_blocks(f, chunks=chunks).sum(axis=1)

        else:

            # map with mask
            def f(block, bmask):
                g = _ndarray.GenotypeArray(block)
                g.mask = bmask[:, :, 0]
                return g.count_alleles(max_allele=max_allele)[:, None, :]

            md = self.mask[:, :, None]
            out = da.map_blocks(f, gd, md, chunks=chunks).sum(axis=1)

        return view_subclass(out, AlleleCountsDaskArray)
Example #22
def dask_rolling_wrapper(moving_func, a, window, min_count=None, axis=-1):
    """Wrapper to apply bottleneck moving window funcs on dask arrays"""
    import dask.array as da

    dtype, fill_value = dtypes.maybe_promote(a.dtype)
    a = a.astype(dtype)
    # inputs for overlap
    if axis < 0:
        axis = a.ndim + axis
    depth = {d: 0 for d in range(a.ndim)}
    depth[axis] = (window + 1) // 2
    boundary = {d: fill_value for d in range(a.ndim)}
    # Create overlap array.
    ag = da.overlap.overlap(a, depth=depth, boundary=boundary)
    # apply rolling func
    out = da.map_blocks(moving_func,
                        ag,
                        window,
                        min_count=min_count,
                        axis=axis,
                        dtype=a.dtype)
    # trim array
    result = da.overlap.trim_internal(out, depth)
    return result
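
A usage sketch, assuming bottleneck is installed and that `dtypes.maybe_promote` (an xarray internal used by the wrapper) is importable in the wrapper's module:

import numpy as np
import dask.array as da
import bottleneck as bn

a = da.from_array(np.arange(20, dtype=float), chunks=5)
# moving mean with window 3, evaluated chunk-wise via overlapping ghost cells
rolled = dask_rolling_wrapper(bn.move_mean, a, window=3, min_count=1)
print(rolled.compute())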
Example #23
    def to_dask(self, pad_values=None):
        num_channels = self.bounding_shape["c"]
        if self.needs_padding:
            if pad_values is None:
                raise ValueError(
                    "Data must be padded but no pad values were supplied!")
            elif len(pad_values) != num_channels:
                raise ValueError(
                    f"Length of pad values {pad_values} does not match the length of the channel axis ({num_channels})"
                )

        chunks = (
            (1, ) * self.bounding_shape["z"],
            *[self.bounding_shape[k] for k in ("c", "y", "x")],
        )
        darr = da.map_blocks(
            _chunked_fibsem_loader,
            self.filenames,
            self.axes["c"],
            pad_values,
            chunks=chunks,
            dtype=self.dtypes[0],
        )
        return darr
Example #24
    def inverse_transform(self, y: Union[ArrayLike, SeriesType]):
        check_is_fitted(self, "classes_")
        y = self._check_array(y)

        if isinstance(y, da.Array):
            if getattr(self, "dtype_", None):
                # -> Series[category]
                if self.dtype_ is not None:
                    result = (dd.from_dask_array(y).astype(
                        "category").cat.set_categories(
                            np.arange(len(
                                self.classes_))).cat.rename_categories(
                                    self.dtype_.categories))
                if self.dtype_.ordered:
                    result = result.cat.as_ordered()
                return result
            else:
                return da.map_blocks(
                    getitem,
                    self.classes_,
                    y,
                    dtype=self.classes_.dtype,
                    chunks=y.chunks,
                )
        else:
            y = np.asarray(y)
            if getattr(self, "dtype_", None):
                if self.dtype_ is not None:
                    return pd.Series(
                        pd.Categorical.from_codes(
                            y,
                            categories=self.dtype_.categories,
                            ordered=self.dtype_.ordered,
                        ))
            else:
                return self.classes_[y]
Example #25
def _round_field(values, name, freq):
    """Indirectly access pandas rounding functions by wrapping data
    as a Series and calling through `.dt` attribute.

    Parameters
    ----------
    values : np.ndarray or dask.array-like
        Array-like container of datetime-like values
    name : str (ceil, floor, round)
        Name of rounding function
    freq : a freq string indicating the rounding resolution

    Returns
    -------
    rounded timestamps : same type as values
        Array-like of datetime fields accessed for each element in values

    """
    if isinstance(values, dask_array_type):
        from dask.array import map_blocks
        return map_blocks(_round_series,
                          values, name, freq=freq, dtype=np.datetime64)
    else:
        return _round_series(values, name, freq)
Example #26
def gap_fill(x: xr.DataArray, fallback: xr.DataArray, nodata=None, attrs=None):
    """Fill missing values in `x` with values from `fallback`.

    x,fallback are expected to be xarray.DataArray with identical shape and dtype.

        out[pix] = x[pix] if x[pix] != x.nodata else fallback[pix]
    """

    if nodata is None:
        nodata = getattr(x, "nodata", None)

    if nodata is None:
        nodata = default_nodata(x.dtype)
    else:
        nodata = x.dtype.type(nodata)

    if attrs is None:
        attrs = x.attrs.copy()

    if dask.is_dask_collection(x):
        data = da.map_blocks(
            _gap_fill_np,
            x.data,
            fallback.data,
            nodata,
            name=randomize("gap_fill"),
            dtype=x.dtype,
        )
    else:
        data = _gap_fill_np(x.data, fallback.data, nodata)

    return xr.DataArray(data,
                        attrs=attrs,
                        dims=x.dims,
                        coords=x.coords,
                        name=x.name)
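
`_gap_fill_np` is not shown; a minimal numpy sketch of the fallback rule stated in the docstring (hypothetical, and ignoring NaN nodata, which would need np.isnan rather than an equality test):

import numpy as np

def _gap_fill_np(x, fallback, nodata):
    # out[pix] = x[pix] if x[pix] != nodata else fallback[pix]
    return np.where(x == nodata, fallback, x)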
Example #27
    def precompute(self, cache_dir=None, swath_usage=0, **kwargs):
        """Generate row and column arrays and store it for later use."""
        if kwargs.get('mask') is not None:
            LOG.warning("'mask' parameter has no affect during EWA "
                        "resampling")

        del kwargs
        source_geo_def = self.source_geo_def
        target_geo_def = self.target_geo_def

        if cache_dir:
            LOG.warning("'cache_dir' is not used by EWA resampling")

        # SatPy/PyResample don't support dynamic grids out of the box yet
        lons, lats = source_geo_def.get_lonlats()
        if isinstance(lons, xr.DataArray):
            # get dask arrays
            lons = lons.data
            lats = lats.data
        # we are remapping to a static unchanging grid/area with all of
        # its parameters specified
        chunks = (2,) + lons.chunks
        res = da.map_blocks(self._call_ll2cr, lons, lats,
                            target_geo_def, swath_usage,
                            dtype=lons.dtype, chunks=chunks, new_axis=[0])
        cols = res[0]
        rows = res[1]

        # save the dask arrays in the class instance cache
        # the on-disk cache will store the numpy arrays
        self.cache = {
            "rows": rows,
            "cols": cols,
        }

        return None
Example #28
def get_slp(daskArray, omp_threads=1):
    t = fetch_variable(daskArray, "T")
    p = fetch_variable(daskArray, "P")
    pb = fetch_variable(daskArray, "PB")
    qvapor = fetch_variable(daskArray, "QVAPOR", include_meta=True)
    ph = fetch_variable(daskArray, "PH")
    phb = fetch_variable(daskArray, "PHB")
    dtype = p.dtype

    full_t = map_blocks(wrapped_add, t, Constants.T_BASE, dtype=dtype)
    full_p = map_blocks(wrapped_add, p, pb, dtype=dtype)
    qvapor = qvapor.where(qvapor >= 0, 0)

    del (t)
    del (p)
    del (pb)

    pre_full_ph = map_blocks(wrapped_add, ph, phb, dtype=dtype)
    full_ph = map_blocks(wrapped_div, pre_full_ph, Constants.G, dtype=dtype)
    destag_ph = wrapped_destagger(full_ph, -3)

    del (full_ph)
    del (ph)
    del (phb)

    tk = map_blocks(tk_wrap, full_p, full_t, omp_threads, dtype=dtype)
    slp = map_blocks(slp_wrap,
                     destag_ph,
                     tk,
                     full_p,
                     qvapor.data,
                     omp_threads,
                     dtype=dtype)
    slp_calc = slp

    return slp_calc
Example #29
import dask.array as da
from dask.distributed import Client
import numpy as np
client = Client(scheduler_file='/home/mm/scheduler.json')
client

ntime = 200
npool = 9
npatch = 10
nland = 1000
chunk_shape = (ntime, npool, npool, npatch, 1)
narr = np.ones(chunk_shape)
# print(narr)
arr1 = da.from_array(narr)
# print(arr1)
garr1 = da.stack([arr1 for i in range(nland)], axis=4)
# print(garr1)

def myfunc(chunk1, chunk2):
    return chunk1 + chunk2

res = da.map_blocks(
    myfunc,
    garr1,
    garr1,
)
res

# %time res.compute()
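
For a purely elementwise function like myfunc, the same lazy result can also be written with plain array arithmetic; a quick comparison sketch:

res_direct = garr1 + garr1               # elementwise, no map_blocks needed
print(res_direct.chunks == res.chunks)   # True: map_blocks kept the input chunking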


Example #30
def local_affine_to_position_field(shape,
                                   spacing,
                                   local_affines,
                                   output,
                                   blocksize=[
                                       256,
                                   ] * 3):
    """
    """

    with distributed.distributedState() as ds:

        # get number of jobs needed
        block_grid = np.ceil(np.array(shape) / blocksize).astype(int)
        nblocks = np.prod(block_grid)

        # set up the cluster
        ds.initializeLSFCluster(
            job_extra=["-P multifish"],
            cores=4,
            memory="64GB",
            ncpus=4,
            threads_per_worker=8,
            mem=64000,
        )
        ds.initializeClient()
        ds.scaleCluster(njobs=nblocks)

        # augment the blocksize by the fixed overlap size
        pads = [2 * int(round(x / 8)) for x in blocksize]
        blocksize_with_overlap = np.array(blocksize) + pads

        # get a grid used for each affine
        grid = position_grid_dask(blocksize_with_overlap,
                                  list(blocksize_with_overlap))
        grid = grid * spacing.astype(np.float32)

        # wrap local_affines as dask array
        local_affines_da = da.from_array(local_affines, chunks=(1, 1, 1, 3, 4))

        # compute affine transforms as position coordinates, lazy dask arrays
        coords = da.map_blocks(
            affine_to_grid_dask,
            local_affines_da,
            grid=grid,
            displacement=True,
            new_axis=[5, 6],
            chunks=(
                1,
                1,
                1,
            ) + tuple(grid.shape),
            dtype=np.float32,
        )

        # stitch affine position fields
        coords = stitch.stitch_fields(coords, blocksize)

        # crop to original shape
        coords = coords[:shape[0], :shape[1], :shape[2]]

        # convert to position field
        coords = coords + position_grid_dask(
            shape, blocksize) * spacing.astype(np.float32)
        coords = da.around(coords, decimals=2)

        # write in parallel as 3D array to zarr file
        compressor = Blosc(cname='zstd', clevel=9, shuffle=Blosc.BITSHUFFLE)
        coords_disk = zarr.open(
            output,
            'w',
            shape=coords.shape,
            chunks=tuple(blocksize + [
                3,
            ]),
            dtype=coords.dtype,
            compressor=compressor,
        )
        da.to_zarr(coords, coords_disk)

        # return pointer to zarr file
        return coords_disk
Example #31
def preprocessing(dir_root, save_root, cameraNoiseMat=cameraNoiseMat, nsplit = (4, 4), num_t_chunks = 80,\
                  dask_tmp=None, memory_limit=0, is_bz2=False, is_singlePlane=False, down_sample_registration=1):
    from ..utils.getCameraInfo import getCameraInfo
    from tqdm import tqdm
    from ..utils.fileio import du
    # set worker
    cluster, client = fdask.setup_workers(is_local=True, dask_tmp=dask_tmp, memory_limit=memory_limit)
    print_client_links(cluster)
    
    if isinstance(save_root, list):
        save_root_ext = save_root[1]
        save_root = save_root[0]
    
    print(f'Tmp files will be saved to {save_root}')
    if 'save_root_ext' in locals():
        print(f'With extended drive to {save_root_ext}')

    if not os.path.exists(f'{save_root}/denoised_data.zarr'):
        print('========================')
        print('Getting data info')
        if not is_bz2:
            files = sorted(glob(dir_root+'/*.h5'))
            chunks = File(files[0],'r')['default'].shape
            if not is_singlePlane:
                data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files])
            else:
                if len(chunks)==2:
                    data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files])
                else:
                    data = da.concatenate([da.from_array(File(fn,'r')['default'], chunks=(1, chunks[1], chunks[2])) for fn in files], axis=0)
            cameraInfo = getCameraInfo(dir_root)
        else:
            import xml.etree.ElementTree as ET
            from utils import load_bz2file
            dims = ET.parse(dir_root+'/ch0.xml')
            root = dims.getroot()
            for info in root.findall('info'):
                if info.get('dimensions'):
                    dims = info.get('dimensions')
            dims = dims.split('x')
            dims = [int(float(num)) for num in dims]
            files = sorted(glob(dir_root+'/*.stack.bz2'))
            imread = dask.delayed(lambda v: load_bz2file(v, dims), pure=True)
            lazy_data = [imread(fn) for fn in files]
            sample = lazy_data[0].compute()
            data = da.stack([da.from_delayed(fn, shape=sample.shape, dtype=sample.dtype) for fn in lazy_data])
            cameraInfo = getCameraInfo(dir_root)
            pixel_x0, pixel_x1, pixel_y0, pixel_y1 = [int(_) for _ in cameraInfo['camera_roi'].split('_')]
            pixel_x0 = pixel_x0-1
            pixel_y0 = pixel_y0-1
            cameraInfo['camera_roi'] = '%d_%d_%d_%d'%(pixel_x0, pixel_x1, pixel_y0, pixel_y1)
            chunks = sample.shape
        # pixel denoise
        print('========================')
        print('Denoising camera noise')
        if not is_singlePlane:
            denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo))
        else:
            denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo), new_axis=1)
        print('Denoising camera noise -- save data')
        denoised_data.to_zarr(f'{save_root}/denoised_data.zarr')
        num_t = denoised_data.shape[0]
        
    print('Denoising camera noise -- load saved data')
    denoised_data = da.from_zarr(f'{save_root}/denoised_data.zarr')
    chunks = denoised_data.shape[1:]
    num_t = denoised_data.shape[0]

    # save and compute reference image
    print('Compute reference image ---')
    if not os.path.exists(f'{save_root}/motion_fix_.h5'):
        med_win = len(denoised_data)//2
        ref_img = denoised_data[med_win-50:med_win+50].mean(axis=0).compute()
        save_h5(f'{save_root}/motion_fix_.h5', ref_img, dtype='float16')

    print('--- Done computing reference image')

    # compute affine transform
    print('Registration to reference image ---')
    # create trans_affs file
    if not os.path.exists(f'{save_root}/trans_affs.npy'):
        ref_img = File(f'{save_root}/motion_fix_.h5', 'r')['default'].value
        ref_img = ref_img.max(axis=0, keepdims=True)
        if down_sample_registration==1:
            trans_affine = denoised_data.map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), chunks=(1,4,4)).compute()
        else:
            #### downsample trans_affine case
            trans_affine = denoised_data[0::down_sample_registration].map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), chunks=(1,4,4)).compute()
            len_dat = denoised_data.shape[0]
            trans_affine = rigid_interp(trans_affine, down_sample_registration, len_dat)
        # save trans_affs file
        np.save(f'{save_root}/trans_affs.npy', trans_affine)
    # load trans_affs file
    trans_affine_ = np.load(f'{save_root}/trans_affs.npy')
    trans_affine_ = da.from_array(trans_affine_, chunks=(1,4,4))
    print('--- Done registering to the reference image')

    # apply affine transform
    if not os.path.exists(f'{save_root}/motion_corrected_data.zarr'):
        # fix memory issue to load data all together for transpose on local machine
        # load data
        # swap axes
        splits_ = np.array_split(np.arange(num_t).astype('int'), num_t_chunks)
        print(f'Processing total {num_t_chunks} chunks in time.......')
        # estimate size of data to store
        used_ = du(f'{save_root}/denoised_data.zarr/')
        est_data_size = int(used_.decode('utf-8'))//(2**20*num_t_chunks*2)+5 #kb to Gb
        for nz, n_split in enumerate(splits_):
            if not os.path.exists(save_root+'/motion_corrected_data_chunks_%03d.zarr'%(nz)):
                if 'save_root_ext' in locals():
                    if os.path.exists(save_root_ext+'/motion_corrected_data_chunks_%03d.zarr'%(nz)):
                        continue
                print('Apply registration to rechunk layer %03d'%(nz))
                trans_data_ = da.map_blocks(apply_transform3d, denoised_data[n_split], trans_affine_[n_split], chunks=(1, *denoised_data.shape[1:]), dtype='float16')
                print('Starting to rechunk layer %03d'%(nz))
                trans_data_t_z = trans_data_.rechunk((-1, 1, chunks[1]//nsplit[0], chunks[2]//nsplit[1])).transpose((1, 2, 3, 0))
                # check space availability
                _, _, free_ = shutil.disk_usage(f'{save_root}/')
                if (free_//(2**30)) > est_data_size:
                    print(f'Remaining space {free_//(2**30)} GB..... -- start to save at {save_root}')
                    trans_data_t_z.to_zarr(save_root+'/motion_corrected_data_chunks_%03d.zarr'%(nz))
                else:
                    try:
                        print(f'Remaining space {free_//(2**30)} GB..... -- start to save at {save_root_ext}')
                        trans_data_t_z.to_zarr(save_root_ext+'/motion_corrected_data_chunks_%03d.zarr'%(nz))
                    except Exception as e:
                        # if any error -- break the code
                        print(e)    
                        fdask.terminate_workers(cluster, client)
                        return None
                del trans_data_t_z
                gc.collect()
                print('finishing rechunking time chunk -- %03d of %03d'%(nz, num_t_chunks))

        print('Remove temporary registration files')
        if os.path.exists(f'{save_root}/denoised_data.zarr'):
            shutil.rmtree(f'{save_root}/denoised_data.zarr')
        for ext_files in tqdm(glob(save_root_ext+'/motion_corrected_data_chunks_*.zarr')):
            print(f'Moving file {ext_files} to Tmp-file folder.....')
            shutil.move(ext_files, save_root+'/')
    fdask.terminate_workers(cluster, client)
    return None
Example #32
def preprocessing_cluster(dir_root, save_root, cameraNoiseMat=cameraNoiseMat, nsplit = (4, 4), num_t_chunks = 80,\
                  dask_tmp=None, memory_limit=0, is_bz2=False, is_singlePlane=False, down_sample_registration=1):
    from ..utils.getCameraInfo import getCameraInfo
    # set worker
    cluster, client = fdask.setup_workers(numCore=200, is_local=False, dask_tmp=dask_tmp, memory_limit=memory_limit)
    print_client_links(cluster)

    if not os.path.exists(f'{save_root}/denoised_data.zarr'):
        if not is_bz2:
            files = sorted(glob(dir_root+'/*.h5'))
            chunks = File(files[0],'r')['default'].shape
            if not is_singlePlane:
                data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files])
            else:
                if len(chunks)==2:
                    data = da.stack([da.from_array(File(fn,'r')['default'], chunks=chunks) for fn in files])
                else:
                    data = da.concatenate([da.from_array(File(fn,'r')['default'], chunks=(1, chunks[1], chunks[2])) for fn in files], axis=0)
            cameraInfo = getCameraInfo(dir_root)
        else:
            import xml.etree.ElementTree as ET
            from utils import load_bz2file
            dims = ET.parse(dir_root+'/ch0.xml')
            root = dims.getroot()
            for info in root.findall('info'):
                if info.get('dimensions'):
                    dims = info.get('dimensions')
            dims = dims.split('x')
            dims = [int(float(num)) for num in dims]
            files = sorted(glob(dir_root+'/*.stack.bz2'))
            imread = dask.delayed(lambda v: load_bz2file(v, dims), pure=True)
            lazy_data = [imread(fn) for fn in files]
            sample = lazy_data[0].compute()
            data = da.stack([da.from_delayed(fn, shape=sample.shape, dtype=sample.dtype) for fn in lazy_data])
            cameraInfo = getCameraInfo(dir_root)
            pixel_x0, pixel_x1, pixel_y0, pixel_y1 = [int(_) for _ in cameraInfo['camera_roi'].split('_')]
            pixel_x0 = pixel_x0-1
            pixel_y0 = pixel_y0-1
            cameraInfo['camera_roi'] = '%d_%d_%d_%d'%(pixel_x0, pixel_x1, pixel_y0, pixel_y1)
            chunks = sample.shape
        # pixel denoise
        if not is_singlePlane:
            denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo))
        else:
            denoised_data = data.map_blocks(lambda v: pixelDenoiseImag(v, cameraNoiseMat=cameraNoiseMat, cameraInfo=cameraInfo), new_axis=1)
        denoised_data.to_zarr(f'{save_root}/denoised_data.zarr')
        num_t = denoised_data.shape[0]
    else:
        denoised_data = da.from_zarr(f'{save_root}/denoised_data.zarr')
        chunks = denoised_data.shape[1:]
        num_t = denoised_data.shape[0]

    # save and compute reference image
    print('Compute reference image ---')
    if not os.path.exists(f'{save_root}/motion_fix_.h5'):
        med_win = len(denoised_data)//2
        ref_img = denoised_data[med_win-50:med_win+50].mean(axis=0).compute()
        save_h5(f'{save_root}/motion_fix_.h5', ref_img, dtype='float16')

    print('--- Done computing reference image')

    # compute affine transform
    print('Registration to reference image ---')
    # create trans_affs file
    if not os.path.exists(f'{save_root}/trans_affs.npy'):
        ref_img = File(f'{save_root}/motion_fix_.h5', 'r')['default'].value
        ref_img = ref_img.max(axis=0, keepdims=True)
        if down_sample_registration==1:
            trans_affine = denoised_data.map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), chunks=(1,4,4)).compute()
        else:
            #### downsample trans_affine case
            trans_affine = denoised_data[0::down_sample_registration].map_blocks(lambda x: estimate_rigid2d(x, fixed=ref_img), dtype='float32', drop_axis=(3), chunks=(1,4,4)).compute()
            len_dat = denoised_data.shape[0]
            trans_affine = rigid_interp(trans_affine, down_sample_registration, len_dat)
        # save trans_affs file
        np.save(f'{save_root}/trans_affs.npy', trans_affine)
    # load trans_affs file
    trans_affine_ = np.load(f'{save_root}/trans_affs.npy')
    trans_affine_ = da.from_array(trans_affine_, chunks=(1,4,4))
    print('--- Done registering to the reference image')

    trans_data_ = da.map_blocks(apply_transform3d, denoised_data, trans_affine_, chunks=(1, *denoised_data.shape[1:]), dtype='float16')
    trans_data_t = trans_data_.rechunk((-1, 1, chunks[1]//nsplit[0], chunks[2]//nsplit[1])).transpose((1, 2, 3, 0))
    trans_data_t.to_zarr(f'{save_root}/motion_corrected_data.zarr')
    fdask.terminate_workers(cluster, client)

    print('Remove temporary registration files')
    if os.path.exists(f'{save_root}/denoised_data.zarr'):
        shutil.rmtree(f'{save_root}/denoised_data.zarr')
    return None
Example #33
def xr_geomedian(ds, axis="time", where=None, **kw):
    """

    :param ds: xr.Dataset|xr.DataArray|numpy array

    Other parameters:
    **kwargs -- passed on to pcm.gnmpcm
       maxiters   : int         1000
       eps        : float       0.0001
       num_threads: int| None   None
    """
    from hdstats import nangeomedian_pcm

    def norm_input(ds, axis):
        if isinstance(ds, xr.DataArray):
            xx = ds
            if len(xx.dims) != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            if axis is not None and xx.dims[3] != axis:
                raise ValueError(
                    f"Can only reduce last dimension, expect: y,x,band,{axis}")
            return None, xx, xx.data
        elif isinstance(ds, xr.Dataset):
            xx = reshape_for_geomedian(ds, axis)
            return ds, xx, xx.data
        else:  # assume numpy or similar
            xx_data = ds
            if xx_data.ndim != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            return None, None, xx_data

    kw.setdefault("nocheck", True)
    kw.setdefault("num_threads", 1)
    kw.setdefault("eps", 1e-6)

    ds, xx, xx_data = norm_input(ds, axis)
    is_dask = dask.is_dask_collection(xx_data)

    if where is not None:
        if is_dask:
            raise NotImplementedError(
                "Dask version doesn't support output masking currently")

        if where.shape != xx_data.shape[:2]:
            raise ValueError("Shape for `where` parameter doesn't match")
        set_nan = ~where
    else:
        set_nan = None

    if is_dask:
        if xx_data.shape[-2:] != xx_data.chunksize[-2:]:
            xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1))

        data = da.map_blocks(
            lambda x: nangeomedian_pcm(x, **kw),
            xx_data,
            name=randomize("geomedian"),
            dtype=xx_data.dtype,
            drop_axis=3,
        )
    else:
        data = nangeomedian_pcm(xx_data, **kw)

    if set_nan is not None:
        data[set_nan, :] = np.nan

    if xx is None:
        return data

    dims = xx.dims[:-1]
    cc = {k: xx.coords[k] for k in dims}
    xx_out = xr.DataArray(data, dims=dims, coords=cc)

    if ds is None:
        xx_out.attrs.update(xx.attrs)
        return xx_out

    ds_out = xx_out.to_dataset(dim="band")
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return ds_out
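
A hypothetical usage sketch, assuming hdstats is installed and `ds` is an xr.Dataset of reflectance bands with y, x and time dimensions (so reshape_for_geomedian can build the y,x,band,time cube):

gm = xr_geomedian(ds, axis="time", num_threads=2, eps=1e-5)
gm = gm.compute()   # triggers the per-chunk nangeomedian_pcm calls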
Example #34
    def get_reflectance(self, sun_zenith, sat_zenith, azidiff, bandname, redband=None):
        """Get the reflectance from the three sun-sat angles"""
        # Get wavelength in nm for band:
        if isinstance(bandname, float):
            LOG.warning('A wavelength is provided instead of band name - ' +
                        'disregard the relative spectral responses and assume ' +
                        'it is the effective wavelength: %f (micro meter)', bandname)
            wvl = bandname * 1000.0
        else:
            wvl = self.get_effective_wavelength(bandname)
            wvl = wvl * 1000.0

        rayl, wvl_coord, azid_coord, satz_sec_coord, sunz_sec_coord = self.get_reflectance_lut()

        # force dask arrays
        compute = False
        if HAVE_DASK and not isinstance(sun_zenith, Array):
            compute = True
            sun_zenith = from_array(sun_zenith, chunks=sun_zenith.shape)
            sat_zenith = from_array(sat_zenith, chunks=sat_zenith.shape)
            azidiff = from_array(azidiff, chunks=azidiff.shape)
            if redband is not None:
                redband = from_array(redband, chunks=redband.shape)

        clip_angle = rad2deg(arccos(1. / sunz_sec_coord.max()))
        sun_zenith = clip(sun_zenith, 0, clip_angle)
        sunzsec = 1. / cos(deg2rad(sun_zenith))
        clip_angle = rad2deg(arccos(1. / satz_sec_coord.max()))
        sat_zenith = clip(sat_zenith, 0, clip_angle)
        satzsec = 1. / cos(deg2rad(sat_zenith))
        shape = sun_zenith.shape

        if not(wvl_coord.min() < wvl < wvl_coord.max()):
            LOG.warning(
                "Effective wavelength for band %s outside 400-800 nm range!",
                str(bandname))
            LOG.info(
                "Set the rayleigh/aerosol reflectance contribution to zero!")
            if HAVE_DASK:
                chunks = sun_zenith.chunks if redband is None else redband.chunks
                res = zeros(shape, chunks=chunks)
                return res.compute() if compute else res
            else:
                return zeros(shape)

        idx = np.searchsorted(wvl_coord, wvl)
        wvl1 = wvl_coord[idx - 1]
        wvl2 = wvl_coord[idx]

        fac = (wvl2 - wvl) / (wvl2 - wvl1)
        raylwvl = fac * rayl[idx - 1, :, :, :] + (1 - fac) * rayl[idx, :, :, :]
        tic = time.time()

        smin = [sunz_sec_coord[0], azid_coord[0], satz_sec_coord[0]]
        smax = [sunz_sec_coord[-1], azid_coord[-1], satz_sec_coord[-1]]
        orders = [
            len(sunz_sec_coord), len(azid_coord), len(satz_sec_coord)]
        f_3d_grid = atleast_2d(raylwvl.ravel())

        if HAVE_DASK and isinstance(smin[0], Array):
            # compute all of these at the same time before passing to the interpolator
            # otherwise they are computed separately
            smin, smax, orders, f_3d_grid = da.compute(smin, smax, orders, f_3d_grid)
        minterp = MultilinearInterpolator(smin, smax, orders)
        minterp.set_values(f_3d_grid)

        if HAVE_DASK:
            ipn = map_blocks(self._do_interp, minterp, sunzsec, azidiff,
                             satzsec, dtype=raylwvl.dtype, chunks=azidiff.chunks)
        else:
            ipn = self._do_interp(minterp, sunzsec, azidiff, satzsec)

        LOG.debug("Time - Interpolation: {0:f}".format(time.time() - tic))

        ipn *= 100
        res = ipn
        if redband is not None:
            res = where(redband < 20., res,
                        (1 - (redband - 20) / 80) * res)

        res = clip(res, 0, 100)
        if compute:
            res = res.compute()
        return res
Example #35
def Fst(
    ds: Dataset,
    *,
    estimator: Optional[str] = None,
    stat_divergence: Hashable = variables.stat_divergence,
    merge: bool = True,
) -> Dataset:
    """Compute Fst between pairs of cohorts.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    estimator
        Determines the formula to use for computing Fst.
        If None (the default), or ``Hudson``, Fst is calculated
        using the method of Hudson (1992) elaborated by Bhatia et al. (2013),
        (the same estimator as scikit-allel).
        Other supported estimators include ``Nei`` (1986), (the same estimator
        as tskit).
    stat_divergence
        Divergence variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_divergence_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`divergence`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Fst value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_Fst_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
           [[        nan, -0.16666667],
            [-0.16666667,         nan]],
    <BLANKLINE>
           [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
           [[        nan, -0.33333333],
            [-0.33333333,         nan]],
    <BLANKLINE>
           [[        nan,  0.2       ],
            [ 0.2       ,         nan]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.Fst(ds)["stat_Fst"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[        nan, -0.22222222],
            [-0.22222222,         nan]],
    <BLANKLINE>
           [[        nan,  0.        ],
            [ 0.        ,         nan]]])
    """
    known_estimators = {"Hudson": _Fst_Hudson, "Nei": _Fst_Nei}
    if estimator is not None and estimator not in known_estimators:
        raise ValueError(
            f"Estimator '{estimator}' is not a known estimator: {known_estimators.keys()}"
        )
    estimator = estimator or "Hudson"
    ds = define_variable_if_absent(ds, variables.stat_divergence,
                                   stat_divergence, divergence)
    variables.validate(ds, {stat_divergence: variables.stat_divergence_spec})

    n_cohorts = ds.dims["cohorts"]
    gs = da.asarray(ds.stat_divergence)
    shape = (gs.chunks[0], n_cohorts, n_cohorts)
    fst = da.map_blocks(known_estimators[estimator],
                        gs,
                        chunks=shape,
                        dtype=np.float64)
    # TODO: reinstate assert (first dim could be either variants or windows)
    # assert_array_shape(fst, n_windows, n_cohorts, n_cohorts)
    new_ds = create_dataset(
        {variables.stat_Fst: (("windows", "cohorts_0", "cohorts_1"), fst)})
    return conditional_merge_datasets(ds, new_ds, merge)
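
A minimal usage sketch mirroring the docstring example above, additionally selecting the ``Nei`` estimator (assumes sgkit is installed; values are whatever the simulated calls produce):

import numpy as np
import sgkit as sg
import xarray as xr

ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)
ds["sample_cohort"] = xr.DataArray(np.repeat([0, 1], ds.dims["samples"] // 2), dims="samples")

fst_hudson = sg.Fst(ds)["stat_Fst"]                # default Hudson/Bhatia estimator
fst_nei = sg.Fst(ds, estimator="Nei")["stat_Fst"]  # Nei estimator, as used by tskit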
Ejemplo n.º 38
0
def xr_geomedian_tmad(ds, axis='time', where=None, **kw):
    """
    :param ds: xr.Dataset|xr.DataArray|numpy array
    Other parameters:
    **kwargs -- passed on to hdstats.nangeomedian_pcm
       maxiters   : int          1000
       eps        : float        1e-6
       num_threads: int | None   1
    """

    import hdstats
    def gm_tmad(arr, **kw):
        """
        arr: a high dimensional numpy array where the last dimension will be reduced.

        returns: a numpy array with one less dimension than input.
        """
        gm = hdstats.nangeomedian_pcm(arr, **kw)
        nt = kw.pop('num_threads', None)
        emad = hdstats.emad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis]
        smad = hdstats.smad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis]
        bcmad = hdstats.bcmad_pcm(arr, gm, num_threads=nt)[:,:, np.newaxis]
        return np.concatenate([gm, emad, smad, bcmad], axis=-1)


    def norm_input(ds, axis):
        if isinstance(ds, xr.DataArray):
            xx = ds
            if len(xx.dims) != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            if axis is not None and xx.dims[3] != axis:
                raise ValueError(f"Can only reduce last dimension, expect: y,x,band,{axis}")
            return None, xx, xx.data
        elif isinstance(ds, xr.Dataset):
            xx = reshape_for_geomedian(ds, axis)
            return ds, xx, xx.data
        else:  # assume numpy or similar
            xx_data = ds
            if xx_data.ndim != 4:
                raise ValueError("Expect 4 dimensions on input: y,x,band,time")
            return None, None, xx_data

    kw.setdefault('nocheck', False)
    kw.setdefault('num_threads', 1)
    kw.setdefault('eps', 1e-6)

    ds, xx, xx_data = norm_input(ds, axis)
    is_dask = dask.is_dask_collection(xx_data)

    if where is not None:
        if is_dask:
            raise NotImplementedError("Dask version doesn't support output masking currently")

        if where.shape != xx_data.shape[:2]:
            raise ValueError("Shape for `where` parameter doesn't match")
        set_nan = ~where
    else:
        set_nan = None

    if is_dask:
        if xx_data.shape[-2:] != xx_data.chunksize[-2:]:
            xx_data = xx_data.rechunk(xx_data.chunksize[:2] + (-1, -1))

        data = da.map_blocks(lambda x: gm_tmad(x, **kw),
                             xx_data,
                             name=randomize('geomedian'),
                             dtype=xx_data.dtype, 
                             chunks=xx_data.chunks[:-2] + (xx_data.chunks[-2][0]+3,),
                             drop_axis=3)
    else:
        data = gm_tmad(xx_data, **kw)

    if set_nan is not None:
        data[set_nan, :] = np.nan

    if xx is None:
        return data

    dims = xx.dims[:-1]
    cc = {k: xx.coords[k] for k in dims}
    cc[dims[-1]] = np.hstack([xx.coords[dims[-1]].values,['edev', 'sdev', 'bcdev']])
    xx_out = xr.DataArray(data, dims=dims, coords=cc)

    if ds is None:
        xx_out.attrs.update(xx.attrs)
        return xx_out

    ds_out = xx_out.to_dataset(dim='band')
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return assign_crs(ds_out, crs=ds.geobox.crs)
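
A minimal call sketch, assuming ``hdstats`` is installed and the input is a 4-D dask-backed DataArray in ``y, x, band, time`` order (the synthetic data and coordinate names here are illustrative only):

import numpy as np
import xarray as xr
import dask.array as da

yxbt = xr.DataArray(
    da.random.random((100, 100, 4, 12), chunks=(50, 50, -1, -1)).astype("float32"),
    dims=("y", "x", "band", "time"),
    coords={"y": np.arange(100), "x": np.arange(100),
            "band": ["red", "green", "blue", "nir"]},
)
out = xr_geomedian_tmad(yxbt, axis="time", eps=1e-5)
# out is lazy, with dims (y, x, band) and 'edev', 'sdev', 'bcdev' appended to the band coordinate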
Ejemplo n.º 39
0
    # subtract mean
    axes = tuple(np.arange(b1.ndim, dtype=int)[b1.ndim//2:])
    b1 -= b1.mean(axis=axes, keepdims=True)
    b2 -= b2.mean(axis=axes, keepdims=True)
    # numerator of corrcoef
    numerator = np.multiply(b1, b2).mean(axis=axes, keepdims=False)
    # denominator of corrcoef
    dof = np.prod( b1.shape[slice(axes[0], axes[-1]+1)] )
    b1_std = np.sqrt( (b1**2).mean(axis=axes, keepdims=False) / dof )
    b2_std = np.sqrt( (b2**2).mean(axis=axes, keepdims=False) / dof )
    denominator = np.multiply(b1_std, b2_std)
    # divide
    out = np.divide(numerator, denominator)
    return out


if __name__ == '__main__':
    # imports needed by this part of the snippet
    import h5py
    import dask.array as da
    from dask.diagnostics import Profiler, ResourceProfiler, ProgressBar, visualize

    f1 = h5py.File("test.h5", "r")
    f2 = h5py.File("test2.h5", "r")
    arr1 = da.from_array(f1["arr"])
    arr2 = da.from_array(f2["arr"])

    block_shape = (10, 10)

    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof,\
            ProgressBar():
        out = da.map_blocks(corrcoef, arr1, arr2, block_shape,
                chunks=(400, 400))
        da.to_hdf5("out.h5", "/arr", out)
    visualize([prof, rprof])
Ejemplo n.º 40
0
os.chdir("/home/ubuntu/observations")
new = xr.open_dataset("observations.nc")
anomalies_obs = new.air[1320:1632, :, :]
anomalies_obs = anomalies_obs.reindex(lat=new['lat'],
                                      lon=new['lon'],
                                      method='nearest')
obs = anomalies_obs

anomaly_obs_slopes = np.zeros(
    [anomalies_obs.lat.shape[0], anomalies_obs.lon.shape[0]])

anomalies_obs_da = da.from_array(anomalies_obs.data, chunks=[312, 45, 45])

anomaly_obs_slopes = da.map_blocks(my_linregress,
                                   anomalies_obs_da,
                                   dtype=np.ndarray,
                                   drop_axis=[0])

anomaly_obs_slopes = anomaly_obs_slopes.compute(num_workers=num_workers)

print('Linear regression slopes:')
print(anomaly_obs_slopes)
print('Units are temperature change for each data point in degrees/year.')

end1 = time.time()
section1 = end1 - start1
print(section1)
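
``my_linregress`` is defined elsewhere in this script. A hedged sketch of what such a per-block function could look like, given that each chunk arrives with shape ``(time, lat, lon)`` and ``drop_axis=[0]`` expects the time axis to be reduced (the name and the exact least-squares formulation are assumptions):

import numpy as np

def my_linregress(block):
    # block has shape (time, lat, lon); fit y = a*t + b independently per pixel
    nt = block.shape[0]
    t = np.arange(nt, dtype=float)
    t_centered = t - t.mean()
    y_centered = block - block.mean(axis=0, keepdims=True)
    # slope = cov(t, y) / var(t), computed for every (lat, lon) pixel at once
    slopes = np.tensordot(t_centered, y_centered, axes=(0, 0)) / (t_centered ** 2).sum()
    return slopes  # shape (lat, lon), matching drop_axis=[0] in map_blocks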

######################################################################################################
'''
SECTION 2 - COMPUTING MODEL MEAN (1950-1980)
Ejemplo n.º 41
0
def map_overlap(func,
                *args,
                depth=None,
                boundary=None,
                trim=True,
                align_arrays=True,
                **kwargs):
    """ Map a function over blocks of arrays with some overlap

    We share neighboring zones between blocks of the array, map a
    function, and then trim away the neighboring strips.

    Parameters
    ----------
    func: function
        The function to apply to each extended block
    args : dask arrays
    depth: int, tuple, dict or list
        The number of elements that each block should share with its neighbors
        If a tuple or dict then this can be different per axis.
        If a list then each element of that list must be an int, tuple or dict
        defining depth for the corresponding array in `args`.
        Asymmetric depths may be specified using a dict value of (-/+) tuples.
        Note that asymmetric depths are currently only supported when
        ``boundary`` is 'none'.
        The default value is 0.
    boundary: str, tuple, dict or list
        How to handle the boundaries.
        Values include 'reflect', 'periodic', 'nearest', 'none',
        or any constant value like 0 or np.nan.
        If a list then each element must be a str, tuple or dict defining the
        boundary for the corresponding array in `args`.
        The default value is 'reflect'.
    trim: bool
        Whether or not to trim ``depth`` elements from each block after
        calling the map function.
        Set this to False if your mapping function already does this for you.
    align_arrays: bool
        Whether or not to align chunks along equally sized dimensions when
        multiple arrays are provided.  This allows for larger chunks in some
        arrays to be broken into smaller ones that match chunk sizes in other
        arrays such that they are compatible for block function mapping. If
        this is false, then an error will be thrown if arrays do not already
        have the same number of blocks in each dimension.
    **kwargs:
        Other keyword arguments valid in ``map_blocks``

    Examples
    --------
    >>> import numpy as np
    >>> import dask.array as da

    >>> x = np.array([1, 1, 2, 3, 3, 3, 2, 1, 1])
    >>> x = da.from_array(x, chunks=5)
    >>> def derivative(x):
    ...     return x - np.roll(x, 1)

    >>> y = x.map_overlap(derivative, depth=1, boundary=0)
    >>> y.compute()
    array([ 1,  0,  1,  1,  0,  0, -1, -1,  0])

    >>> x = np.arange(16).reshape((4, 4))
    >>> d = da.from_array(x, chunks=(2, 2))
    >>> d.map_overlap(lambda x: x + x.size, depth=1).compute()
    array([[16, 17, 18, 19],
           [20, 21, 22, 23],
           [24, 25, 26, 27],
           [28, 29, 30, 31]])

    >>> func = lambda x: x + x.size
    >>> depth = {0: 1, 1: 1}
    >>> boundary = {0: 'reflect', 1: 'none'}
    >>> d.map_overlap(func, depth, boundary).compute()  # doctest: +NORMALIZE_WHITESPACE
    array([[12,  13,  14,  15],
           [16,  17,  18,  19],
           [20,  21,  22,  23],
           [24,  25,  26,  27]])
    """
    # Look for invocation using deprecated single-array signature
    # map_overlap(x, func, depth, boundary=None, trim=True, **kwargs)
    if isinstance(func, Array) and callable(args[0]):
        warnings.warn(
            "Detected use of signature map_overlap(x, func) rather than "
            "map_overlap(func, *args) for multi-array support. Arguments "
            "will be swapped in this case but such an exception will not "
            "be made in a future release.",
            FutureWarning,
        )
        sig = ["func", "depth", "boundary", "trim"]
        depth = get(sig.index("depth"), args, depth)
        boundary = get(sig.index("boundary"), args, boundary)
        trim = get(sig.index("trim"), args, trim)
        func, args = args[0], [func]

    if not callable(func):
        raise TypeError("First argument must be callable function, not {}\n"
                        "Usage:   da.map_overlap(function, x)\n"
                        "   or:   da.map_overlap(function, x, y, z)".format(
                            type(func).__name__))
    if not all(isinstance(x, Array) for x in args):
        raise TypeError("All variadic arguments must be arrays, not {}\n"
                        "Usage:   da.map_overlap(function, x)\n"
                        "   or:   da.map_overlap(function, x, y, z)".format(
                            [type(x).__name__ for x in args]))

    # Coerce depth and boundary arguments to lists of individual
    # specifications for each array argument
    def coerce(xs, arg, fn):
        if not isinstance(arg, list):
            arg = [arg] * len(xs)
        return [fn(x.ndim, a) for x, a in zip(xs, arg)]

    depth = coerce(args, depth, coerce_depth)
    boundary = coerce(args, boundary, coerce_boundary)

    # Align chunks in each array to a common size
    if align_arrays:
        # Reverse unification order to allow block broadcasting
        inds = [list(reversed(range(x.ndim))) for x in args]
        _, args = da.core.unify_chunks(*list(concat(zip(args, inds))),
                                       warn=False)

    for i, x in enumerate(args):
        for j in range(x.ndim):
            if isinstance(depth[i][j], tuple) and boundary[i][j] != "none":
                raise NotImplementedError(
                    "Asymmetric overlap is currently only implemented "
                    "for boundary='none', however boundary for dimension "
                    "{} in array argument {} is {}".format(
                        j, i, boundary[i][j]))

    def assert_int_chunksize(xs):
        assert all(type(c) is int for x in xs for cc in x.chunks for c in cc)

    assert_int_chunksize(args)
    args = [
        overlap(x, depth=d, boundary=b)
        for x, d, b in zip(args, depth, boundary)
    ]
    assert_int_chunksize(args)
    x = da.map_blocks(func, *args, **kwargs)
    assert_int_chunksize([x])
    if trim:
        # Find index of array argument with maximum rank and break ties by choosing first provided
        i = sorted(enumerate(args), key=lambda v: (v[1].ndim, -v[0]))[-1][0]
        # Trim using depth/boundary setting for array of highest rank
        return trim_internal(x, depth[i], boundary[i])
    else:
        return x
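
The docstring examples above use the ``Array.map_overlap`` method form; the functional form defined here is also exposed as ``da.map_overlap`` (as the usage strings in the error messages suggest), so the same derivative example can be written as a short sketch:

import numpy as np
import dask.array as da

x = da.from_array(np.array([1, 1, 2, 3, 3, 3, 2, 1, 1]), chunks=5)
y = da.map_overlap(lambda b: b - np.roll(b, 1), x, depth=1, boundary=0)
y.compute()   # array([ 1,  0,  1,  1,  0,  0, -1, -1,  0])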
Ejemplo n.º 42
0
    def run(
        self,
        pars,
        sims,
        sim_status,
        indices,
        collect_in_memory: bool = True,
        batch_size: Optional[int] = None,
    ):
        """Run the simulator on the input parameters.

        Args:
            pars: array with all the input parameters. Should have shape
                (num. samples, num. parameters)
            sims: dictionary of arrays where to store the simulation output.
                All arrays should have the number of samples as the size of the
                first dimension
            sim_status: array where to store the simulation status (size should
                be equal to the number of samples)
            indices: indices of the samples that need to be run by the
                simulator
            collect_in_memory: if True, collect the simulation output in
                memory; if False, instruct Dask workers to save the output to
                the corresponding arrays. The latter option is asynchronous,
                thus this method immediately returns.
            batch_size: simulations will be submitted in batches of the
                specified size
        """
        self.set_dask_cluster(self.cluster)

        # open parameter array as Dask array
        chunks = getattr(pars, "chunks", "auto")
        z = da.from_array(pars, chunks=chunks)
        idx = da.from_array(indices, chunks=(batch_size or -1, ))
        z = z[idx]

        z = z.persist()  # load the parameters in the distributed memory

        # block-wise run the model function on the parameter array
        out = da.map_blocks(
            _run_model_chunk,
            z,
            model=self.model,
            sim_shapes=self.sim_shapes,
            fail_on_non_finite=self.fail_on_non_finite,
            drop_axis=1,
            dtype=object,
        )

        # FIXME: Deprecated?
        #        print("Simulator: Running...")
        #        bag = db.from_sequence(z, npartitions=npartitions)
        #        bag = bag.map(_run_one_sample, self.model, self.fail_on_non_finite)
        #        result = bag.compute(scheduler=self.client or "processes")
        #        print("Simulator: ...done.")
        #        return result

        # split result dictionary and simulation status array
        results = out.map_blocks(getitem, 0, dtype=object)
        status = out.map_blocks(getitem, 1, meta=np.array(()), dtype=int)

        # unpack array of dictionaries to dictionary of arrays
        result_dict = {}
        for obs, shape in self.sim_shapes.items():
            result_dict[obs] = results.map_blocks(
                getitem,
                obs,
                new_axis=[i + 1 for i in range(len(shape))],
                chunks=(z.chunks[0], *shape),
                meta=np.array(()),
                dtype=float,
            )

        sources = [result_dict[k] for k in self.sim_shapes.keys()]
        targets = [sims[k] for k in self.sim_shapes.keys()]

        if collect_in_memory:
            # submit computation and collect results
            *sources, status = self.client.compute([*sources, status],
                                                   sync=True)

            # update simulation results
            for source, target in zip(sources, targets):
                target[indices.tolist()] = source

            # finally, update the simulation status
            sim_status[indices.tolist()] = status

        else:
            sources = da.store(
                sources=sources,
                targets=targets,
                regions=(indices.tolist(), ),
                lock=False,
                compute=False,
                return_stored=True,
            )

            # submit computation
            *sources, status = self.client.persist([*sources, status])

            # the following dummy array is generated after results are stored.
            zeros_when_done = [
                source.map_blocks(
                    lambda x: np.zeros(x.shape[0], dtype=int),
                    chunks=(source.chunks[0], ),
                    drop_axis=[i for i in range(1, source.ndim)],
                    meta=np.array((), dtype=int),
                    dtype=int,
                ) for source in sources
            ]
            status = sum([*zeros_when_done, status])
            status = status.store(
                target=sim_status,
                regions=(indices.tolist(), ),
                lock=False,
                compute=False,
                return_stored=True,
            )
            # when the simulation results are stored, we can update the status
            status = self.client.persist(status)
            fire_and_forget(status)
Ejemplo n.º 43
0
def int_geomedian(ds, scale=1, offset=0, wk_rows=-1, as_array=False, **kw):
    """ds -- xr.Dataset (possibly dask) with dims: (time, y, x) for each band

        on output time dimension is removed

    :param ds: Dataset with int data variables
    :param scale: Normalize data for running computation (output is scaled back to original values)
    :param offset: ``(x*scale + offset)``
    :param wk_rows: reduce memory requirements by processing that many rows of a chunk at a time
    :param as_array: If set to True return DataArray with band dimension instead of Dataset
    :param kw: Passed on to hdstats (eps=1e-4, num_threads=1, maxiters=10_000, nocheck=True)

    """
    band_names = [dv.name for dv in ds.data_vars.values()]
    xx, *_ = ds.data_vars.values()
    nodata = getattr(xx, "nodata", None)

    is_dask = dask.is_dask_collection(xx)
    if is_dask:
        if xx.data.chunksize[0] != xx.shape[0]:
            ds = ds.chunk(chunks={xx.dims[0]: -1})
            xx, *_ = ds.data_vars.values()

    nt, ny, nx = xx.shape
    bands = [dv.data for dv in ds.data_vars.values()]
    band = bands[0]
    nb = len(bands)
    dtype = band.dtype

    kw.setdefault("nocheck", True)
    kw.setdefault("num_threads", 1)
    kw.setdefault("eps", 1e-4)
    kw.setdefault("maxiters", 10_000)

    if is_dask:
        chunks = ((nb, ), *xx.chunks[1:])

        data = da.map_blocks(
            int_geomedian_np,
            *bands,
            nodata=nodata,
            scale=scale,
            offset=offset,
            wk_rows=wk_rows,
            **kw,
            name=randomize("geomedian"),
            dtype=dtype,
            chunks=chunks,
            drop_axis=[0],  # time is dropped
            new_axis=[0],
        )  # band is added on the left
    else:
        data = int_geomedian_np(*bands,
                                nodata=nodata,
                                scale=scale,
                                offset=offset,
                                wk_rows=wk_rows,
                                **kw)

    dims = ("band", *xx.dims[1:])
    cc = {k: xx.coords[k] for k in dims[1:]}
    cc["band"] = band_names

    da_out = xr.DataArray(data, dims=dims, coords=cc)

    if as_array:
        if nodata is not None:
            da_out.attrs["nodata"] = nodata
        return da_out

    ds_out = da_out.to_dataset(dim="band")
    ds_out.attrs.update(ds.attrs)
    for b in ds.data_vars.keys():
        src, dst = ds[b], ds_out[b]
        dst.attrs.update(src.attrs)

    return ds_out
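
A usage sketch with a synthetic integer dask Dataset (assumes the ``int_geomedian_np`` helper and ``hdstats`` referenced above are available):

import numpy as np
import xarray as xr
import dask.array as da

nt, ny, nx = 12, 200, 200
ds = xr.Dataset(
    {
        band: (("time", "y", "x"),
               da.random.randint(0, 10_000, size=(nt, ny, nx),
                                 chunks=(nt, 100, 100)).astype("uint16"))
        for band in ("red", "green", "blue")
    },
    coords={"time": np.arange(nt), "y": np.arange(ny), "x": np.arange(nx)},
)
gm_ds = int_geomedian(ds, scale=1 / 10_000)                 # Dataset, time removed
gm_da = int_geomedian(ds, scale=1 / 10_000, as_array=True)  # DataArray with 'band' dim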
Ejemplo n.º 44
0
def geomedian_with_mads(
    src: Union[xr.Dataset, xr.DataArray],
    compute_mads: bool = True,
    compute_count: bool = True,
    out_chunks: Optional[Tuple[int, int, int]] = None,
    reshape_strategy: str = "mem",
    scale: float = 1.0,
    offset: float = 0.0,
    eps: Optional[float] = None,
    maxiters: int = 1000,
    num_threads: int = 1,
    **kw,
) -> xr.Dataset:
    """
    Compute Geomedian on Dask backed Dataset.

    NOTE: Default configuration of this code assumes that entire input can be
    loaded in to RAM on the Dask worker. It also assumes that there is only one
    worker in the cluster, or that entire task will get scheduled on one single
    worker only. See ``reshape_strategy`` parameter.

    :param src: xr.Dataset or a single array in YXBT order, bands can be either
                float or integer with `nodata` values to indicate gaps in data.

    :param compute_mads: Whether to compute smad,emad,bcmad statistics

    :param compute_count: Whether to compute count statistic (number of
                          contributing observations per output pixel)

    :param out_chunks: Advanced option, allows to rechunk output internally,
                       order is ``(ny, nx, nband)``

    :param reshape_strategy: One of ``mem`` (default) or ``yxbt``. This is only
                             applicable when supplying a Dataset object. It
                             controls how the Dataset is reshaped into a
                             DataArray in the format expected by the Geomedian
                             code. If you have enough RAM and use a
                             single-worker Dask cluster, use ``mem``; it should
                             be the most efficient. If there is not enough RAM
                             to load the entire input you can try ``yxbt``
                             mode, but you might still run out of RAM anyway.
                             If using a multi-worker Dask cluster you have to
                             use the ``yxbt`` strategy.

    :param scale, offset: Only used when input contains integer values, actual
                          Geomedian will run on scaled values
                          ``scale*X+offset``. Only affects internal
                          computation, final result is scaled back to the
                          original value range.

    :param eps: Termination criteria passed on to geomedian algorithm

    :param maxiters: Maximum number of iterations done per output pixel

    :param num_threads: Configure internal concurrency of the Geomedian
                        computation. Default is 1 as we assume that Dask will
                        run a bunch of those concurrently.

    :param work_chunks: Default is ``(100, 100)``, only applicable when input
                        is Dataset.
    """
    if not dask.is_dask_collection(src):
        raise ValueError("This method only works on Dask inputs")

    if isinstance(src, xr.DataArray):
        yxbt = src
    else:
        # TODO: better automatic defaults for work_chunks
        ny, nx = kw.get("work_chunks", (100, 100))
        if reshape_strategy == "mem":
            yxbt = yxbt_sink(src, (ny, nx, -1, -1))
        elif reshape_strategy == "yxbt":
            yxbt = reshape_yxbt(src, yx_chunks=(ny, nx))
        else:
            raise ValueError(
                f"Reshape strategy '{reshape_strategy}' not understood use one of: mem or yxbt"
            )

    ny, nx, nb, nt = yxbt.shape
    nodata = yxbt.attrs.get("nodata", None)
    assert yxbt.chunks is not None
    if yxbt.data.numblocks[2:4] != (1, 1):
        raise ValueError(
            "There should be one dask block along time and band dimension")

    n_extras = (3 if compute_mads else 0) + (1 if compute_count else 0)
    chunks = (*yxbt.chunks[:2], (nb + n_extras, ))

    is_float = yxbt.dtype.kind == "f"

    if eps is None:
        eps = 1e-4 if is_float else 0.1 * scale

    op = functools.partial(
        _gm_mads_compute_f32,
        compute_mads=compute_mads,
        compute_count=compute_count,
        nodata=nodata,
        scale=scale,
        offset=offset,
        eps=eps,
        maxiters=maxiters,
        num_threads=num_threads,
    )

    _gm = da.map_blocks(op,
                        yxbt.data,
                        dtype="float32",
                        drop_axis=3,
                        chunks=chunks,
                        name="geomedian")
    if out_chunks is not None:
        _gm = _gm.rechunk(out_chunks)

    gm_data = _gm[:, :, :nb]
    if not is_float:
        gm_data = da.map_blocks(
            lambda x: from_float_np(
                x, yxbt.dtype, nodata, scale=1 / scale, offset=offset / scale),
            gm_data,
            dtype=yxbt.dtype,
        )

    dims = yxbt.dims[:3]
    coords = {k: yxbt.coords[k] for k in dims}
    result = xr.DataArray(data=gm_data,
                          dims=dims,
                          coords=coords,
                          attrs=yxbt.attrs).to_dataset("band")

    for dv in result.data_vars.values():
        dv.attrs.update(yxbt.attrs)

    next_stat = nb
    if compute_mads:
        smad = _gm[:, :, next_stat + 0]
        emad = _gm[:, :, next_stat + 1]
        bcmad = _gm[:, :, next_stat + 2]
        next_stat += 3

        if not is_float:
            emad = emad * (1 / scale)

        result["smad"] = xr.DataArray(data=smad,
                                      dims=dims[:2],
                                      coords=result.coords)
        result["emad"] = xr.DataArray(data=emad,
                                      dims=dims[:2],
                                      coords=result.coords)
        result["bcmad"] = xr.DataArray(data=bcmad,
                                       dims=dims[:2],
                                       coords=result.coords)

    if compute_count:
        count = _gm[:, :, next_stat].astype("uint16")
        next_stat += 1
        result["count"] = xr.DataArray(data=count,
                                       dims=dims[:2],
                                       coords=result.coords)

    return result
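
A sketch of a direct call with a pre-reshaped float YXBT DataArray, which bypasses the ``yxbt_sink``/``reshape_yxbt`` helpers (the synthetic data is illustrative; ``_gm_mads_compute_f32`` is assumed to be available alongside this function):

import numpy as np
import xarray as xr
import dask.array as da

ny, nx, nb, nt = 200, 200, 4, 12
yxbt = xr.DataArray(
    da.random.random((ny, nx, nb, nt), chunks=(100, 100, -1, -1)).astype("float32"),
    dims=("y", "x", "band", "time"),
    coords={
        "y": np.arange(ny),
        "x": np.arange(nx),
        "band": ["red", "green", "blue", "nir"],
    },
)
out = geomedian_with_mads(yxbt, compute_mads=True, compute_count=True)
# out: Dataset with one variable per band plus 'smad', 'emad', 'bcmad' and 'count'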
Ejemplo n.º 45
0
def run_crefl(refl,
              coeffs,
              lon,
              lat,
              sensor_azimuth,
              sensor_zenith,
              solar_azimuth,
              solar_zenith,
              avg_elevation=None,
              percent=False,
              use_abi=False):
    """Run main crefl algorithm.

    All input parameters are per-pixel values meaning they are the same size
    and shape as the input reflectance data, unless otherwise stated.

    :param refl: reflectance data array for the band being corrected
    :param coeffs: tuple of coefficients for the band (see `get_coefficients`)
    :param lon: input swath longitude array
    :param lat: input swath latitude array
    :param sensor_azimuth: input swath sensor azimuth angle array
    :param sensor_zenith: input swath sensor zenith angle array
    :param solar_azimuth: input swath solar azimuth angle array
    :param solar_zenith: input swath solar zenith angle array
    :param avg_elevation: average elevation (usually pre-calculated and stored in CMGDEM.hdf)
    :param percent: True if input reflectances are on a 0-100 scale instead of 0-1 scale (default: False)

    """
    # FUTURE: Find a way to compute the average elevation before hand
    # Get digital elevation map data for our granule, set ocean fill value to 0
    if avg_elevation is None:
        LOG.debug("No average elevation information provided in CREFL")
        #height = np.zeros(lon.shape, dtype=np.float)
        height = 0.
    else:
        LOG.debug("Using average elevation information provided to CREFL")
        lat[(lat <= -90) | (lat >= 90)] = np.nan
        lon[(lon <= -180) | (lon >= 180)] = np.nan
        row = ((90.0 - lat) * avg_elevation.shape[0] / 180.0).astype(np.int32)
        col = ((lon + 180.0) * avg_elevation.shape[1] / 360.0).astype(np.int32)
        space_mask = da.isnull(lon) | da.isnull(lat)
        row[space_mask] = 0
        col[space_mask] = 0

        def _avg_elevation_index(avg_elevation, row, col):
            return avg_elevation[row, col]

        height = da.map_blocks(_avg_elevation_index,
                               avg_elevation,
                               row,
                               col,
                               dtype=avg_elevation.dtype)
        height = xr.DataArray(height, dims=['y', 'x'])
        # negative heights aren't allowed, clip to 0
        height = height.where((height >= 0.) & ~space_mask, 0.0)
        del lat, lon, row, col
    mus = da.cos(da.deg2rad(solar_zenith))
    mus = mus.where(mus >= 0)
    muv = da.cos(da.deg2rad(sensor_zenith))
    phi = solar_azimuth - sensor_azimuth

    if use_abi:
        LOG.debug("Using ABI CREFL algorithm")
        a_O3 = [268.45, 0.5, 115.42, -3.2922]
        a_H2O = [0.0311, 0.1, 92.471, -1.3814]
        a_O2 = [0.4567, 0.007, 96.4884, -1.6970]
        G_O3 = G_calc(solar_zenith, a_O3) + G_calc(sensor_zenith, a_O3)
        G_H2O = G_calc(solar_zenith, a_H2O) + G_calc(sensor_zenith, a_H2O)
        G_O2 = G_calc(solar_zenith, a_O2) + G_calc(sensor_zenith, a_O2)
        # Note: bh2o values are actually ao2 values for abi
        sphalb, rhoray, TtotraytH2O, tOG = get_atm_variables_abi(
            mus, muv, phi, height, G_O3, G_H2O, G_O2, *coeffs)
    else:
        LOG.debug("Using original VIIRS CREFL algorithm")
        sphalb, rhoray, TtotraytH2O, tOG = get_atm_variables(
            mus, muv, phi, height, *coeffs)

    del solar_azimuth, solar_zenith, sensor_zenith, sensor_azimuth
    # Note: Assume that fill/invalid values are either NaN or we are dealing
    # with masked arrays
    if percent:
        corr_refl = ((refl / 100.) / tOG - rhoray) / TtotraytH2O
    else:
        corr_refl = (refl / tOG - rhoray) / TtotraytH2O
    corr_refl /= (1.0 + corr_refl * sphalb)
    return corr_refl.clip(REFLMIN, REFLMAX)
Ejemplo n.º 46
0
def convert_to_pfts(category_cube, conversion, min_category, max_category):
    """Convert landcover categories to PFT fractions using a given conversion table.

    Args:
        category_cube (iris.cube.Cube): Cube containing the landcover categories.
        conversion (dict): Conversion factors from categories to PFT fractions.
        min_category (int): Minimum possible land cover category index (inclusive).
        max_category (int): Maximum possible land cover category index (inclusive).

    Returns:
        iris.cube.CubeList: Cubes containing the PFTs on the same grid as
            `category_cube`.

    """
    if not category_cube.has_lazy_data():
        raise ValueError("Source cube needs to have lazy data.")

    pft_names = get_mapping_pfts(conversion)
    array_mapping = get_mapping_arrays(pft_names, conversion)

    n_pfts = next(iter(array_mapping.values()))["pfts"].size
    if not all(values["pfts"].size == n_pfts
               for values in array_mapping.values()):
        raise ValueError(
            "All categories need to map on to the same number of PFT fractions."
        )

    # Simple array structure containing the mapping from landcover categories to PFTs in a
    # way that is easier to accelerate.
    structured_mapping = np.zeros((max_category - min_category + 1, n_pfts),
                                  dtype=np.uint8)
    for landcover_index in range(min_category, max_category + 1):
        if landcover_index in array_mapping:
            structured_mapping[landcover_index] = array_mapping[
                landcover_index]["pfts"]
        else:
            structured_mapping[landcover_index] = np.zeros(n_pfts,
                                                           dtype=np.uint8)

    @parallel_njit
    def _execute_mapping(category, structured_mapping, n_pfts):
        """Carry out conversion to PFT fractions."""
        pfts = np.zeros((*category.shape, *(n_pfts, )))
        for index in np.ndindex(category.shape):
            pfts[index] = structured_mapping[category[index]]
        return pfts

    pft_data = da.map_blocks(
        _execute_mapping,
        category_cube.core_data(),
        structured_mapping=structured_mapping,
        n_pfts=n_pfts,
        meta=np.array([], dtype=np.uint8),
        # We are only adding a dimension with size `n_pfts`. All other chunks remain.
        chunks=(*category_cube.core_data().chunks, (n_pfts, )),
        new_axis=category_cube.ndim,
        dtype=np.uint8,
    )

    cubes = iris.cube.CubeList()
    for i, pft_name in enumerate(pft_names):
        pft_cube = category_cube.copy(data=pft_data[..., i])
        pft_cube.var_name = None
        pft_cube.standard_name = None
        pft_cube.long_name = pft_name
        pft_cube.units = "1"
        cubes.append(pft_cube)
    return cubes
Ejemplo n.º 47
0
def linear_regression(XL: ArrayLike, XC: ArrayLike,
                      Y: ArrayLike) -> LinearRegressionResult:
    """Efficient linear regression estimation for multiple covariate sets

    Parameters
    ----------
    XL
        [array-like, shape: (M, N)]
        "Loop" covariates for which N separate regressions will be run
    XC
        [array-like, shape: (M, P)]
        "Core" covariates included in the regressions for each loop
        covariate. All P core covariates are used in each of the N
        loop covariate regressions.
    Y
        [array-like, shape: (M, O)]
        Continuous outcomes

    Returns
    -------
    Dataclass containing:

    beta : [array-like, shape: (N, O)]
        Beta values associated with each loop covariate and outcome
    t_value : [array-like, shape: (N, O)]
        T statistics for each beta
    p_value : [array-like, shape: (N, O)]
        P values as float in [0, 1]
    """
    XL, XC = da.asarray(XL), da.asarray(XC)  # Coerce for `lstsq`
    if set([x.ndim for x in [XL, XC, Y]]) != {2}:
        raise ValueError("All arguments must be 2D")
    n_core_covar, n_loop_covar, n_obs, n_outcome = (
        XC.shape[1],
        XL.shape[1],
        Y.shape[0],
        Y.shape[1],
    )
    dof = n_obs - n_core_covar - 1
    if dof < 1:
        raise ValueError(
            "Number of observations (N) too small to calculate sampling statistics. "
            "N must be greater than number of core covariates (C) plus one. "
            f"Arguments provided: N={n_obs}, C={n_core_covar}.")

    # Apply orthogonal projection to eliminate core covariates
    # Note: QR factorization or SVD should be used here to find
    # what are effectively OLS residuals rather than matrix inverse
    # to avoid need for MxM array; additionally, dask.lstsq fails
    # with numpy arrays
    XLP = XL - da.dot(XC, da.linalg.lstsq(XC, XL)[0])
    assert XLP.shape == (n_obs, n_loop_covar)
    YP = Y - da.dot(XC, da.linalg.lstsq(XC, Y)[0])
    assert YP.shape == (n_obs, n_outcome)

    # Estimate coefficients for each loop covariate
    # Note: A key assumption here is that 0-mean residuals
    # from projection require no extra terms in variance
    # estimate for loop covariates (columns of G), which is
    # only true when an intercept is present.
    XLPS = (XLP**2).sum(axis=0, keepdims=True).T
    assert XLPS.shape == (n_loop_covar, 1)
    B = da.dot(XLP.T, YP) / XLPS
    assert B.shape == (n_loop_covar, n_outcome)

    # Compute residuals for each loop covariate and outcome separately
    YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...]
    assert YR.shape == (n_obs, n_loop_covar, n_outcome)
    RSS = (YR**2).sum(axis=0)
    assert RSS.shape == (n_loop_covar, n_outcome)
    # Get t-statistics for coefficient estimates
    T = B / np.sqrt(RSS / dof / XLPS)
    assert T.shape == (n_loop_covar, n_outcome)
    # Match to p-values
    # Note: t dist not implemented in Dask so this must be delayed,
    # see https://github.com/dask/dask/issues/6857
    P = da.map_blocks(lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof),
                      T,
                      dtype="float64")
    assert P.shape == (n_loop_covar, n_outcome)

    return LinearRegressionResult(beta=B, t_value=T, p_value=P)
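
A small synthetic call sketch (shapes follow the docstring; ``LinearRegressionResult`` is the dataclass referenced in the signature, and the intercept column in ``XC`` matters per the note in the code):

import numpy as np
import dask.array as da

rng = np.random.default_rng(0)
M, N, P, O = 100, 5, 3, 2  # observations, loop covariates, core covariates, outcomes
XL = da.from_array(rng.normal(size=(M, N)), chunks=(50, N))
XC = da.from_array(np.column_stack([np.ones(M), rng.normal(size=(M, P - 1))]), chunks=(50, P))
Y = da.from_array(rng.normal(size=(M, O)), chunks=(50, O))

res = linear_regression(XL, XC, Y)
beta, t_value, p_value = res.beta, res.t_value, res.p_value  # each has shape (N, O)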
Ejemplo n.º 48
0
def map_blocks(func, *args, **kwargs):
    """Dispatch to ``da.map_blocks`` for dask arrays; otherwise call ``func`` directly."""
    array = args[0]
    if isinstance(array, da.Array):
        return da.map_blocks(func, *args, **kwargs)
    else:
        # NumPy fall-through: dask-specific keyword arguments (dtype, chunks, ...)
        # are not meaningful here, so only the positional arguments are forwarded.
        return func(*args)
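
A usage sketch of the dispatch behaviour: numpy input calls the function directly, dask input goes through ``da.map_blocks``.

import numpy as np
import dask.array as da

x_np = np.arange(6)
x_da = da.arange(6, chunks=3)

map_blocks(np.square, x_np)             # plain ndarray result
map_blocks(np.square, x_da).compute()   # same values, computed block-wise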
Ejemplo n.º 49
0
def _ridge_regression_cv(
    X: Array, Y: Array, alphas: NDArray, n_zero_reg: Optional[int] = None
) -> Tuple[Array, Array, Array, Array]:
    assert alphas.ndim == 1
    assert X.ndim == 2
    assert Y.ndim == 2
    assert X.numblocks[1] == 1
    assert Y.numblocks[1] == 1
    assert X.chunks[0] == Y.chunks[0]
    n_block, n_obs, n_covar, n_outcome, n_alpha = (
        X.numblocks[0],
        X.shape[0],
        X.shape[1],
        Y.shape[1],
        alphas.shape[0],
    )
    obs_chunks = X.chunks[0]

    # Project samples and outcomes noting that resulting chunks are
    # of fixed size even if the chunks along the observation dim
    # are not uniform (i.e. |X.chunks[0]| != 1)
    XtX = stack(da.map_blocks(lambda x: x.T @ x, X, chunks=(X.shape[1],) * 2))
    assert_block_shape(XtX, n_block, 1, 1)
    assert_chunk_shape(XtX, 1, n_covar, n_covar)
    XtY = stack(da.map_blocks(lambda x, y: x.T @ y, X, Y, chunks=(n_covar, n_outcome)))
    assert_block_shape(XtY, n_block, 1, 1)
    assert_chunk_shape(XtY, 1, n_covar, n_outcome)

    # Invert the projections in each block so that each
    # contains data from all other blocks *except* itself
    XtX = unstack(XtX.sum(axis=0) - XtX)
    assert_block_shape(XtX, n_block, 1)
    assert_chunk_shape(XtX, n_covar, n_covar)
    XtY = unstack(XtY.sum(axis=0) - XtY)
    assert_block_shape(XtY, n_block, 1)
    assert_chunk_shape(XtY, n_covar, n_outcome)
    assert XtX.numblocks == XtY.numblocks

    # Regress for all outcomes/alphas and add new axis for ridge parameters
    B = da.map_blocks(
        ridge_regression,
        XtX,
        XtY,
        chunks=(n_alpha, n_covar, n_outcome),
        new_axis=[0],
        alphas=alphas,
        n_zero_reg=n_zero_reg,
        meta=da.utils.meta_from_array(XtX),
    )
    assert_block_shape(B, 1, n_block, 1)
    assert_chunk_shape(B, n_alpha, n_covar, n_outcome)
    assert_array_shape(B, n_alpha, n_block * n_covar, n_outcome)

    # Generate predictions for all outcomes/alphas
    assert B.numblocks == (1,) + X.numblocks
    YP = da.map_blocks(
        lambda x, b: x @ b, X, B, chunks=(alphas.size, obs_chunks, n_outcome)
    )
    assert_block_shape(YP, 1, n_block, 1)
    assert_chunk_shape(YP, n_alpha, obs_chunks[0], n_outcome)
    assert_array_shape(YP, n_alpha, n_obs, n_outcome)

    return XtX, XtY, B, YP
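
A call sketch for this private helper, assuming the ``ridge_regression``, ``stack``/``unstack`` and assertion helpers it references are available in the surrounding module (arrays are chunked along the sample dimension only, as the assertions require):

import numpy as np
import dask.array as da

rng = np.random.default_rng(0)
n_obs, n_covar, n_outcome = 40, 5, 2
X = da.from_array(rng.normal(size=(n_obs, n_covar)), chunks=(10, n_covar))
Y = da.from_array(rng.normal(size=(n_obs, n_outcome)), chunks=(10, n_outcome))
alphas = np.array([0.01, 0.1, 1.0])

XtX, XtY, B, YP = _ridge_regression_cv(X, Y, alphas)
# B: (len(alphas), n_blocks * n_covar, n_outcome); YP: leave-one-block-out predictions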
Ejemplo n.º 50
0
def divergence(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute divergence between pairs of cohorts.

    The entry at (i, j) is the divergence between cohort i and cohort j,
    except for the case where i and j are the same, in which case the entry
    is the diversity for cohort i.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the divergence value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_divergence_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5       , 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
           [[0.66666667, 0.5       ],
            [0.5       , 0.5       ]],
    <BLANKLINE>
           [[0.66666667, 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
           [[0.5       , 0.375     ],
            [0.375     , 0.5       ]],
    <BLANKLINE>
           [[0.5       , 0.625     ],
            [0.625     , 0.5       ]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1.83333333, 1.5       ],
            [1.5       , 1.83333333]],
    <BLANKLINE>
           [[1.        , 1.        ],
            [1.        , 1.        ]]])
    """

    ds = define_variable_if_absent(ds, variables.cohort_allele_count,
                                   cohort_allele_count, count_cohort_alleles)
    variables.validate(
        ds, {cohort_allele_count: variables.cohort_allele_count_spec})
    ac = ds[cohort_allele_count]

    n_variants = ds.dims["variants"]
    n_cohorts = ds.dims["cohorts"]
    ac = da.asarray(ac)
    shape = (ac.chunks[0], n_cohorts, n_cohorts)
    d = da.map_blocks(_divergence, ac, chunks=shape, dtype=np.float64)
    assert_array_shape(d, n_variants, n_cohorts, n_cohorts)

    if has_windows(ds):
        div = window_statistic(
            d,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=d.dtype,
            axis=0,
        )
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("windows", "cohorts_0", "cohorts_1"),
                div,
            )
        })
    else:
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("variants", "cohorts_0", "cohorts_1"),
                d,
            )
        })
    return conditional_merge_datasets(ds, new_ds, merge)
Ejemplo n.º 51
0
def _stage_2(
    YP: Array,
    X: Array,
    Y: Array,
    alphas: Optional[NDArray] = None,
    normalize: bool = True,
    _glow_adj_alpha: bool = False,
    _glow_adj_scaling: bool = False,
) -> Tuple[Array, Array]:
    """Stage 2 - WGR Meta Regression

    This stage will train separate ridge regression models for each outcome
    using the predictions from stage 1 for that same outcome as features. These
    predictions are then evaluated based on R2 score to determine an optimal
    "meta" estimator (see `_stage_1` for the "base" estimator description). Results
    then include only predictions and coefficients from this optimal model.

    For more details, see the level 1 regression model described in step 1
    of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert YP.ndim == 4
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0]
    assert YP.chunks[2] == X.chunks[0] == Y.chunks[0]
    # Assert single chunks for covariates and outcomes
    assert X.numblocks[1] == Y.numblocks[1] == 1
    # Extract shape statistics
    n_variant_block, n_alpha_1 = YP.shape[:2]
    n_sample_block = Y.numblocks[0]
    n_sample, n_outcome = Y.shape
    n_covar = X.shape[1]
    n_indvar = n_covar + n_variant_block * n_alpha_1
    sample_chunks = Y.chunks[0]

    if normalize:
        assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1)
        assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome)
        # See: https://github.com/projectglow/glow/issues/260
        if _glow_adj_scaling:
            YP = da.map_blocks(
                lambda x: (x - x.mean(axis=2, keepdims=True))
                / x.std(axis=2, keepdims=True),
                YP,
            )
        else:
            YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True)
    # Transpose for refit on level 1 predictions
    YP = YP.transpose((3, 2, 0, 1))
    assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1)

    if alphas is None:
        # See: https://github.com/projectglow/glow/issues/255
        if _glow_adj_alpha:
            alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome)
        else:
            alphas = get_alphas(n_variant_block * n_alpha_1)
    n_alpha_2 = alphas.size

    YR = []
    BR = []
    for i in range(n_outcome):
        # Slice and reshape to new 2D covariate matrix;
        # The order of raveling in trailing dimensions is important
        # and later reshapes will assume variants, alphas order
        XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1))
        # Prepend covariates and chunk along first dim only
        XPB = da.concatenate((X, XPB), axis=1)
        XPB = XPB.rechunk(chunks=(None, -1))
        assert_array_shape(XPB, n_sample, n_indvar)
        assert XPB.numblocks == (n_sample_block, 1)
        # Extract outcome vector
        YB = Y[:, [i]]
        assert XPB.ndim == YB.ndim == 2
        # Fit and predict folds for each parameter
        BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:]
        assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1)
        assert_array_shape(YPB, n_alpha_2, n_sample, 1)
        BR.append(BB)
        YR.append(YPB)

    # Concatenate predictions along outcome dimension
    YR = da.concatenate(YR, axis=2)
    assert_block_shape(YR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1)
    assert_array_shape(YR, n_alpha_2, n_sample, n_outcome)
    # Move samples to last dim so all others are batch
    # dims for R2 calculations
    YR = da.transpose(YR, (0, 2, 1))
    assert_array_shape(YR, n_alpha_2, n_outcome, n_sample)
    YR = YR.rechunk((-1, -1, None))
    assert_block_shape(YR, 1, 1, n_sample_block)
    assert YR.shape[1:] == Y.T.shape

    # Concatenate betas along outcome dimension
    BR = da.concatenate(BR, axis=2)
    assert_block_shape(BR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(BR, n_alpha_2, n_indvar, 1)
    assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome)

    # Compute R2 scores within each sample block for each outcome + alpha
    R2 = da.stack(
        [
            r2_score(YR.blocks[..., i], Y.T.blocks[..., i])
            # Avoid warnings on R2 calculations for blocks with single rows
            if YR.chunks[-1][i] > 1 else da.full(YR.shape[:-1], np.nan)
            for i in range(n_sample_block)
        ]
    )
    assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome)
    # Coerce to finite or nan before nan-aware mean
    R2 = da.where(da.isfinite(R2), R2, np.nan)
    # Find highest mean alpha score for each outcome across blocks
    R2M = da.nanmean(R2, axis=0)
    assert_array_shape(R2M, n_alpha_2, n_outcome)
    # Identify index for the alpha value with the highest mean score
    R2I = da.argmax(R2M, axis=0)
    assert_array_shape(R2I, n_outcome)

    # Choose the predictions corresponding to the model with best score
    YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1)
    YRM = YRM.rechunk((None, -1))
    assert_block_shape(YRM, n_sample_block, 1)
    assert_chunk_shape(YRM, sample_chunks[0], n_outcome)
    assert_array_shape(YRM, n_sample, n_outcome)
    # Choose the betas corresponding to the model with the best score
    BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1)
    BRM = BRM.rechunk((None, -1))
    assert_block_shape(BRM, n_sample_block, 1)
    assert_chunk_shape(BRM, n_indvar, n_outcome)
    assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome)
    return BRM, YRM
Ejemplo n.º 52
0
def pbs(
    ds: Dataset,
    *,
    stat_Fst: Hashable = variables.stat_Fst,
    cohorts: Optional[Sequence[Union[Tuple[int, int, int],
                                     Tuple[str, str, str]]]] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the population branching statistic (PBS) between cohort triples.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    stat_Fst
        Fst variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_Fst_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`Fst`.
    cohorts
        The cohort triples to compute statistics for, specified as a sequence of
        tuples of cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the PBS value between cohort triples, as defined by
    :data:`sgkit.variables.stat_pbs_spec`.
    Shape (variants, cohorts, cohorts, cohorts), or
    (windows, cohorts, cohorts, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=6)

    >>> # Divide samples into three named cohorts
    >>> n_cohorts = 3
    >>> sample_cohort = np.repeat(range(n_cohorts), ds.dims["samples"] // n_cohorts)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")
    >>> cohort_names = [f"co_{i}" for i in range(n_cohorts)]
    >>> ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names, "cohorts_2": cohort_names})

    >>> # Divide into two windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.pbs(ds)["stat_pbs"].sel(cohorts_0="co_0", cohorts_1="co_1", cohorts_2="co_2").values # doctest: +NORMALIZE_WHITESPACE
    array([ 0.      , -0.160898])
    """

    ds = define_variable_if_absent(ds, variables.stat_Fst, stat_Fst, Fst)
    variables.validate(ds, {stat_Fst: variables.stat_Fst_spec})

    fst = ds[variables.stat_Fst]
    fst = fst.clip(min=0, max=(1 - np.finfo(float).epsneg))

    t = -np.log(1 - fst)
    n_cohorts = ds.dims["cohorts"]
    n_windows = ds.dims["windows"]
    assert_array_shape(t, n_windows, n_cohorts, n_cohorts)

    # calculate PBS triples
    t = da.asarray(t)
    shape = (t.chunks[0], n_cohorts, n_cohorts, n_cohorts)

    cohorts = cohorts or list(itertools.combinations(range(n_cohorts),
                                                     3))  # type: ignore
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts_0", None))

    p = da.map_blocks(lambda t: _pbs_cohorts(t, ct),
                      t,
                      chunks=shape,
                      new_axis=3,
                      dtype=np.float64)
    assert_array_shape(p, n_windows, n_cohorts, n_cohorts, n_cohorts)

    new_ds = create_dataset({
        variables.stat_pbs:
        (["windows", "cohorts_0", "cohorts_1", "cohorts_2"], p)
    })
    return conditional_merge_datasets(ds, new_ds, merge)
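
``_pbs_cohorts`` is defined elsewhere; for reference, the transformed values ``t = -log(1 - Fst)`` feed the standard population branching statistic of Yi et al. (2010). A hedged sketch of the per-triple arithmetic it is expected to apply (the helper name ``pbs_triple`` is hypothetical):

import numpy as np

def pbs_triple(t_ij, t_ik, t_jk):
    # Branch length specific to cohort i:
    # PBS_i = (T_ij + T_ik - T_jk) / 2, with T = -log(1 - Fst)
    return (t_ij + t_ik - t_jk) / 2

# e.g. for a single windowed Fst matrix `fst` of shape (cohorts, cohorts):
# t = -np.log(1 - fst); pbs_0 = pbs_triple(t[0, 1], t[0, 2], t[1, 2])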
Ejemplo n.º 53
0
def hardy_weinberg_test(ds: Dataset,
                        *,
                        genotype_counts: Optional[Hashable] = None,
                        ploidy: Optional[int] = None,
                        alleles: Optional[int] = None,
                        merge: bool = True) -> Dataset:
    """Exact test for HWE as described in Wigginton et al. 2005 [1].

    Parameters
    ----------
    ds
        Dataset containing genotype calls or precomputed genotype counts.
    genotype_counts
        Name of variable containing precomputed genotype counts, by default
        None. If not provided, these counts will be computed automatically
        from genotype calls. If present, must correspond to an (`N`, 3) array
        where `N` is equal to the number of variants and the 3 columns contain
        heterozygous, homozygous reference, and homozygous alternate counts
        (in that order) across all samples for a variant.
    ploidy
        Genotype ploidy, defaults to ``ploidy`` dimension of provided dataset.
        If the `ploidy` dimension is not present, then this value must be set explicitly.
        Currently HWE calculations are only supported for diploid datasets,
        i.e. ``ploidy`` must equal 2.
    alleles
        Genotype allele count, defaults to ``alleles`` dimension of provided dataset.
        If the `alleles` dimension is not present, then this value must be set explicitly.
        Currently HWE calculations are only supported for biallelic datasets,
        i.e. ``alleles`` must equal 2.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Warnings
    --------
    This function is only applicable to diploid, biallelic datasets.

    Returns
    -------
    Dataset containing (N = num variants):

    variant_hwe_p_value : [array-like, shape: (N,)]
        P values from HWE test for each variant as float in [0, 1].

    References
    ----------
    - [1] Wigginton, Janis E., David J. Cutler, and Goncalo R. Abecasis. 2005.
        “A Note on Exact Tests of Hardy-Weinberg Equilibrium.” American Journal of
        Human Genetics 76 (5): 887–93.

    Raises
    ------
    NotImplementedError
        If ploidy of provided dataset != 2
    NotImplementedError
        If maximum number of alleles in provided dataset != 2
    """
    ploidy = ploidy or ds.dims.get("ploidy")
    if not ploidy:
        raise ValueError(
            "`ploidy` parameter must be set when not present as dataset dimension."
        )
    if ploidy != 2:
        raise NotImplementedError(
            "HWE test only implemented for diploid genotypes")

    alleles = alleles or ds.dims.get("alleles")
    if not alleles:
        raise ValueError(
            "`alleles` parameter must be set when not present as dataset dimension."
        )
    if alleles != 2:
        raise NotImplementedError(
            "HWE test only implemented for biallelic genotypes")

    # Use precomputed genotype counts if provided
    if genotype_counts is not None:
        variables.validate(ds,
                           {genotype_counts: variables.genotype_counts_spec})
        obs = list(da.asarray(ds[genotype_counts]).T)
    # Otherwise compute genotype counts from calls
    else:
        ds = count_genotypes(ds, dim="samples")
        obs = [
            da.asarray(ds[v]) for v in
            ["variant_n_het", "variant_n_hom_ref", "variant_n_hom_alt"]
        ]
    p = da.map_blocks(hardy_weinberg_p_value_vec_jit, *obs)
    new_ds = create_dataset({variables.variant_hwe_p_value: ("variants", p)})
    return conditional_merge_datasets(ds, new_ds, merge)
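
A minimal usage sketch against a simulated diploid, biallelic dataset (mirroring the examples earlier in this collection; the exact p-values depend on the simulated calls):

import sgkit as sg

ds = sg.simulate_genotype_call_dataset(n_variant=10, n_sample=50)
ds_hwe = sg.hardy_weinberg_test(ds)
print(ds_hwe["variant_hwe_p_value"].values)  # one p-value per variant, in [0, 1]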