Example #1
    def transform(self, X, **kwargs):
        """Apply transforms to the data, and transform with the final estimator

        Parameters
        ----------
        X : xarray.DataArray
            Data to transform on. Must fulfill input requirements of first step
            of the model or pipeline.
        feature_dim : str, optional
            Name of feature dimension.
        **transform_params : dict of string -> object
            Parameters to the ``transform`` called at the end of all
            transformations in the pipeline.

        Returns
        -------
        y_trans : xarray.DataArray
        """

        kws = {'feature_dim': DEFAULT_FEATURE_DIM}
        kws.update(kwargs)

        X = self._to_feature_x(X, feature_dim=kws['feature_dim'])

        if X.chunks:
            return xr.map_blocks(_transform_wrapper, X, args=[self._models], kwargs=kws)
        else:
            return _transform_wrapper(X, self._models, **kws)
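
The helper `_transform_wrapper` is not part of this example. A hypothetical sketch of what such a per-block helper could look like is given below; the dimension names ("time", "lat", "lon") and the idea that `self._models` holds one fitted scikit-learn estimator per grid cell are assumptions, not taken from the original code.

import numpy as np
import xarray as xr

def _transform_wrapper(X, models, *, feature_dim):
    # Hypothetical per-block helper: apply each cell's fitted estimator to its
    # (sample, feature) slice. Assumes X is ordered ("time", feature_dim, "lat", "lon")
    # and `models` is an object-dtype DataArray of estimators over ("lat", "lon").
    out = xr.full_like(X, np.nan, dtype=float)
    for i in range(X.sizes["lat"]):
        for j in range(X.sizes["lon"]):
            est = models.isel(lat=i, lon=j).item()
            cell = X.isel(lat=i, lon=j).transpose("time", feature_dim)
            out[{"lat": i, "lon": j}] = est.transform(cell.values)
    return out

With that contract, the chunked (`xr.map_blocks`) branch and the in-memory branch above return the same result.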
Example #2
def compute_global_btt_quantile_complete_ensemble(sub_ds, name, q,
                                                  nr_time_steps,
                                                  sto_cache_size):
    chunk_dict = {"prob": 5}

    sub_ds = sub_ds.chunk(chunk_dict)

    fake_data = np.zeros((len(sub_ds.prob), len(sub_ds.time)))

    fake_array = xr.DataArray(data=fake_data, dims=['prob', "time"])

    fake_coords = {"prob": sub_ds.prob.data, "time": sub_ds.time.data}

    fake_ds = xr.Dataset(data_vars={
        name: fake_array
    }, coords=fake_coords).chunk(chunk_dict)

    ds_res = xr.map_blocks(func_chunk,
                           sub_ds,
                           args=(compute_global_btt_quantile, ),
                           kwargs={
                               "name": name,
                               "q": q,
                               "nr_time_steps": nr_time_steps,
                               "time_step_in_days":
                               params["time_step_in_days"],
                               "nr_sites": None,
                               "sto_cache_size": sto_cache_size
                           },
                           template=fake_ds)

    return ds_res
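
The key ingredient here is the zero-filled `fake_ds` template, which gives `map_blocks` the output's dims, coords and chunking without computing anything. A self-contained toy version of the same pattern, with made-up names (`quantile_of_block`, `btt_quantile`) standing in for `func_chunk` and `compute_global_btt_quantile`:

import numpy as np
import xarray as xr

sub_ds = xr.Dataset(
    {"btt": (("prob", "time"), np.random.rand(10, 24))},
    coords={"prob": np.arange(10), "time": np.arange(24)},
).chunk({"prob": 5})

def quantile_of_block(block, *, q):
    # dummy per-block computation with the same output structure as the template
    data = np.full((block.sizes["prob"], block.sizes["time"]), q)
    return xr.Dataset(
        {"btt_quantile": (("prob", "time"), data)},
        coords={"prob": block.prob, "time": block.time},
    )

# zero-filled template describing the output's dims, coords and chunking
fake_ds = xr.Dataset(
    {"btt_quantile": (("prob", "time"), np.zeros((10, 24)))},
    coords={"prob": sub_ds.prob.data, "time": sub_ds.time.data},
).chunk({"prob": 5})

res = xr.map_blocks(quantile_of_block, sub_ds, kwargs={"q": 0.5}, template=fake_ds)
print(res.compute())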
Example #3
    def predict(self, X, **kwargs):
        """Apply transforms to the data, and predict with the final estimator

        Parameters
        ----------
        X : xarray.DataArray
            Data to predict on. Must fulfill input requirements of first step
            of the model or pipeline.
        feature_dim : str, optional
            Name of feature dimension.
        **predict_params : dict of string -> object
            Parameters to the ``predict`` called at the end of all
            transformations in the pipeline. Note that while this may be
            used to return uncertainties from some models with return_std
            or return_cov, uncertainties that are generated by the
            transformations in the pipeline are not propagated to the
            final estimator.

        Returns
        -------
        y_pred : xarray.DataArray
        """

        kws = {'along_dim': self._dim, 'feature_dim': DEFAULT_FEATURE_DIM}
        kws.update(kwargs)

        X = self._to_feature_x(X, feature_dim=kws['feature_dim'])

        if X.chunks:
            return xr.map_blocks(_predict_wrapper, X, args=[self._models], kwargs=kws)
        else:
            return _predict_wrapper(X, self._models, **kws)
Example #4
def smooth2d(da: Var,
             datastore: DataStore = None,
             chunks: dict = None,
             **kwargs) -> xr.DataArray:
    '''
    Return an xr.DataArray smoothed along two dimensions.
    Works with both chunked (dask) and unchunked (numpy) data.
    Metadata attrs are adjusted according to camps metadata conventions.
    '''

    if isinstance(da, camps.Variable):
        da = da(datastore=datastore, chunks=chunks, **kwargs)
    else:
        if chunks:
            da = da.camps.chunk(chunks)

    x = da.camps.x.name
    y = da.camps.y.name

    # rechunk so that multiple chunks don't span x and y dims
    if da.chunks is not None:
        da = da.chunk({x: -1, y: -1})

    dims = (x, y)
    kwargs['dims'] = dims  # kwargs are passed to smooth2d_block

    da = xr.map_blocks(smooth2d_block, da, kwargs=kwargs, template=da)

    da.attrs['smooth'] = 'smooth_9point'

    return da
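
`smooth2d_block` is not shown in this example. A plausible stand-in is sketched below: a 9-point (3x3 box) average over the two named dims of each block, returning a block with the same structure so that `template=da` holds; the use of `scipy.ndimage.uniform_filter` is an assumption.

import numpy as np
import xarray as xr
from scipy.ndimage import uniform_filter

def smooth2d_block(da: xr.DataArray, *, dims, **unused) -> xr.DataArray:
    # 3x3 box average over the two smoothing dims, size 1 (no-op) on every other dim
    x, y = dims
    axes = (da.get_axis_num(x), da.get_axis_num(y))
    size = [3 if ax in axes else 1 for ax in range(da.ndim)]
    smoothed = uniform_filter(da.values.astype(float), size=size, mode="nearest")
    # cast back so the block matches the dtype promised by template=da
    return da.copy(data=smoothed.astype(da.dtype, copy=False))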
Example #5
def test_map_blocks_kwargs(obj):
    expected = xr.full_like(obj, fill_value=np.nan)
    with raise_if_dask_computes():
        actual = xr.map_blocks(xr.full_like,
                               obj,
                               kwargs=dict(fill_value=np.nan))
    assert_chunks_equal(expected.chunk(), actual)
    xr.testing.assert_identical(actual.compute(), expected.compute())
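
This test lets xarray infer the template automatically. The self-contained snippet below (not from the test suite) shows that the same call produces an identical result when an explicit, identically chunked template is supplied, which is the pattern most of the other examples here rely on:

import numpy as np
import xarray as xr

obj = xr.DataArray(np.zeros((4, 4)), dims=("x", "y"), name="demo").chunk({"x": 2})

# template inferred automatically from the function
auto = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan))

# same call with an explicit template; obj already has the right dims, dtype and chunks
explicit = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan), template=obj)

xr.testing.assert_identical(auto.compute(), explicit.compute())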
Example #6
def mld_dsigma(SALT, TEMP, dsigma=0.03, rho_chunks={'nlat': 16, 'nlon': 16}):
    """
    Compute MLD based on ∆σ criterion. Uses xarray.map_blocks.
    
    Parameters
    ----------
    
    SALT : xarray.DataArray
      Salinity
    TEMP : xarray.DataArray
      Potential temperature
    dsigma : float, optional
      The value for ∆σ.
      
    Returns
    -------
    
    MLD : xarray.DataArray
      The MLD (m) defined as the point in the water column where
      density exceeds rho[0] + dsigma.      
    """

    # determine dimensionality
    dims_in = SALT.dims
    assert dims_in == TEMP.dims, 'dimension mismatch'
    assert 'z_t' in SALT.coords, 'z_t not found in SALT coords'

    # drop ancillary coordinates (this may not be necessary)
    SALT = SALT.reset_coords(drop=True)
    TEMP = TEMP.reset_coords(drop=True)

    # compute density
    rho = pop_tools.eos(SALT.chunk({'z_t': 10}),
                        TEMP.chunk({'z_t': 10}),
                        depth=SALT.z_t * 0.).compute()

    if 'nlat' in rho.dims:
        rho = rho.assign_coords({
            'nlat':
            xr.DataArray(np.arange(len(SALT.nlat)), dims=('nlat')),
            'nlon':
            xr.DataArray(np.arange(len(SALT.nlon)), dims=('nlon')),
        })
    rho = rho.chunk(rho_chunks).persist()

    # compute and return MLD
    template = rho.isel(z_t=0).drop('z_t')
    template.attrs['long_name'] = 'MLD'
    template.attrs['units'] = SALT.z_t.attrs['units']
    template.name = 'MLD'

    return xr.map_blocks(
        _interp_mld,
        rho,
        kwargs=dict(dsigma=dsigma),
        template=template,
    )
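
`_interp_mld` is not included in this example. A simplified stand-in that honours the same block contract (a density block in, a 2-D MLD field matching `template` out) is sketched below; it returns the first model level exceeding the threshold rather than interpolating between levels.

import numpy as np
import xarray as xr

def _mld_first_exceedance(rho: xr.DataArray, dsigma: float = 0.03) -> xr.DataArray:
    # depth of the first level where density exceeds the surface value by dsigma
    exceeds = rho >= rho.isel(z_t=0, drop=True) + dsigma
    k = exceeds.argmax(dim='z_t')                     # index of first True along depth
    mld = rho.z_t.isel(z_t=k).where(exceeds.any(dim='z_t'))
    mld = mld.drop_vars('z_t').rename('MLD')
    mld.attrs['long_name'] = 'MLD'
    return mld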
Example #7
def test_map_blocks_object_method(obj):
    def func(obj):
        result = obj + obj.x + 5 * obj.y
        return result

    with raise_if_dask_computes():
        expected = xr.map_blocks(func, obj)
        actual = obj.map_blocks(func)

    assert_identical(expected.compute(), actual.compute())
Example #8
def test_map_blocks(obj):
    def func(obj):
        result = obj + obj.x + 5 * obj.y
        return result

    with raise_if_dask_computes():
        actual = xr.map_blocks(func, obj)
    expected = func(obj)
    assert_chunks_equal(expected.chunk(), actual)
    xr.testing.assert_identical(actual.compute(), expected.compute())
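
For readers without the test fixtures: `obj` in these tests is simply a small chunked DataArray with `x`/`y` coordinates, so a self-contained equivalent of the test is:

import numpy as np
import xarray as xr

obj = xr.DataArray(
    np.arange(12.0).reshape(3, 4),
    dims=("x", "y"),
    coords={"x": [10, 20, 30], "y": [1, 2, 3, 4]},
    name="demo",
).chunk({"x": 1})

def func(da):
    return da + da.x + 5 * da.y

actual = xr.map_blocks(func, obj)
expected = func(obj)
xr.testing.assert_identical(actual.compute(), expected.compute())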
Example #9
def test_map_blocks_change_name(map_da):
    def change_name(obj):
        obj = obj.copy(deep=True)
        obj.name = "new"
        return obj

    expected = change_name(map_da)
    with raise_if_dask_computes():
        actual = xr.map_blocks(change_name, map_da)

    xr.testing.assert_identical(actual.compute(), expected.compute())
Example #10
def test_map_blocks_add_attrs(obj):
    def add_attrs(obj):
        obj = obj.copy(deep=True)
        obj.attrs["new"] = "new"
        obj.cxy.attrs["new2"] = "new2"
        return obj

    expected = add_attrs(obj)
    with raise_if_dask_computes():
        actual = xr.map_blocks(add_attrs, obj)

    xr.testing.assert_identical(actual.compute(), expected.compute())
Example #11
    def fit(self, X, *args, **kwargs):
        """Fit the model

        Fit all the transforms one after the other and transform the
        data, then fit the transformed data using the final estimator.

        Parameters
        ----------
        X : xarray.DataArray or xarray.Dataset
            Training data. Must fulfill input requirements of first step of
            the pipeline. If an xarray.Dataset is passed, it will be converted
            to an array using `to_array()`.
        y : xarray.DataArray, optional
            Training targets. Must fulfill label requirements for all steps
            of the pipeline.
        feature_dim : str, optional
            Name of feature dimension.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of this model. If the
            model is a sklearn Pipeline, parameters can be passed to each
            step, where each parameter name is prefixed such that parameter
            ``p`` for step ``s`` has key ``s__p``.
        """
        kws = {'along_dim': self._dim, 'feature_dim': DEFAULT_FEATURE_DIM}
        kws.update(kwargs)

        assert len(args) <= 1
        args = list(args)
        args.append(self._model)

        X = self._to_feature_x(X, feature_dim=kws['feature_dim'])

        if X.chunks:
            reduce_dims = [self._dim, kws['feature_dim']]
            mask = _make_mask(X, reduce_dims)
            template = xr.full_like(mask, None, dtype=object)
            self._models = xr.map_blocks(_fit_wrapper, X, args=args, kwargs=kws, template=template)
        else:
            self._models = _fit_wrapper(X, *args, **kws)
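
`_make_mask` and `_fit_wrapper` are not shown here; the essential trick is the object-dtype template, which tells `map_blocks` that each block returns arbitrary Python objects (fitted estimators). A minimal, self-contained illustration of that pattern, with a made-up `fit_block` standing in for `_fit_wrapper`:

import numpy as np
import xarray as xr
from sklearn.linear_model import LinearRegression

X = xr.DataArray(
    np.random.rand(20, 2, 4, 4), dims=("time", "feature", "lat", "lon")
).chunk({"lat": 2, "lon": 2})

def fit_block(block):
    # fit one estimator per (lat, lon) cell of this block; the target here is a dummy
    out = np.empty((block.sizes["lat"], block.sizes["lon"]), dtype=object)
    for i in range(block.sizes["lat"]):
        for j in range(block.sizes["lon"]):
            cell = block.isel(lat=i, lon=j).transpose("time", "feature")
            out[i, j] = LinearRegression().fit(cell.values, cell.values[:, 0])
    return xr.DataArray(out, dims=("lat", "lon"))

# object-dtype template with the reduced dims and the same (lat, lon) chunking as X
template = xr.full_like(X.isel(time=0, feature=0, drop=True), None, dtype=object)
models = xr.map_blocks(fit_block, X, template=template)
print(models.compute().shape)  # a (4, 4) array of fitted LinearRegression objects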
Example #12
def test_map_blocks_error(map_da, map_ds):
    def bad_func(darray):
        return (darray * darray.x + 5 * darray.y)[:1, :1]

    with raises_regex(ValueError, "Length of the.* has changed."):
        xr.map_blocks(bad_func, map_da).compute()

    def returns_numpy(darray):
        return (darray * darray.x + 5 * darray.y).values

    with raises_regex(TypeError, "Function must return an xarray DataArray"):
        xr.map_blocks(returns_numpy, map_da)

    with raises_regex(TypeError, "args must be"):
        xr.map_blocks(operator.add, map_da, args=10)

    with raises_regex(TypeError, "kwargs must be"):
        xr.map_blocks(operator.add, map_da, args=[10], kwargs=[20])

    def really_bad_func(darray):
        raise ValueError("couldn't do anything.")

    with raises_regex(Exception, "Cannot infer"):
        xr.map_blocks(really_bad_func, map_da)

    ds_copy = map_ds.copy()
    ds_copy["cxy"] = ds_copy.cxy.chunk({"y": 10})

    with raises_regex(ValueError, "inconsistent chunks"):
        xr.map_blocks(bad_func, ds_copy)

    with raises_regex(TypeError, "Cannot pass dask collections"):
        xr.map_blocks(bad_func, map_da, args=[map_da.chunk()])

    with raises_regex(TypeError, "Cannot pass dask collections"):
        xr.map_blocks(bad_func, map_da, kwargs=dict(a=map_da.chunk()))
Example #13
    def generate_product(
        self,
        dc,
        path_prefix,
        aoi,
        output_projection,
        baseline_start_date,
        baseline_end_date,
        analysis_start_date,
        analysis_end_date,
        platform_base,
        platform_analysis,
        res,
        aoi_crs,
        **kwargs,
    ):

        ## Create datacube query

        dask_chunks = dict(time=10, x=500, y=500)

        query = create_base_query(aoi, res, output_projection, aoi_crs, dask_chunks)

        all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"]
        (
            baseline_product,
            baseline_measurement,
            baseline_water_product,
        ) = create_product_measurement(platform_base, all_measurements)
        (
            analysis_product,
            analysis_measurement,
            analysis_water_product,
        ) = create_product_measurement(platform_analysis, all_measurements)

        baseline_time_period = (baseline_start_date, baseline_end_date)
        analysis_time_period = (analysis_start_date, analysis_end_date)

        ## Create dask graph

        baseline_ds = dc.load(
            time=baseline_time_period,
            platform=platform_base,
            product=baseline_product,
            measurements=baseline_measurement,
            **query,
        )

        analysis_ds = dc.load(
            time=analysis_time_period,
            platform=platform_analysis,
            product=analysis_product,
            measurements=analysis_measurement,
            **query,
        )

        if is_dataset_empty(baseline_ds):
            raise Exception(
                "DataCube Load returned an empty Dataset. "
                "Please check load parameters for Baseline Dataset!"
            )

        if is_dataset_empty(analysis_ds):
            raise Exception(
                "DataCube Load returned an empty Dataset. "
                "Please check load parameters for Analysis Dataset!"
            )

        water_scenes_baseline = dc.load(
            product=baseline_water_product,
            measurements=["water_classification"],
            time=baseline_time_period,
            **query,
        )
        water_scenes_baseline = water_scenes_baseline.where(water_scenes_baseline >= 0)
        water_scenes_analysis = dc.load(
            product=analysis_water_product,
            measurements=["water_classification"],
            time=analysis_time_period,
            **query,
        )
        water_scenes_analysis = water_scenes_analysis.where(water_scenes_analysis >= 0)

        baseline_composite = geomedian(baseline_ds, baseline_product, all_measurements)
        analysis_composite = geomedian(analysis_ds, analysis_product, all_measurements)

        water_classes_base = water_scenes_baseline.where(water_scenes_baseline >= 0)
        water_classes_analysis = water_scenes_analysis.where(water_scenes_analysis >= 0)

        water_composite_base = water_classes_base.water_classification.mean(dim="time")
        water_composite_analysis = water_classes_analysis.water_classification.mean(
            dim="time"
        )

        baseline_composite = baseline_composite.rename(
            {"y": "latitude", "x": "longitude"}
        )
        water_composite_base = water_composite_base.rename(
            {"y": "latitude", "x": "longitude"}
        )
        analysis_composite = analysis_composite.rename(
            {"y": "latitude", "x": "longitude"}
        )
        water_composite_analysis = water_composite_analysis.rename(
            {"y": "latitude", "x": "longitude"}
        )

        # Spectral Parameter Anomaly

        parameter_baseline_composite = xr.map_blocks(
            frac_coverage_classify, baseline_composite, kwargs={"no_data": np.nan}
        )
        parameter_analysis_composite = xr.map_blocks(
            frac_coverage_classify, analysis_composite, kwargs={"no_data": np.nan}
        )

        frac_cov_baseline = parameter_baseline_composite.where(
            (water_composite_base <= 0.4) & (parameter_baseline_composite != -9999)
        )

        frac_cov_analysis = parameter_analysis_composite.where(
            (water_composite_analysis <= 0.4) & (parameter_analysis_composite != -9999)
        )
        parameter_anomaly = frac_cov_analysis - frac_cov_baseline

        ## Compute

        parameter_anomaly_output = parameter_anomaly.compute()

        ## Export products

        bs_output = parameter_anomaly_output.bs
        pv_output = parameter_anomaly_output.pv
        npv_output = parameter_anomaly_output.npv

        ## Write files

        result = []

        file_name = path.join(path_prefix, "land_change.tiff")
        import_export.export_xarray_to_geotiff(
            parameter_anomaly_output,
            file_name,
            crs=output_projection,
            x_coord="longitude",
            y_coord="latitude",
        )
        result.append(file_name)

        file_name = path.join(path_prefix, "bs_change.tiff")
        import_export.export_xarray_to_geotiff(
            bs_output,
            file_name,
            crs=output_projection,
            x_coord="longitude",
            y_coord="latitude",
        )
        result.append(file_name)

        file_name = path.join(path_prefix, "pv_change.tiff")
        import_export.export_xarray_to_geotiff(
            pv_output,
            file_name,
            crs=output_projection,
            x_coord="longitude",
            y_coord="latitude",
        )
        result.append(file_name)

        file_name = path.join(path_prefix, "npv_change.tiff")
        import_export.export_xarray_to_geotiff(
            npv_output,
            file_name,
            crs=output_projection,
            x_coord="longitude",
            y_coord="latitude",
        )
        result.append(file_name)

        return result
Example #14
    the_mean = calc_mean(the_array)
    the_perturb = the_array - the_mean
    return the_perturb


#
# save the perturbation and mean in separate dictionaries
#
vars = ['TABS', 'W', 'TR01']
perturb_dict_keys = ['temp_prime', 'w_prime', 'tr_prime']
mean_dict_keys = ['temp_mean', 'w_mean', 'tr_mean']
perturb_dict = {}
key_pairs = zip(perturb_dict_keys, vars)
for key, a_var in key_pairs:
    perturb_dict[key] = \
      xr.map_blocks(calc_perturb, zarr_slim_ds[a_var].chunk((tlim,zlim,ylim,xlim)))

mean_dict = {}
key_pairs = zip(mean_dict_keys, vars)
for key, a_var in key_pairs:
    mean_dict[key] = \
      xr.map_blocks(calc_mean, zarr_slim_ds[a_var].chunk((tlim,zlim,ylim,xlim)))
#
# now make a new dataset with these variables
#

new_ds = xr.Dataset(perturb_dict)
#
# and add the remaining means
#
for key, value in mean_dict.items():
Example #15
def main():

    #client = Client(n_workers=2, threads_per_worker=1, memory_limit="3GB")
    #client

    # +
    data_folder = "/home/hmetzler/Desktop/CARDAMOM/"  # local
    #data_folder = "/home/data/CARDAMOM/"  # matagorda

    filestem = "cardamom_for_holger_10_ensembles"
    #filestem = "cardamom_for_holger"

    #chunk_dict = {"ens": 20}
    chunk_dict = {"ens": 2}

    #filestem = "cardamom_for_holger"
    #chunk_dict = {"ens": 100}
    ds = xr.open_dataset(data_folder + filestem + ".nc")  #.isel(
    #    ens=slice(None, 6),
    #    time=slice(None, 5)
    #)
    ds = ds.chunk(chunk_dict)
    ds

    # -

    # there is no multi-dimensional 'groupby' in xarray data structures
    def nested_groupby_apply(dataset, groupby, apply_fn, **kwargs):
        if len(groupby) == 1:
            return dataset.groupby(groupby[0]).apply(apply_fn, **kwargs)
        else:
            return dataset.groupby(groupby[0]).apply(nested_groupby_apply,
                                                     groupby=groupby[1:],
                                                     apply_fn=apply_fn,
                                                     **kwargs)

    comp_dict = {'zlib': True, 'complevel': 9}

    # +
    # %%time

    # compute in parallel the model runs and save them to ds_mrs in netcdf format

    small_template = xr.Dataset(
        data_vars={
            'x':
            xr.DataArray(data=np.ndarray(dtype=float,
                                         shape=(len(ds.ens.data), )),
                         dims=['ens'])
        }).chunk(chunk_dict)

    def func(single_site_ds):
        res = CARDAMOMlib.compute_pwc_mr_fd_ds(single_site_ds)
        return res

    def func_chunk(chunk_ds):
        print('\nChunk start:', chunk_ds.ens.data[0], '\n')
        res = nested_groupby_apply(chunk_ds, ['ens', 'lat', 'lon'], func)

        filename = filestem + "_{:03d}-{:03d}.nc".format(
            res.ens.data[0], res.ens.data[-1])
        encoding = {var: comp_dict for var in res.data_vars}
        res.to_netcdf(filename, encoding=encoding)
        print(res)
        del res

        return xr.Dataset(data_vars={
            'x':
            xr.DataArray(data=np.zeros((chunk_dict['ens'], )), dims=['ens'])
        })

    _ = xr.map_blocks(func_chunk, ds, template=small_template).compute()
    # -

    ds.close()
    del ds
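
The `nested_groupby_apply` helper above works around the lack of a multi-dimensional groupby by recursing one dimension at a time. A tiny self-contained check of the same pattern, using `.map` (the non-deprecated spelling of `.apply`) and a trivial per-site function in place of `CARDAMOMlib.compute_pwc_mr_fd_ds`:

import numpy as np
import xarray as xr

def nested_groupby_apply(dataset, groupby, apply_fn, **kwargs):
    if len(groupby) == 1:
        return dataset.groupby(groupby[0]).map(apply_fn, **kwargs)
    return dataset.groupby(groupby[0]).map(
        nested_groupby_apply, groupby=groupby[1:], apply_fn=apply_fn, **kwargs)

toy = xr.Dataset(
    {"v": (("ens", "lat", "lon"), np.arange(8.0).reshape(2, 2, 2))},
    coords={"ens": [0, 1], "lat": [10.0, 20.0], "lon": [30.0, 40.0]})

doubled = nested_groupby_apply(toy, ["ens", "lat", "lon"], lambda site: 2 * site)
print(doubled.v.values)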
Example #16
def process_fractional_cover(
    dc,
    product,
    query_x_from,
    query_x_to,
    query_y_from,
    query_y_to,
    time_from,
    time_to,
    output_crs,
    query_crs="EPSG:4326",
    dask_time_chunk_size="10",
    dask_x_chunk_size="600",
    dask_y_chunk_size="600",
    **kwargs,
):
    nodata = -9999
    time = (time_from, time_to)

    data_bands = ["red", "green", "blue", "nir", "swir1", "swir2"]

    # Product here is a geomedian product
    if product.startswith("ls"):
        resolution = (-30, 30)
        water_product = product[:3] + "_water_classification"
    else:
        resolution = (-10, 10)
        # TODO: Change when S2 WOFS ready
        water_product = None
        return None

    query = {}

    query["output_crs"] = output_crs
    query["resolution"] = resolution
    query["dask_chunks"] = {
        "time": int(dask_time_chunk_size),
        "x": int(dask_x_chunk_size),
        "y": int(dask_y_chunk_size),
    }

    if query_crs != "EPSG:4326":
        query["crs"] = query_crs

    query["x"] = (float(query_x_from), float(query_x_to))
    query["y"] = (float(query_y_from), float(query_y_to))

    water_scenes = dc.load(product=water_product,
                           measurements=["water_classification"],
                           time=time,
                           **query)
    water_scenes = water_scenes.where(water_scenes >= 0)

    water_composite_mean = water_scenes.water_classification.mean(dim="time")

    land_composite = dc.load(product=product,
                             measurements=data_bands,
                             time=time,
                             **query)

    if len(land_composite.dims) == 0 or len(land_composite.data_vars) == 0:
        return None

    # Fractional Cover Classification

    frac_classes = xr.map_blocks(frac_coverage_classify,
                                 land_composite,
                                 kwargs={"no_data": nodata})

    # Mask to remove clouds, cloud shadow, and water.
    frac_cov_masked = frac_classes.where((frac_classes != nodata)
                                         & (water_composite_mean <= 0.4))

    ## Compute

    fractional_cover = frac_cov_masked.compute()

    return fractional_cover

Example #17
def func(single_site_ds):
    res = CARDAMOMlib.compute_pwc_mr_fd_ds(single_site_ds)
    return res


def func_chunk(chunk_ds):
    print('\nChunk start:', chunk_ds.ens.data[0], '\n')
    res = nested_groupby_apply(chunk_ds, ['ens', 'lat', 'lon'], func)

    filename = filestem + "_{:03d}-{:03d}.nc".format(res.ens.data[0],
                                                     res.ens.data[-1])
    encoding = {var: comp_dict for var in res.data_vars}
    res.to_netcdf(filename, encoding=encoding)
    print(res)
    del res
    gc.collect()

    return xr.Dataset(data_vars={
        'x':
        xr.DataArray(data=np.zeros((chunk_dict['ens'], )), dims=['ens'])
    })


_ = xr.map_blocks(func_chunk, ds, template=small_template).compute()
# -

ds.close()
del ds
Example #18
def test_map_blocks_to_array(map_ds):
    with raise_if_dask_computes():
        actual = xr.map_blocks(lambda x: x.to_array(), map_ds)

    # to_array does not preserve name, so cannot use assert_identical
    assert_equal(actual.compute(), map_ds.to_array().compute())
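
`map_ds` is another test fixture; the behaviour can be reproduced with any small chunked Dataset, e.g.:

import numpy as np
import xarray as xr

map_ds = xr.Dataset(
    {"a": (("x", "y"), np.ones((4, 4))), "b": (("x", "y"), np.zeros((4, 4)))}
).chunk({"x": 2})

stacked = xr.map_blocks(lambda ds: ds.to_array(), map_ds)
xr.testing.assert_equal(stacked.compute(), map_ds.to_array().compute())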
Example #19
    def generate_product(
        self,
        dc,
        path_prefix,
        aoi,
        output_projection,
        start_date,
        end_date,
        platform,
        res,
        aoi_crs,
        **kwargs,
    ):

        ## Create datacube query

        dask_chunks = dict(time=10, x=600, y=600)

        query = create_base_query(aoi, res, output_projection, aoi_crs, dask_chunks)

        all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"]
        product, measurement, water_product = create_product_measurement(
            platform, all_measurements
        )

        time = (start_date, end_date)

        ## Create dask graph

        ds = dc.load(
            time=time,
            platform=platform,
            product=product,
            measurements=measurement,
            **query,
        )

        if is_dataset_empty(ds):
            raise Exception(
                "DataCube Load returned an empty Dataset."
                + "Please check load parameters for Baseline Dataset!"
            )

        water_scenes = dc.load(
            product=water_product,
            measurements=["water_classification"],
            time=time,
            **query,
        )
        water_scenes = water_scenes.where(water_scenes >= 0)

        water_composite_mean = water_scenes.water_classification.mean(dim="time")
        water_composite_mean = water_composite_mean.rename(
            {"x": "longitude", "y": "latitude"}
        )

        land_composite = geomedian(ds, product, all_measurements)
        land_composite = land_composite.rename({"x": "longitude", "y": "latitude"})

        # Fractional Cover Classification

        frac_classes = xr.map_blocks(
            frac_coverage_classify, land_composite, kwargs={"no_data": np.nan}
        )

        # Mask to remove clouds, cloud shadow, and water.
        # (a != np.nan comparison is always True, so use notnull() to screen out no-data)
        frac_cov_masked = frac_classes.where(
            frac_classes.notnull() & (water_composite_mean <= 0.4)
        )

        ## Compute

        fractional_cover_output = frac_cov_masked.compute()

        ## Write file

        file_name = path.join(path_prefix, "fractional_cover.tiff")
        import_export.export_xarray_to_geotiff(
            fractional_cover_output,
            file_name,
            crs=output_projection,
            x_coord="longitude",
            y_coord="latitude",
        )

        return [file_name]
Example #20
# %%time

# compute in parallel the model runs and save them to ds_mrs in netcdf format


def func(single_site_ds):
    res = CARDAMOMlib.compute_pwc_mr_fd_ds(single_site_ds)
    return res


def func_chunk(chunk_ds):
    res = nested_groupby_apply(chunk_ds, ['ens', 'lat', 'lon'], func)
    return res


ds_mrs = xr.map_blocks(func_chunk, ds, template=ds_mr_template).compute()
#ds_mrs = xr.map_blocks(func_chunk, ds, template=ds).compute()

print(ds_mrs)
comp_dict = {'zlib': True, 'complevel': 9}
encoding = {var: comp_dict for var in ds_mrs.data_vars}
ds_mrs.to_netcdf(filestem + "_pwc_mrs_fd" + ".nc", encoding=encoding)
ds_mrs.close()

# +
# %%time

# compute Delta 14C values on entire grid (ens x lat x lon) (400 x 2 x 2)


def func(sub_ds):
Example #21
    #        flush=True
    #    )
    #    write_to_logfile(
    #        'chunk finished,',
    #        "lat:", chunk_ds.lat[0].data,
    #        "lon:", chunk_ds.lon[0].data,
    #        "prob:", chunk_ds.prob[0].data
    #    )

    return res_ds


# -

fake_ds = make_fake_ds(ds_sub).chunk(chunk_dict)
ds_pwc_mr_fd = xr.map_blocks(func_chunk, ds_sub, template=fake_ds)
fake_ds

ds_pwc_mr_fd

# +
# %%time

c = ds_sub.chunks
nr_chunks = np.prod([len(val) for val in c.values()])
nr_singles = len(ds_sub.lat) * len(ds_sub.lon) * len(ds_sub.prob)
write_to_logfile(
    'starting:',
    #    nr_chunks, "chunks, ",
    nr_singles,
    "singles")
Example #22
def test_map_blocks_convert_args_to_list(obj):
    expected = obj + 10
    with raise_if_dask_computes():
        actual = xr.map_blocks(operator.add, obj, [10])
    assert_chunks_equal(expected.chunk(), actual)
    xr.testing.assert_identical(actual.compute(), expected.compute())
Example #23
    def reduce(self, xx: xr.Dataset) -> xr.Dataset:
        template = xr.Dataset({m: xx.nbart_blue for m in self.measurements})
        return xr.map_blocks(nmask_pmod.summarise, xx, template=template)
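
`nmask_pmod.summarise` and `self.measurements` come from the surrounding project; the template idea, reusing one chunked variable's structure for every output measurement, can be reproduced in isolation (the measurement names and the trivial summarise function below are illustrative only):

import numpy as np
import xarray as xr

xx = xr.Dataset({"nbart_blue": (("y", "x"), np.zeros((4, 4)))}).chunk({"y": 2})
measurements = ["clear_count", "cloud_count"]

# one output variable per measurement, each with nbart_blue's dims, dtype and chunks
template = xr.Dataset({m: xx.nbart_blue for m in measurements})

def summarise(block: xr.Dataset) -> xr.Dataset:
    return xr.Dataset({m: block.nbart_blue * 0 for m in measurements})

summarised = xr.map_blocks(summarise, xx, template=template)
print(summarised.compute())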
Example #24
fake_coords = {
    'lat': ds_sub.lat.data,
    'lon': ds_sub.lon.data,
    'prob': ds_sub.prob.data
}

fake_ds = xr.Dataset(data_vars={
    'abs_err': fake_array,
    'rel_err': fake_array
},
                     coords=fake_coords).chunk(chunk_dict)
fake_ds

# +
# %%time

ds_data_consistency = xr.map_blocks(func_chunk, ds_sub, template=fake_ds)
# -

comp_dict = {'zlib': True, 'complevel': 9}
encoding = {var: comp_dict for var in ds_data_consistency.data_vars}
ds_data_consistency.to_netcdf(data_folder + filestem + output_folder +
                              "data_consistency.nc",
                              encoding=encoding,
                              compute=True)

ds.close()
del ds
ds_sub.close()
del ds_sub
ds_data_consistency.close()
del ds_data_consistency
Example #25
def bootstrap_func(compute_index_func: Callable, **kwargs) -> xr.DataArray:
    """Bootstrap the computation of percentile-based exceedance indices.

    Indices measuring exceedance over a percentile-based threshold may contain artificial discontinuities at the
    beginning and end of the reference period used for calculating the percentile. A bootstrap resampling
    procedure can reduce those discontinuities by iteratively removing the year the indice is computed on from
    the percentile estimate and replacing it with another year within the reference period.

    Parameters
    ----------
    compute_index_func : Callable
      Indice function.
    kwargs : dict
      Arguments to `compute_index_func`.

    Returns
    -------
    xr.DataArray
      The result of func with bootstrapping.

    References
    ----------
    Zhang, X., Hegerl, G., Zwiers, F. W., & Kenyon, J. (2005). Avoiding Inhomogeneity in Percentile-Based Indices of
    Temperature Extremes, Journal of Climate, 18(11), 1641-1651, https://doi.org/10.1175/JCLI3366.1

    Notes
    -----
    This function is meant to be used by the `percentile_bootstrap` decorator.
    The parameters of the percentile calculation (percentile, window, reference_period)
    are stored in the attributes of the percentile DataArray.
    The bootstrap algorithm implemented here does the following::

        For each temporal grouping in the calculation of the indice
            If the group `g_t` is in the reference period
                For every other group `g_s` in the reference period
                    Replace group `g_t` by `g_s`
                    Compute percentile on resampled time series
                    Compute indice function using percentile
                Average output from indice function over all resampled time series
            Else compute indice function using original percentile

    """
    # Identify the input and the percentile arrays from the bound arguments
    per_key = None
    for name, val in kwargs.items():
        if isinstance(val, DataArray):
            if "percentile_doy" in val.attrs.get("history", ""):
                per_key = name
            else:
                da_key = name
    # Extract the DataArray inputs from the arguments
    da: DataArray = kwargs.pop(da_key)
    per_da: Optional[DataArray] = kwargs.pop(per_key, None)
    if per_da is None:
        # per may be empty for non-doy percentiles
        raise KeyError(
            "`bootstrap` can only be used with percentiles computed using `percentile_doy`"
        )
    # Boundary years of reference period
    clim = per_da.attrs["climatology_bounds"]
    if xclim.core.utils.uses_dask(da) and len(
            da.chunks[da.get_axis_num("time")]) > 1:
        warnings.warn(
            "The input data is chunked on time dimension and must be fully re-chunked to"
            " run percentile bootstrapping."
            " Beware, this operation can significantly increase the number of tasks dask"
            " has to handle.",
            stacklevel=2,
        )
        chunking = {d: "auto" for d in da.dims}
        chunking["time"] = -1  # no chunking on time to use map_block
        da = da.chunk(chunking)
    # overlap of studied `da` and reference period used to compute percentile
    overlap_da = da.sel(time=slice(*clim))
    if len(overlap_da.time) == len(da.time):
        raise KeyError(
            "`bootstrap` is unnecessary when all years are overlapping between reference "
            "(percentiles period) and studied (index period) periods")
    if len(overlap_da) == 0:
        raise KeyError(
            "`bootstrap` is unnecessary when no year overlap between reference "
            "(percentiles period) and studied (index period) periods.")
    pdoy_args = dict(
        window=per_da.attrs["window"],
        alpha=per_da.attrs["alpha"],
        beta=per_da.attrs["beta"],
        per=per_da.percentiles.data[()],
    )
    bfreq = _get_bootstrap_freq(kwargs["freq"])
    # Group input array in years, with an offset matching freq
    overlap_years_groups = overlap_da.resample(time=bfreq).groups
    da_years_groups = da.resample(time=bfreq).groups
    per_template = per_da.copy(deep=True)
    acc = []
    # Compute bootstrapped index on each year of overlapping years
    for year_key, year_slice in da_years_groups.items():
        kw = {da_key: da.isel(time=year_slice), **kwargs}
        if _get_year_label(year_key) in overlap_da.get_index("time").year:
            # If the group year is in both reference and studied periods, run the bootstrap
            bda = build_bootstrap_year_da(overlap_da, overlap_years_groups,
                                          year_key)
            if BOOTSTRAP_DIM not in per_template.dims:
                per_template = per_template.expand_dims(
                    {BOOTSTRAP_DIM: np.arange(len(bda._bootstrap))})
                if xclim.core.utils.uses_dask(bda):
                    chunking = {
                        d: bda.chunks[bda.get_axis_num(d)]
                        for d in set(bda.dims).intersection(
                            set(per_template.dims))
                    }
                    per_template = per_template.chunk(chunking)
            per = xr.map_blocks(
                percentile_doy.
                __wrapped__,  # strip history update from percentile_doy
                obj=bda,
                kwargs={
                    **pdoy_args, "copy": False
                },
                template=per_template,
            )
            if "percentiles" not in per_da.dims:
                per = per.squeeze("percentiles")
            kw[per_key] = per
            value = compute_index_func(**kw).mean(dim=BOOTSTRAP_DIM,
                                                  keep_attrs=True)
        else:
            # Otherwise, run the normal computation using the original percentile
            kw[per_key] = per_da
            value = compute_index_func(**kw)
        acc.append(value)
    result = xr.concat(acc, dim="time")
    result.attrs["units"] = value.attrs["units"]
    return result
Example #26
    def generate_product(
        self,
        dc,
        path_prefix,
        aoi,
        output_projection,
        start_date,
        end_date,
        platform,
        res,
        aoi_crs,
        **kwargs,
    ):

        ## Create datacube query

        dask_chunks = dict(time=10, x=1000, y=1000)

        query = create_base_query(aoi, res, output_projection, aoi_crs,
                                  dask_chunks)

        all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"]
        product, measurement, water_product = create_product_measurement(
            platform, all_measurements)

        time = (start_date, end_date)

        ## Create dask graph

        ds = dc.load(
            time=time,
            platform=platform,
            product=product,
            measurements=measurement,
            **query,
        )

        if is_dataset_empty(ds):
            raise Exception(
                "DataCube Load returned an empty Dataset. "
                "Please check load parameters for Baseline Dataset!")

        water_scenes = dc.load(
            product=water_product,
            measurements=["water_classification"],
            time=time,
            **query,
        )

        # Set land to no_data
        water_dataset = water_scenes.where(water_scenes > 0)

        good_quality = mask_good_quality(ds, product)
        ds_clear = ds.where(good_quality)
        ds_clear_land = ds_clear.where(water_dataset.water_classification > 0)
        tsm_dataset = xr.map_blocks(tsm, ds_clear_land)

        mean_tsm = tsm_dataset.mean(dim=["time"])
        max_tsm = tsm_dataset.max(dim=["time"])
        min_tsm = tsm_dataset.min(dim=["time"])

        ## Compute

        mean_tsm, max_tsm, min_tsm = dask.compute(mean_tsm, max_tsm, min_tsm)

        ## Write files

        result = []

        file_name = path.join(path_prefix, "mean_tsm.tiff")
        import_export.export_xarray_to_geotiff(
            mean_tsm,
            file_name,
            crs=output_projection,
            x_coord="x",
            y_coord="y",
        )
        result.append(file_name)

        file_name = path.join(path_prefix, "min_tsm.tiff")
        import_export.export_xarray_to_geotiff(
            min_tsm,
            file_name,
            crs=output_projection,
            x_coord="x",
            y_coord="y",
        )
        result.append(file_name)

        file_name = path.join(path_prefix, "max_tsm.tiff")
        import_export.export_xarray_to_geotiff(
            max_tsm,
            file_name,
            crs=output_projection,
            x_coord="x",
            y_coord="y",
        )
        result.append(file_name)

        return result
Example #27
def test_map_blocks_ds_transformations(func, map_ds):
    with raise_if_dask_computes():
        actual = xr.map_blocks(func, map_ds)

    assert_identical(actual.compute(), func(map_ds).compute())
Example #28
def to_stations(da: Var,
                *,
                stations: pd.DataFrame,
                datastore: DataStore = None,
                chunks: dict = None,
                **kwargs) -> xr.DataArray:

    if isinstance(da, camps.Variable):
        da = da(datastore=datastore, chunks=chunks, **kwargs)
    elif isinstance(da, xr.DataArray):
        if chunks:
            da = da.camps.chunk(chunks)

    # Determine x and y
    # Use Projected crs as common crs for grid and station

    try:
        x = da.camps.projx.name
        y = da.camps.projy.name
    except KeyError:
        # projected coordinates do not exist
        if da.camps.grid_mapping:
            # try to make them from grid_mapping and lat/lons
            da = da.metpy.assign_crs(da.camps.grid_mapping)
            da = da.metpy.assign_y_x()
            da = da.drop('metpy_crs')
            x = da.camps.projx.name
            y = da.camps.projy.name
        else:
            # exception for mercator data without grid_mapping
            lat = da.camps.latitude
            if lat.ndim == 1:
                y = lat.name
            lon = da.camps.longitude
            if lon.ndim == 1:
                x = lon.name

            # ensure longitude is expressed as degrees east from the prime meridian within (-180, 180]
            lon_attrs = lon.attrs
            da[lon.name] = xr.where(lon > 180, lon - 360, lon)
            da[lon.name].attrs = lon_attrs

            # Add the lat lon grid mapping since it didn't have one
            # Latitude and longitude on the WGS 1984 datum
            lat_lon_wgs84 = {
                'grid_mapping_name': "latitude_longitude",
                'longitude_of_prime_meridian': 0.0,
                'semi_major_axis': 6378137.0,
                'inverse_flattening': 298.257223563
            }
            gm = xr.DataArray()
            gm.attrs.update(lat_lon_wgs84)
            da = da.assign_coords({'lat_lon_wgs84': gm})
            da.attrs['grid_mapping'] = 'lat_lon_wgs84'

    pyproj_crs = CFProjection(da.camps.grid_mapping).to_pyproj()
    stations['x'], stations['y'] = Proj(pyproj_crs)(stations.lon.values,
                                                    stations.lat.values)

    # rechunk so that multiple chunks don't span x and y dims
    if da.chunks is not None:
        da = da.chunk({x: -1, y: -1})
        da = da.unify_chunks()

    # load x,y data
    da[x].load()
    da[y].load()

    # make horizontal space 1-D by stacking x and y
    stacked = da.stack(xy=(x, y))
    gridxy = np.column_stack((stacked[x].data, stacked[y].data))

    stationxy = np.column_stack((stations.x, stations.y))

    tree = cKDTree(gridxy)  # fast nearest neighbor search algorithm
    dist_ix = tree.query(
        stationxy)  # find distance to nearest and index of nearest

    # make a grid polygon
    from shapely.geometry import Polygon, MultiPoint, Point
    unit_y = xr.DataArray(np.ones(da[y].shape), dims=da[y].dims)
    unit_x = xr.DataArray(np.ones(da[x].shape), dims=da[x].dims)
    edge_x = edge(da[x] * unit_y)  # broadcast constant x along the y dim
    edge_y = edge(unit_x * da[y])  # broadcast constant y along the x dim

    xy = zip(edge_x, edge_y)
    grid_polygon = Polygon(xy)

    # make station points
    xy = zip(stations.x.values, stations.y.values)
    station_points = MultiPoint(list(xy))

    # determine which stations lie outside grid domain (polygon)
    stations['point_str'] = pd.Series([str(p) for p in station_points])
    stations['ix'] = stations.index
    points_outside_grid = station_points - station_points.intersection(
        grid_polygon)
    if not points_outside_grid.is_empty:
        if isinstance(points_outside_grid, Point):
            points_outside_grid = [points_outside_grid]
        ix_stations_outside_grid = stations.set_index('point_str').loc[[
            str(p) for p in points_outside_grid
        ]].ix
    else:
        ix_stations_outside_grid = list()  # let be empty list

    print(len(ix_stations_outside_grid))

    def nearest_worker(da: xr.DataArray, *, x, y, ix, ix_nan) -> xr.DataArray:
        da = da.stack(station=(x,
                               y))  # squash the horizontal space dims into one

        da = da.isel(station=ix)
        da = da.drop_vars('station')  # remove station coord
        da.loc[{
            'station': ix_nan
        }] = np.nan  # use integer index location to set stations outside grid to missing

        return da

    # make template for expressing the change in shape in map_blocks
    template = da.copy()
    # reshape action
    template = template.stack(
        station=(x, y))  # combine the lat/lon dims into one dim called station
    template = template.isel(
        station=[0] *
        len(stations))  # repeat the first grid point so 'station' has one entry per station
    template = template.drop(
        'station'
    )  # drop the multiindex lat/lon coord associated with 'station' from the 0th grid point

    mb_kwargs = dict(x=x, y=y, ix=dist_ix[1], ix_nan=ix_stations_outside_grid)
    da = xr.map_blocks(nearest_worker, da, kwargs=mb_kwargs, template=template)

    # remove any metadata that may be leftover from the grid
    da = da.drop_vars([x, y], errors='ignore')

    # configure station metadata
    # prep station coord with numeric index called 'station'
    stations = stations.reset_index()
    stations.index.set_names('station', inplace=True)
    # assign the new coords
    da = da.assign_coords({'platform_id': stations.call})
    da.platform_id.attrs['standard_name'] = 'platform_id'

    # assign the new coords with numeric index
    da = da.assign_coords({'lat': stations.lat})
    da.lat.attrs['standard_name'] = 'latitude'
    da.lat.attrs['units'] = 'degrees_north'
    da = da.assign_coords({'lon': stations.lon})
    da.lon.attrs['standard_name'] = 'longitude'
    da.lon.attrs['units'] = 'degrees_east'
    # drop the numeric index;
    da = da.reset_index('station', drop=True)

    return da
Example #29

# +
chunk_dict = {"lat": 1, "lon": 1, "prob": 1}
ds_sub = ds.isel(lat=slice(0, None, 1),
                 lon=slice(0, None, 1),
                 prob=slice(0, 5, 1)).chunk(chunk_dict)

#ds_sub = ds.chunk(chunk_dict)
ds_sub

# +
# %%time

#ds_data_consistency = xr.map_blocks(func_chunk, ds_sub, template=fake_ds)
ds_data_consistency = xr.map_blocks(func_chunk, ds_sub, template=ds_sub).compute()
# -

comp_dict = {'zlib': True, 'complevel': 9}
encoding = {var: comp_dict for var in ds_data_consistency.data_vars}
ds_data_consistency.to_netcdf(data_folder + filestem + output_folder +
                              "data_consistency.nc",
                              encoding=encoding,
                              compute=True)

ds.close()
del ds
ds_sub.close()
del ds_sub
ds_data_consistency.close()
del ds_data_consistency
Example #30
def interp_to_isosurfaces(da: Var,
                          *,
                          level_data: Var,
                          isovalues: Union[Number, List[Number]],
                          datastore: DataStore = None,
                          chunks: dict = None,
                          **kwargs) -> xr.DataArray:

    if isinstance(da, camps.Variable):
        da = da(datastore=datastore, chunks=chunks, **kwargs)
    elif isinstance(da, xr.DataArray):
        if chunks:
            da = da.camps.chunk(chunks)

    if isinstance(level_data, camps.Variable):
        level_data = level_data(datastore=datastore, chunks=chunks, **kwargs)
    elif isinstance(level_data, xr.DataArray):
        if chunks:
            level_data = level_data.camps.chunk(chunks)

    # rechunk so that multiple chunks don't span z dim
    if da.chunks is not None:
        da = da.camps.chunk({'z': -1})

    if da.coords.keys() != level_data.coords.keys():
        raise ValueError('data and level variable coords do not match')

    # detach and re-attach non-NUG coords
    z = da.camps.z.name
    saved_coords = dict()
    for coord in da.coords:
        if len(da[coord].dims) > 1:
            if z in da[coord].dims:
                raise ValueError(
                    'non-NUG coords spanning z axis are not allowed')
            saved_coords[coord] = da[coord]
            da = da.drop(coord)
            level_data = level_data.drop(coord)

    # prep for computation via map_blocks
    kwargs['isovalues'] = isovalues

    # inputs prepped as dataset as map_blocks does not take multiple chunked arrays
    ds = xr.Dataset()
    ds['variable'] = da
    ds['level_variable'] = level_data

    # prep output template (the output will have this meta structure)
    template = interp_to_isosurface_meta_template(ds, isovalues)

    # horizontal dims will be either x,y grid or stations, for now don't worry about handling stations
    # rename input z axis dim name to output z axis dim name (determined by meta template; only z is touched)
    x = da.camps.x.name
    y = da.camps.y.name
    z_in = da.camps.z.name
    z_final = template.camps.z.name
    ds = ds.rename({z_in: z_final})
    kwargs['space_dims'] = [z_final, x, y]

    # perform work on each chunked block individually with the interp_to_isosurface_worker
    #da = xr.map_blocks(interp_to_isosurface_worker, ds, kwargs=kwargs, template=template)
    da = xr.map_blocks(interp_to_isosurface_worker,
                       ds,
                       kwargs=kwargs,
                       template=template)

    # re-assign coords that spanned more than one dimension
    da = da.assign_coords(saved_coords)

    return da