Example #1
def add_uniform_time_weights(ds):
    """Append uniform time weights to a Dataset.

    All DataArrays with a time coordinate require a time weights coordinate.
    For Datasets read in without a time bounds coordinate or explicit
    time weights built in, aospy adds uniform time weights at each point
    in the time coordinate.

    Parameters
    ----------
    ds : Dataset
        Input data

    Returns
    -------
    Dataset
    """
    time = ds[TIME_STR]
    unit_interval = time.attrs['units'].split('since')[0].strip()
    time_weights = xr.ones_like(time)
    time_weights.attrs['units'] = unit_interval
    del time_weights.attrs['calendar']
    ds[TIME_WEIGHTS_STR] = time_weights
    return ds
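# A minimal usage sketch (not part of the original function), assuming the
# module constants are TIME_STR = "time" and TIME_WEIGHTS_STR = "time_weights".

import numpy as np
import xarray as xr

TIME_STR = "time"
TIME_WEIGHTS_STR = "time_weights"

time = xr.DataArray(
    np.arange(3), dims=[TIME_STR],
    attrs={"units": "days since 2000-01-01", "calendar": "noleap"},
)
ds = xr.Dataset({"t_surf": (TIME_STR, [280.0, 281.0, 282.0])},
                coords={TIME_STR: time})
ds = add_uniform_time_weights(ds)
print(ds[TIME_WEIGHTS_STR].attrs["units"])  # "days"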
Example #2
def _test_data(grid_label="gn", z_axis=True):
    xt = np.arange(4) + 1
    yt = np.arange(5) + 1
    zt = np.arange(6) + 1

    x = xr.DataArray(xt, coords=[("x", xt)])
    y = xr.DataArray(yt, coords=[("y", yt)])
    lev = xr.DataArray(zt, coords=[("lev", zt)])

    # Need to add a tracer here to get the tracer dimsuffix
    coords = [("x", x.data), ("y", y.data)]
    data = np.random.rand(len(xt), len(yt))
    dims = ["x", "y"]

    if z_axis:
        coords.append(("lev", lev.data))
        data = np.random.rand(len(x), len(y), len(lev))
        dims = ["x", "y", "lev"]

    tr = xr.DataArray(
        data,
        dims=dims,
        coords=coords,
    )

    lon_raw = xr.DataArray(xt, coords=[("x", xt)])
    lat_raw = xr.DataArray(yt, coords=[("y", yt)])
    lon = lon_raw * xr.ones_like(lat_raw)
    lat = xr.ones_like(lon_raw) * lat_raw

    lon_bounds_e = lon + 0.5
    lon_bounds_w = lon - 0.5 + (np.random.rand(*lon.shape) * 0.05)
    lat_bounds_n = lat + 0.5 + (np.random.rand(*lon.shape) * 0.05)
    lat_bounds_s = lat - 0.5 + (np.random.rand(*lon.shape) * 0.05)

    lon_bounds = xr.concat(
        [_add_small_rand(lon_bounds_w),
         _add_small_rand(lon_bounds_e)],
        dim="bnds")
    lat_bounds = xr.concat(
        [_add_small_rand(lat_bounds_s),
         _add_small_rand(lat_bounds_n)],
        dim="bnds")

    if z_axis:
        lev_bounds = xr.concat(
            [_add_small_rand(lev - 0.5),
             _add_small_rand(lev + 0.5)],
            dim="bnds")

    lon_verticies = xr.concat(
        [
            _add_small_rand(lon_bounds_e),
            _add_small_rand(lon_bounds_e),
            _add_small_rand(lon_bounds_w),
            _add_small_rand(lon_bounds_w),
        ],
        dim="vertex",
    )
    lat_verticies = xr.concat(
        [
            _add_small_rand(lat_bounds_s),
            _add_small_rand(lat_bounds_n),
            _add_small_rand(lat_bounds_n),
            _add_small_rand(lat_bounds_s),
        ],
        dim="vertex",
    )

    ds = xr.Dataset({"base": tr})

    dataset_coords = dict(
        lon=lon,
        lat=lat,
        lon_bounds=lon_bounds,
        lat_bounds=lat_bounds,
        lon_verticies=lon_verticies,
        lat_verticies=lat_verticies,
    )

    if z_axis:
        dataset_coords["lev_bounds"] = lev_bounds

    ds = ds.assign_coords(dataset_coords)
    ds.attrs["source_id"] = "test_model"
    ds.attrs["grid_label"] = grid_label
    ds.attrs["variable_id"] = "base"
    return ds
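# A short usage sketch for the fixture above (it also depends on the
# `_add_small_rand` helper defined in the same test module).

ds = _test_data(grid_label="gn", z_axis=True)
print(ds.lon.dims)            # ("x", "y"): broadcast 2-D longitude
print(ds.lon_verticies.dims)  # ("vertex", "x", "y")
print(ds.attrs["source_id"])  # "test_model"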
Example #3
    def run(self):
        # for `num_cells` operation the field shouldn't be given because the
        # number of cells is just computed from the mask
        inputs = self.input()

        da_objects = inputs["objects"].open()
        if self.op != "num_cells":
            da_field = inputs["field"].open().squeeze()
        else:
            if self.field_name is not None:
                raise Exception(
                    f"Field name should not be given when computing `{self.op}`"
                    f" (`{self.field_name}` was provided)"
                )
            da_field = None

        object_ids = np.unique(da_objects.chunk(None).values)
        if object_ids[0] == 0:
            object_ids = object_ids[1:]

        kwargs = dict(
            objects=da_objects.name,
            object_ids=object_ids,
            op=self.op,
        )

        if self.op != "num_cells":
            kwargs["scalar"] = da_field.name

        da_objects_ = da_objects.sel(zt=self.z).compute()
        if self.op != "num_cells":
            da_ = da_field.sel(zt=self.z).compute()
        else:
            da_ = xr.ones_like(da_objects_)

        # to avoid the confusion where the "area" is requested but what in fact
        # is returned is the "number of cells" (which is dimensionless) we
        # enforce here that the "area" cannot be calculated, but instead
        # "num_cells" can be requested and we use the `area` dask-image op
        # (which returns the number of cells)
        op = kwargs["op"]
        if op == "area":
            raise Exception(
                "Shouldn't ask for `area` as it asctually the number of cells"
            )
        elif op == "num_cells":
            op = "area"

        fn = getattr(dask_image.ndmeasure, op)
        v = fn(da_, label_image=da_objects_, index=object_ids).compute()
        da = xr.DataArray(data=v, dims=["object_id"], coords=dict(object_id=object_ids))
        if self.op != "num_cells":
            da.name = "{}__{}".format(da_.name, kwargs["op"])
            da.attrs["units"] = da_.units
            da.attrs["long_name"] = "{} of {} per object".format(
                kwargs["op"],
                da_.long_name,
            )
        else:
            da.name = "num_cells"
            da.attrs["units"] = "1"
            da.attrs["long_name"] = "num_cells per object"

        da.coords["zt"] = self.z
        da.coords["time"] = da_objects_.time

        da.to_netcdf(self.output().fn)
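# The `num_cells` path above leans on `dask_image.ndmeasure.area`, which counts
# cells per label. A self-contained sketch of that reduction on toy data
# (the labels and field below are illustrative only):

import numpy as np
import dask.array as darr
import dask_image.ndmeasure

labels = darr.from_array(np.array([[0, 1, 1], [2, 2, 0]]), chunks=2)
field = darr.ones_like(labels, dtype=float)
# count the cells belonging to labels 1 and 2 (two cells each here)
print(dask_image.ndmeasure.area(field, label_image=labels, index=[1, 2]).compute())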
Example #4
    def test_ds_to_np(
        self,
        tmp_path,
        normalize,
        to_tensor,
        experiment,
        surrounding_pixels,
        predict_delta,
    ):
        x_pred, _, _ = _make_dataset(size=(5, 5), const=True)
        x_coeff1, _, _ = _make_dataset(size=(5, 5), variable_name="precip")
        x_coeff2, _, _ = _make_dataset(size=(5, 5),
                                       variable_name="soil_moisture")
        x_coeff3, _, _ = _make_dataset(size=(5, 5), variable_name="temp")

        x = xr.merge([x_pred, x_coeff1, x_coeff2, x_coeff3])
        y = x_pred.isel(time=[0])

        data_dir = tmp_path / experiment / "1980_1"
        if not data_dir.exists():
            data_dir.mkdir(parents=True, exist_ok=True)

        x.to_netcdf(data_dir / "x.nc")
        y.to_netcdf(data_dir / "y.nc")

        norm_dict = {}
        for var in x.data_vars:
            norm_dict[var] = {
                "mean":
                float(x[var].mean(dim=["lat", "lon", "time"],
                                  skipna=True).values),
                # we clip the std because, since constant=True, the std=0 for VHI,
                # giving NaNs which mess the tests up
                "std":
                float(
                    np.clip(
                        a=x[var].std(dim=["lat", "lon", "time"],
                                     skipna=True).values,
                        a_min=1,
                        a_max=None,
                    )),
            }

        # build static data
        static1 = x.mean(dim="time").rename(
            {v: f"{v}_pixel_mean"
             for v in x.data_vars})
        ones = xr.ones_like(x.mean(dim="time"))[[v for v in x.data_vars][0]]
        static2 = x.mean(dim=["lat", "lon", "time"]).rename(
            {v: f"{v}_global_mean"
             for v in x.data_vars})
        static2 = static2 * ones
        static_ds = xr.auto_combine([static1, static2])

        class MockLoader:
            def __init__(self):
                self.batch_file_size = None
                self.mode = None
                self.shuffle = None
                self.clear_nans = None
                self.data_files = []
                self.normalizing_dict = norm_dict if normalize else None
                self.to_tensor = None
                self.experiment = experiment
                self.surrounding_pixels = surrounding_pixels
                self.predict_delta = predict_delta
                self.ignore_vars = ["precip"]
                self.monthly_aggs = False
                self.device = torch.device("cpu")
                self.incl_yearly_aggs = False
                self.static = static_ds
                self.spatial_mask = None
                self.static_normalizing_dict = None
                self.normalize_y = normalize

        base_iterator = _BaseIter(MockLoader())

        arrays = base_iterator.ds_folder_to_np(data_dir, to_tensor=to_tensor)
        x_train_data, y_np, latlons = (arrays.x, arrays.y, arrays.latlons)

        # ----------------------
        # Test the static data
        # ----------------------
        # check the first 4 static features are constant across pixels
        assert all([
            all(arrays.x.static[:, i][1:] == arrays.x.static[:, i][:-1])
            for i in range(4)
        ])
        if not predict_delta:
            # check the next 2 static features vary across pixels (pixel means)
            assert all([
                all(arrays.x.static[:, i][1:] != arrays.x.static[:, i][:-1])
                for i in range(4, 6)
            ]), (f"static data: \n[,4]\n: {arrays.x.static[:, 4][1:]}\n[,5]"
                 f"\n: {arrays.x.static[:, 5][1:]}")

        n_samples = 25 if surrounding_pixels is None else 9
        assert (arrays.x.static.shape[0] == n_samples
                ), f"Expect {n_samples} samples because ..."

        assert (
            arrays.x.static.shape[-1] == 6
        ), "Expect 6 static features because ignore 'precip' variables in the static data"

        # ----------------------
        # Test the TrainData
        # ----------------------
        assert isinstance(x_train_data, TrainData)
        if not to_tensor:
            assert isinstance(y_np, np.ndarray)

        expected_features = 3 if surrounding_pixels is None else 3 * 9
        assert x_train_data.historical.shape[-1] == expected_features, (
            f"There should be {expected_features} historical features "
            f"(the final dimension): {x_train_data.historical.shape}")

        if experiment == "nowcast":
            expected_shape = (25, 2) if surrounding_pixels is None else (9,
                                                                         2 * 9)
            assert x_train_data.current.shape == expected_shape, (
                f"Expecting multiple vars in the current timestep. "
                f"Expect: {expected_shape} Got: {x_train_data.current.shape}")

        expected_latlons = 25 if surrounding_pixels is None else 9

        assert latlons.shape == (expected_latlons, 2), (
            "The shape of latlons should not change. "
            f"Got: {latlons.shape}. Expecting: ({expected_latlons}, 2)")
        assert x_train_data.latlons.shape == (expected_latlons, 2), (
            "The shape of latlons should not change. "
            f"Got: {x_train_data.latlons.shape}. Expecting: ({expected_latlons}, 2)")

        if normalize and (experiment == "nowcast") and (not to_tensor):
            assert x_train_data.current.max() < 6, (
                f"The current data should be"
                f" normalized. Currently: {x_train_data.current.flatten()}")

        if to_tensor:
            assert (type(x_train_data.historical)
                    == torch.Tensor) and (type(y_np) == torch.Tensor)
        else:
            assert (type(x_train_data.historical)
                    == np.ndarray) and (type(y_np) == np.ndarray)

        if (not normalize) and (experiment == "nowcast") and (not to_tensor):
            assert x_train_data.historical.shape[
                0] == x_train_data.current.shape[0], (
                    "The 0th dimension (latlons) should be equal in the "
                    f"historical ({x_train_data.historical.shape[0]}) and "
                    f"current ({x_train_data.current.shape[0]}) arrays.")

            expected = (x[["soil_moisture", "temp"]].sel(time=y.time).stack(
                dims=["lat", "lon"]).to_array().values.T[:, 0, :])
            got = x_train_data.current

            if surrounding_pixels is None:
                assert expected.shape == got.shape, (
                    "should have stacked latlon"
                    " vars as the first dimension in the current array.")

                assert (expected == got).all(), (
                    "Expected to find the non-target variable values for the "
                    "target timestep "
                    f"({pd.to_datetime(y.time.values).strftime('%Y-%m-%d')[0]}). "
                    f"Expected: {expected[:5]}. \nGot: {got[:5]}")

        for idx in range(latlons.shape[0]):
            lat, lon = latlons[idx, 0], latlons[idx, 1]
            for time in range(x_train_data.historical.shape[1]):
                target = x.isel(time=time).sel(lat=lat).sel(lon=lon).VHI.values

                if (not normalize) and (not to_tensor):
                    assert target == x_train_data.historical[idx, time, 0], (
                        "Got different x values for time idx:"
                        f"{time}, lat: {lat}, lon: {lon} Expected {target}, "
                        f"got {x_train_data.historical[idx, time, 0]}")

        if ((not normalize) and (experiment == "nowcast")
                and (surrounding_pixels is None)):
            # test that we are getting the right `current` data
            relevant_features = ["soil_moisture", "temp"]
            target_time = y.time
            expected = (
                x[relevant_features]  # all vars except target_var and the ignored var
                .sel(time=target_time)  # select the target_time
                .stack(dims=[
                    "lat", "lon"
                ])  # stack lat,lon so shape = (lat*lon, time, dims)
                .to_array().values[:, 0, :].
                T  # extract numpy array, transpose and drop dim
            )

            assert np.all(x_train_data.current == expected), (
                "Expected to find the target_time data for the "
                "non-target variables")

        if x_train_data.yearly_aggs is not None:
            # n_variables should be 3 because `ignoring` precip
            assert x_train_data.yearly_aggs.shape[1] == 3

        if (not normalize) and (not to_tensor):
            mean_temp = x_coeff3.temp.mean(dim=["time", "lat", "lon"]).values
            if x_train_data.yearly_aggs is not None:
                assert (mean_temp == x_train_data.yearly_aggs).any()

        if predict_delta:
            assert (y_np == 0).all(
            ), "The derivatives should be 0 for a constant input."
            assert (base_iterator.predict_delta
                    ), "should have set predict_delta to True"
Example #5
def arct_connect(ds, varName, all_faces):
    arc_cap = 6
    Nx_ac_nrot = []
    Ny_ac_nrot = []
    Nx_ac_rot = []
    Ny_ac_rot = []
    ARCT = []
    arc_faces = []
    metrics = ["dxC", "dyC", "dxG", "dyG"]

    if arc_cap in all_faces:
        for k in all_faces:
            if k == 2:
                fac = 1
                arc_faces.append(k)
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                dtr = list(dims)[::-1]
                dtr[-1], dtr[-2] = dtr[-2], dtr[-1]
                mask2 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                # TODO: Eval where, define argument outside
                mask2 = mask2.where(
                    _np.logical_and(
                        ds[dims.X] < ds[dims.Y],
                        ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = 0, int(len(ds[dims.Y]) / 2)  # TODO: CHECK here!
                y0, yf = 0, int(len(ds[dims.X]))
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_nrot.append(0)
                Ny_ac_nrot.append(len(ds[dims.Y][y0:yf]))
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                sort_arg = {"variables": dims.Y, "ascending": False}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                if len(dims.X) + len(dims.Y) == 4:
                    if len(dims.Y) == 1 and _varName not in metrics:
                        fac = -1
                    if "mates" in list(ds[_varName].attrs):
                        _varName = ds[_varName].attrs["mates"]
                    _DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                    dims = Dims(_DIMS[::-1])
                    dtr = list(dims)[::-1]
                    dtr[-1], dtr[-2] = dtr[-2], dtr[-1]
                    mask2 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                    mask2 = mask2.where(
                        _np.logical_and(
                            ds[dims.X] < ds[dims.Y],
                            ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y],
                        ))
                    da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                    sort_arg = {"variables": dims.Y, "ascending": False}
                    mask_arg = {dims.X: xslice, dims.Y: yslice}
                arct = fac * ds[_varName].isel(**da_arg)
                arct = arct.sortby(**sort_arg)
                Mask = mask2.isel(**mask_arg)
                Mask = Mask.sortby(**sort_arg)
                arct = (arct * Mask).transpose(*dtr)
                ARCT.append(arct)

            elif k == 5:
                fac = 1
                arc_faces.append(k)
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                mask5 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                mask5 = mask5.where(
                    _np.logical_and(
                        ds[dims.X] > ds[dims.Y],
                        ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = 0, int(len(ds[dims.X]))
                y0, yf = 0, int(len(ds[dims.Y]) / 2)
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_nrot.append(0)
                Ny_ac_nrot.append(len(ds[dims.X][y0:yf]))
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                arct = ds[_varName].isel(**da_arg)
                Mask = mask5.isel(**mask_arg)
                arct = arct * Mask
                ARCT.append(arct)

            elif k == 7:
                fac = 1
                arc_faces.append(k)
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                dtr = list(dims)[::-1]
                dtr[-1], dtr[-2] = dtr[-2], dtr[-1]
                mask7 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                mask7 = mask7.where(
                    _np.logical_and(
                        ds[dims.X] > ds[dims.Y],
                        ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y]))
                y0, yf = 0, int(len(ds[dims.X]))
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_rot.append(len(ds[dims.Y][x0:xf]))
                Ny_ac_rot.append(0)
                if len(dims.X) + len(dims.Y) == 4:
                    if len(dims.X) == 1 and _varName not in metrics:
                        fac = -1
                    if "mates" in list(ds[varName].attrs):
                        _varName = ds[varName].attrs["mates"]
                    DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                    dims = Dims(DIMS[::-1])
                    dtr = list(dims)[::-1]
                    dtr[-1], dtr[-2] = dtr[-2], dtr[-1]
                    mask7 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                    mask7 = mask7.where(
                        _np.logical_and(
                            ds[dims.X] > ds[dims.Y],
                            ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y],
                        ))
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                arct = fac * ds[_varName].isel(**da_arg)
                Mask = mask7.isel(**mask_arg)
                arct = (arct * Mask).transpose(*dtr)
                ARCT.append(arct)

            elif k == 10:
                fac = 1
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                arc_faces.append(k)
                mask10 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                mask10 = mask10.where(
                    _np.logical_and(
                        ds[dims.X] < ds[dims.Y],
                        ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = 0, int(len(ds[dims.X]))
                y0, yf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y]))
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_rot.append(0)
                Ny_ac_rot.append(len(ds[dims.Y][y0:yf]))
                if len(dims.X) + len(dims.Y) == 4:
                    if _varName not in metrics:
                        fac = -1
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                sort_arg = {"variables": [dims.X], "ascending": False}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                arct = fac * ds[_varName].isel(**da_arg)
                arct = arct.sortby(**sort_arg)
                Mask = mask10.isel(**mask_arg)
                Mask = Mask.sortby(**sort_arg)
                arct = arct * Mask
                ARCT.append(arct)

    return arc_faces, Nx_ac_nrot, Ny_ac_nrot, Nx_ac_rot, Ny_ac_rot, ARCT
Example #6
df.name = name
df = df.reset_index().rename(columns={0: "year", "level_1": "month"})

# create datetime index
df["time"] = df.apply(lambda x: pd.to_datetime(f"{x.year}-{x.month}"), axis=1)
df = df.set_index("time").drop(columns=["year", "month"])

# replace missing data
df = df.astype({name: float}).replace(-99.99, np.nan)

# resample to month end (same as other data)
df = df.resample("M").first()

# -----------------
# save to xarray / .nc
# -----------------
data_dir = Path("data")
vci = xr.open_dataset(
    data_dir / "interim/boku_ndvi_1000_preprocessed/data_kenya.nc")["boku_VCI"]

# for each MONTH TIMESTEP multiply by the nino value
nino_xr = xr.ones_like(vci)
nino_ts = df.loc[nino_xr.time.values]
nino_xr = nino_xr * pd.DataFrame.to_xarray(nino_ts)

if not (data_dir / "analysis/sst").exists():
    (data_dir / "analysis/sst").mkdir(parents=True, exist_ok=True)

# save to netcdf
nino_xr.to_netcdf(data_dir / f"analysis/sst/data_{name}.nc")
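# The `xr.ones_like(vci) * <monthly series>` step above is the usual trick for
# tiling a 1-D time series over a lat/lon grid; a minimal, self-contained
# illustration of the same pattern:

import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range("2000-01-31", periods=3, freq="M")
field = xr.DataArray(np.random.rand(3, 2, 2), dims=["time", "lat", "lon"],
                     coords={"time": times})
series = xr.DataArray([0.1, -0.2, 0.3], dims=["time"], coords={"time": times})
tiled = xr.ones_like(field) * series  # same value at every grid cell per month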
Example #7
def simulate_and_regress(
    pop,
    no_policy_growth_rate,
    p_effects,
    p_lags,
    p_start_interval,
    n_days,
    tsteps_per_day,
    n_samples,
    LHS_vars,
    reg_lag_days,
    gamma_to_test,
    min_cases,
    sigma_to_test=[np.nan],
    measurement_noise_on=False,
    measurement_noise_sd=0,
    beta_noise_on=False,
    beta_noise_sd=0,
    sigma_noise_on=False,
    sigma_noise_sd=0,
    gamma_noise_on=False,
    gamma_noise_sd=0,
    kind="SEIR",
    E0=1,
    I0=0,
    R0=0,
    random_end=False,
    ordered_policies=True,
    save_dir=None,
):
    """Full wrapper to run Monte Carlo simulations of a disease outbreak using SEIR or
    SIR dynamics for a number of parameter sets.
    
    Parameters
    ----------
    pop : int
        Population to use for these sets of simulation
    no_policy_growth_rate : float
        Continuous (asymptotic if SEIR) daily growth rate of infections given no policy, 
        prior to adding noise.
    p_effects : list of float
        Magnitude of the effect of each policy being applied in the simulations to the
        (asymptotic) continuous growth rate
    p_lags : list of list of float
        Lagged policy effect in *dynamic simulation*. This occurs if you assume behavior
        takes some time to respond to a policy, and thus the effect of the policy on
        $\beta$ is delayed. The outer dimension of this list refers to the different
        policies in the simulation. The inner is a list of floats between 0 and 1, with
        each element corresponding to one day after the policy. For instance, the value
        ``[[.5,.75],[]]`` means that there are 2 policies. The first has .5 of its total
        effect (as indicated in `p_effects`) occurring the day the policy is implemented,
        .75 the day after, and then reaches its full effect on $\beta$ on the third day.
        The second policy has an immediate, full effect on $\beta$
    p_start_interval : list of int
        ``[start_date, end_date]`` providing the bounds within which each policy is
        allowed to begin. The start date is drawn uniformly at random within this
        interval for each Monte Carlo draw.
    n_days : int
        How many days to run in the simulation
    tsteps_per_day : int
    n_samples : int
        Number of MC draws per parameter set being simulated. In each draw, there will 
        be different noise added to parameters and measurements, different starting
        days for policies, and/or different end days for the data used in regression
    LHS_vars : list of str
        The left hand side variables you want to use to estimate regressions of impact
        of policy on outbreak growth rate. These are in "SEIR" terminology, i.e. ``I``
        is active infectious cases. Sums are accomplished by concatenating letters (e.g.
        ``IR`` is active infectious cases + recovered/dead cases).
    reg_lag_days : list of int, optional
        Lags to include in the regression model for each policy (in days). [0] means no 
        lags.
    [gamma,sigma]_to_test : list of float
        Rate parameters to test. Each one will get `n_samples` MC draws. These are
        defined as continuous daily rates
    min_cases : int
        Minimum cumulative cases that are needed before the beginning of the timeseries
        that gets used in regression. However, when this is violated, the regression 
        simply begins 2 days before the first policy. If this happens in many MC draws,
        this will bias the estimate of no-policy growth rate more than we would see in
        the regressions on actual data, because it will be further from its asymptotic
        steady-state growth rate.
    [measurement,beta,gamma,sigma]_noise_on : "exponential", "normal", or False
        What type of noise to apply to each parameter during the dynamic simulation 
        and/or to the daily measurements of log-difference (just prior to regression). 
        False means no noise
    [measurement,beta,gamma,sigma]_noise_sd : float
        Standard deviation of noise to apply. Only used if 
        ``[varname]_noise_on=="normal"``
    kind : "SIR" or "SEIR"
    E0,I0,R0 : int, optional
        Initial conditions (in number of people). Default is ``E0=1`` and all others 0.
    random_end : bool, optional
        Whether to allow each regression to end at the end of the data sample (False) 
        or to randomly cut off the end of the timeseries at some day between the day 
        after the start of the last policy and the end of the sample. Default False.
    ordered_policies : bool, optional
        Whether you want the first policy to always be enacted before the second, which 
        is enacted before the third, etc. Default is yes.
    save_dir : str or :class:`pathlib.Path`
        The directory to save results
        
    Returns
    -------
    daily_ds : :class:`xarray.Dataset`
        A dataset with all relevant information from each MC draw, both dynamically 
        simulated states and regression outputs.
    """

    attrs = dict(
        E0=E0,
        I0=I0,
        R0=R0,
        pop=pop,
        min_cases=min_cases,
        measurement_noise_on=str(measurement_noise_on),
        beta_noise_on=str(beta_noise_on),
        gamma_noise_on=str(gamma_noise_on),
        measurement_noise_sd=measurement_noise_sd,
        beta_noise_sd=beta_noise_sd,
        gamma_noise_sd=gamma_noise_sd,
        no_policy_growth_rate=no_policy_growth_rate,
        tsteps_per_day=tsteps_per_day,
        p_effects=p_effects,
    )

    if save_dir is not None:
        save_dir = Path(save_dir)

    E0 = E0 / pop
    I0 = I0 / pop
    R0 = R0 / pop

    ## setup
    if kind == "SEIR":
        attrs["sigma_noise_on"] = str(sigma_noise_on)
        attrs["sigma_noise_sd"] = sigma_noise_sd
        sim_engine = run_SEIR
        get_beta = get_beta_SEIR
        ics = [E0, I0, R0]
    elif kind == "SIR":
        sigma_to_test = [np.nan]
        sigma_noise_on = False
        sim_engine = run_SIR
        get_beta = get_beta_SIR
        ics = [I0, R0]
        LHS_vars = [l for l in LHS_vars if "E" not in l]

    # get time vector
    ttotal = n_days * tsteps_per_day + 1
    t = np.linspace(0, 1, ttotal) * n_days

    # store policy info
    policies = xr.Dataset(
        coords={
            "policy": [f"p{i+1}" for i in range(len(p_effects))],
            "time": ["start", "end"],
            "lag_num": range(len(p_lags[0])),
        },
        data_vars={
            "effect": (("policy", ), p_effects),
            "lag": (("policy", "lag_num"), p_lags),
            "interval": (("time", ), p_start_interval),
        },
    )

    # initialize results array
    estimates_ds = init_reg_ds(
        n_samples,
        LHS_vars,
        policies.policy.values,
        gamma=gamma_to_test,
        sigma=sigma_to_test,
    )

    # get policy effects
    policy_dummies, random_end_da = init_policy_dummies(
        policies,
        n_samples,
        t,
        seed=0,
        random_end=random_end,
        ordered_policies=ordered_policies,
    )
    policies = xr.merge((policies, policy_dummies, random_end_da))
    policy_effect_timeseries = (policies.policy_timeseries *
                                policies.effect).sum("policy")
    n_samp_valid = len(policies.sample)

    # adjust rate params to correct timestep
    estimates_ds = adjust_timescales_from_daily(estimates_ds, t[1] - t[0])
    beta_noise_sd = beta_noise_sd / np.sqrt(tsteps_per_day)
    gamma_noise_sd = gamma_noise_sd / np.sqrt(tsteps_per_day)
    sigma_noise_sd = sigma_noise_sd / np.sqrt(tsteps_per_day)

    # get stochastic params
    estimates_ds = get_stochastic_discrete_params(
        estimates_ds,
        no_policy_growth_rate,
        policy_effect_timeseries,
        t,
        beta_noise_on,
        beta_noise_sd,
        kind=kind,
        gamma_noise_on=gamma_noise_on,
        gamma_noise_sd=gamma_noise_sd,
        sigma_noise_on=sigma_noise_on,
        sigma_noise_sd=sigma_noise_sd,
    )

    # run simulation
    estimates_ds = sim_engine(*ics, estimates_ds)

    # add on other potentially observable quantities
    estimates_ds["IR"] = estimates_ds["R"] + estimates_ds["I"]
    if kind == "SEIR":
        estimates_ds["EI"] = estimates_ds["E"] + estimates_ds["I"]
        estimates_ds["EIR"] = estimates_ds["EI"] + estimates_ds["R"]

    # get minimum S for each simulation
    # at end and when the last policy turns on
    estimates_ds["S_min"] = estimates_ds.S.isel(t=-1)
    p3_on = (policies.policy_timeseries > 0).argmax(dim="t").max(dim="policy")
    estimates_ds["S_min_p3"] = estimates_ds.S.isel(t=p3_on)

    # blend in policy dataset
    estimates_ds = estimates_ds.merge(policies)

    # convert to daily observations
    daily_ds = adjust_timescales_to_daily(estimates_ds)

    # prep regression LHS vars (logdiff)
    daily_ds["logdiff"] = (np.log(daily_ds[daily_ds.LHS.values]).diff(
        dim="t", n=1, label="lower").pad(t=(0, 1)).to_array(dim="LHS"))
    if "sigma" not in daily_ds.logdiff.dims:
        daily_ds["logdiff"] = daily_ds.logdiff.expand_dims("sigma")

    # add noise
    daily_ds = add_obs_noise(
        daily_ds,
        measurement_noise_on=measurement_noise_on,
        measurement_noise_sd=measurement_noise_sd,
    )

    ## run regressions
    estimates = np.empty(
        (
            len(daily_ds.gamma),
            len(daily_ds.sigma),
            len(daily_ds.sample),
            len(daily_ds.LHS),
            len(daily_ds.policy) * len(reg_lag_days) + 1,
        ),
        dtype=np.float32,
    )
    mses = np.empty(
        (
            len(daily_ds.gamma),
            len(daily_ds.sigma),
            len(daily_ds.sample),
            len(daily_ds.LHS),
        ),
        dtype=np.float32,
    )
    estimates.fill(np.nan)
    mses.fill(np.nan)

    # add on lags
    RHS_old = (daily_ds.policy_timeseries > 0).astype(int)
    RHS_ds = xr.ones_like(RHS_old.isel(policy=0))
    RHS_ds["policy"] = "Intercept"
    for l in reg_lag_days:
        lag_vars = RHS_old.shift(t=l, fill_value=0)
        lag_vars["policy"] = [f"{x}_lag{l}" for x in RHS_old.policy.values]
        RHS_ds = xr.concat((RHS_ds, lag_vars), dim="policy")

    # Apply min cum_cases threshold used in regressions
    valid_reg = daily_ds.IR >= min_cases / pop
    if "sigma" not in valid_reg.dims:
        valid_reg = valid_reg.expand_dims("sigma")
        valid_reg["sigma"] = [np.nan]

    # only run regression on planned start day if we have at least one "no-policy" day after that
    # otherwise, start regression 2 days before first policy
    any_pol = (RHS_old > 0).max(dim="policy")
    first_pol = any_pol.argmax(dim="t")
    no_pol_on_regday0 = first_pol > valid_reg.argmax(dim="t")

    backup = any_pol.shift({"t": -2}).astype(bool)
    backup = backup | backup.isnull()

    # find random last day to end regression, starting with 1 day after last policy
    # is implemented
    if random_end:
        last_pol = (daily_ds.policy_timeseries.sum(dim="policy") == 3).argmax(
            dim="t")
        last_reg_day = ((
            (daily_ds.dims["t"] -
             (last_pol + 1)) * daily_ds.random_end).round().astype(int) +
                        last_pol + 1)
    else:
        last_reg_day = daily_ds.dims["t"]
    daily_ds["random_end"] = last_reg_day

    # loop through regressions
    for cx, case_var in enumerate(daily_ds.LHS.values):
        case_ds = daily_ds.logdiff_stoch.sel(LHS=case_var)
        for gx, g in enumerate(daily_ds.gamma.values):
            g_ds = case_ds.sel(gamma=g)
            for sx, s in enumerate(daily_ds.sigma.values):
                s_ds = g_ds.sel(sigma=s)
                for samp in daily_ds.sample.values:
                    if no_pol_on_regday0.isel(sample=samp, gamma=gx, sigma=sx):
                        this_valid = valid_reg.isel(sample=samp,
                                                    gamma=gx,
                                                    sigma=sx)
                    else:
                        this_valid = backup.isel(sample=samp)
                    if random_end:
                        this_valid = (this_valid) & (RHS_ds.t <=
                                                     last_reg_day[samp])
                    LHS = s_ds.isel(sample=samp)[this_valid].values
                    RHS = add_constant(
                        RHS_ds.isel(sample=samp)[{
                            "t": this_valid
                        }].values)
                    res = OLS(LHS, RHS, missing="drop").fit()

                    estimates[gx, sx, samp, cx] = res.params
                    mses[gx, sx, samp, cx] = res.mse_resid
    coords = OrderedDict(
        gamma=daily_ds.gamma,
        sigma=daily_ds.sigma,
        sample=daily_ds.sample,
        LHS=daily_ds.LHS,
    )
    rmse_ds = xr.DataArray(np.sqrt(mses), coords=coords, dims=coords.keys())

    coords["policy"] = RHS_ds.policy
    e = xr.DataArray(estimates, coords=coords,
                     dims=coords.keys()).to_dataset("policy")

    coeffs = []
    for p in daily_ds.policy.values:
        keys = [i for i in e.variables.keys() if f"{p}_" in i]
        coeffs.append(e[keys].rename(
            {k: int(k.split("_")[-1][3:])
             for k in keys}).to_array(dim="reg_lag"))
    coef_ds = xr.concat(coeffs, dim="policy")
    coef_ds.name = "coefficient"
    daily_ds = daily_ds.drop("coefficient").merge(coef_ds)
    daily_ds["Intercept"] = e["Intercept"]
    daily_ds["rmse"] = rmse_ds

    # add model params
    daily_ds.attrs = attrs

    if save_dir is not None:
        save_dir.mkdir(exist_ok=True, parents=True)
        fname = f"pop_{int(pop)}_lag_{'-'.join([str(s) for s in reg_lag_days])}.nc"
        daily_ds.to_netcdf(save_dir / fname)

    return daily_ds
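# A hedged usage sketch assembled from the docstring above; the argument values
# are illustrative placeholders only, and the call still requires the module's
# helpers (run_SIR, init_reg_ds, init_policy_dummies, ...) to be importable.

daily_ds = simulate_and_regress(
    pop=1_000_000,
    no_policy_growth_rate=0.3,
    p_effects=[-0.1, -0.15, -0.2],
    p_lags=[[0.5], [0.5], [0.5]],
    p_start_interval=[10, 40],
    n_days=60,
    tsteps_per_day=6,
    n_samples=20,
    LHS_vars=["I", "IR"],
    reg_lag_days=[0],
    gamma_to_test=[1 / 7],
    min_cases=10,
    kind="SIR",
    save_dir=None,
)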
Example #8
from python.aux.floodmodels import LocalModel, FlowModel

# In[45]:

# ### Spatial feature selection
# In contrast to the transport model, we have no background info on which gridpoints influence the predictand the most, so we use a dimensionality reduction approach and/or let the model decide which gridpoints are most relevant.
#
# The only hard constraints for the LocalModel are the influence radius of 1.5 degrees latitude/longitude (about 170 km) and that the gridpoints have to lie within the catchment basin of the point (see the sketch further below).

# In[46]:

from python.aux.utils_flowmodel import get_mask_of_basin

# In[47]:

map = xr.ones_like(glofas['dis'].isel(time=0).drop('time'))
mask_catchment = get_mask_of_basin(map, 'Danube')

if debug:
    plt.imshow(mask_catchment.astype(int))
    plt.title('Catchment basin of the Danube river')
    plt.show()
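
# A hedged sketch (not from the original notebook) of how the 1.5-degree
# influence radius noted above could be combined with the basin mask; the
# coordinate names `latitude`/`longitude` and the example point are assumptions.

def select_local_gridpoints(mask_basin, point_lat, point_lon, radius=1.5):
    # keep gridpoints within `radius` degrees of the point and inside the basin
    box = ((abs(mask_basin.latitude - point_lat) <= radius)
           & (abs(mask_basin.longitude - point_lon) <= radius))
    return mask_basin.astype(bool) & box

# e.g. local_mask = select_local_gridpoints(mask_catchment, 48.4, 15.6)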

# In[48]:


def select_riverpoints(dis):
    return (dis > 10)  #.drop('time')


# In[49]:
Example #9
def _wrap_butterworth(data,
                      coord,
                      freq,
                      kind,
                      cycles_per="s",
                      order=2,
                      debug=False,
                      gappy=None,
                      **kwargs):
    """
    Inputs
    ------

    data : xr.DataArray
    coord : str
        Coordinate along which to filter
    freq : float or sequence of float
        Cutoff frequency (or band edges) for filtering, in cycles per
        ``cycles_per``
    kind : str
        Filter type, passed to ``scipy.signal.butter`` as ``btype``
        (e.g. "low", "high", "band")
    cycles_per : optional
        Units for frequency
    order : optional
        Butterworth filter order
    kwargs : dict, optional
        passed down to gappy_filter

    Outputs
    -------

    filtered : xr.DataArray
    """

    # if len(data.dims) > 1 and coord is None:
    #     raise ValueError('Specify coordinate along which to filter')
    # else:
    #     coord = data.coords[0]

    if _is_datetime_like(data[coord]):
        dx = _process_time(data[coord], cycles_per)
    else:
        dx = np.diff(data[coord][0:2].values)

    b, a = signal.butter(order, freq * dx / (1 / 2), btype=kind)

    data = data.copy().transpose(..., coord)

    if debug:
        import dcpy.ts
        import matplotlib.pyplot as plt

        f, ax = plt.subplots(2, 1, constrained_layout=True)
        data.plot(x=coord, ax=ax[0])
        dcpy.ts.PlotSpectrum(data, cycles_per=cycles_per, ax=ax[1])

    if data.chunks:
        chunks = dict(zip(data.dims, data.chunks))
        if len(chunks[coord]) > 1:
            use_overlap = True
        else:
            use_overlap = False
    else:
        use_overlap = False

    if gappy is not None:
        warnings.warn(
            "'gappy' kwarg is now deprecated and completely ignored.",
            UserWarning,
        )

    num_discard = kwargs.pop("num_discard", "auto")
    kwargs.setdefault("method", "gust")
    if kwargs["method"] == "gust" and "irlen" not in kwargs:
        kwargs["irlen"] = estimate_impulse_response_len(b, a)
    kwargs.update(b=b, a=a, axis=-1)

    valid = data.notnull()
    if np.issubdtype(data.dtype, np.dtype(complex)):
        filled = data.real.ffill(coord).bfill(
            coord) + 1j * data.imag.ffill(coord).bfill(coord)
    else:
        filled = data.ffill(coord).bfill(coord)

    # I need distance from nearest NaN
    index = np.arange(data.sizes[coord])
    arange = xr.ones_like(data.reset_coords(drop=True), dtype=int) * index
    invalid_arange = (
        arange.where(~valid).interpolate_na(
            coord, "nearest",
            fill_value="extrapolate").fillna(-1)  # when all points are valid
    )
    distance = np.abs(arange - invalid_arange).where(valid)

    if not use_overlap:
        filtered = xr.apply_ufunc(
            filter_,
            filled,
            input_core_dims=[[coord]],
            output_core_dims=[[coord]],
            dask="parallelized",
            output_dtypes=[data.dtype],
            kwargs=kwargs,
        )

    else:
        import dask

        if not isinstance(data, xr.DataArray):
            raise ValueError("map_overlap implemented only for DataArrays.")
        irlen = estimate_impulse_response_len(b, a)
        axis = data.get_axis_num(coord)
        overlap = np.round(2 * irlen).astype(int)
        min_chunksize = 3 * overlap
        actual_chunksize = data.data.chunksize[axis]

        if actual_chunksize < min_chunksize:
            raise ValueError(
                f"Chunksize along {coord} = {actual_chunksize} < {min_chunksize}. Please rechunk"
            )

        depth = dict(zip(range(data.ndim), [0] * data.ndim))
        depth[data.ndim - 1] = overlap
        filtered = data.copy(data=dask.array.map_overlap(
            filled.data,
            filter_,
            depth=depth,
            boundary="none",
            meta=filled.data._meta,
            **kwargs,
        ))

    # take out the beginning and end if necessary
    mask = xr.DataArray(
        np.ones((filtered.sizes[coord], ), dtype=bool),
        dims=[coord],
        name=coord,
        coords={coord: filtered[coord]},
    )
    num_discard = _get_num_discard(kwargs, num_discard)
    if num_discard > 0:
        mask[:num_discard] = False
        mask[-num_discard:] = False

    filtered = filtered.where((distance >= num_discard) & mask)

    if debug:
        filtered.plot(x=coord, ax=ax[0])
        ylim = ax[1].get_ylim()
        dcpy.ts.PlotSpectrum(filtered, cycles_per=cycles_per, ax=ax[1])
        ax[1].set_ylim(ylim)
        for ff in np.array(freq, ndmin=1):
            plt.axvline(ff)

    return filtered
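# A brief usage sketch of the wrapper above on a plain numeric coordinate
# (it still relies on the module's `filter_` and related helpers); here a
# low-pass keeping variability slower than one cycle per 50 samples.

import numpy as np
import xarray as xr

x = np.arange(1000.0)
noisy = xr.DataArray(
    np.sin(2 * np.pi * x / 200) + 0.3 * np.random.randn(x.size),
    dims=["x"], coords={"x": x},
)
smooth = _wrap_butterworth(noisy, coord="x", freq=1 / 50, kind="low")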
Example #10
def init_z_level_vertical_coord(config, ds):
    """
    Create a z-level vertical coordinate based on the config options in the
    ``vertical_grid`` section and the ``bottomDepth`` and ``ssh`` variables of
    the mesh data set.

    The following new variables will be added to the data set:

      * ``minLevelCell`` - the index of the top valid layer

      * ``maxLevelCell`` - the index of the bottom valid layer

      * ``cellMask`` - a mask of where cells are valid

      * ``layerThickness`` - the thickness of each layer

      * ``restingThickness`` - the thickness of each layer stretched as if
        ``ssh = 0``

      * ``zMid`` - the elevation of the midpoint of each layer

    So far, all supported coordinates make use of a 1D reference vertical grid.
    The following variables associated with that field are also added to the
    mesh:

      * ``refTopDepth`` - the positive-down depth of the top of each ref. level

      * ``refZMid`` - the positive-down depth of the middle of each ref. level

      * ``refBottomDepth`` - the positive-down depth of the bottom of each ref.
        level

      * ``refInterfaces`` - the positive-down depth of the interfaces between
        ref. levels (with ``nVertLevels`` + 1 elements).

      * ``vertCoordMovementWeights`` - the weights (all ones) for coordinate
        movement

    There is considerable redundancy between these variables but each is
    sometimes convenient.

    Parameters
    ----------
    config : configparser.ConfigParser
        Configuration options with parameters used to construct the vertical
        grid

    ds : xarray.Dataset
        A data set containing ``bottomDepth`` and ``ssh`` variables used to
        construct the vertical coordinate
    """
    add_1d_grid(config, ds)

    ds['vertCoordMovementWeights'] = xarray.ones_like(ds.refBottomDepth)

    ds['minLevelCell'], ds['maxLevelCell'], ds['cellMask'] = \
        compute_min_max_level_cell(ds.refTopDepth, ds.refBottomDepth, ds.ssh,
                                   ds.bottomDepth)

    ds['bottomDepth'], ds['maxLevelCell'] = alter_bottom_depth(
        config, ds.bottomDepth, ds.refBottomDepth, ds.maxLevelCell)

    ds['ssh'], ds['minLevelCell'] = alter_ssh(config, ds.ssh,
                                              ds.refBottomDepth,
                                              ds.minLevelCell)

    ds['layerThickness'] = compute_z_level_layer_thickness(
        ds.refTopDepth, ds.refBottomDepth, ds.ssh, ds.bottomDepth,
        ds.minLevelCell, ds.maxLevelCell)

    ds['restingThickness'] = compute_z_level_resting_thickness(
        ds.layerThickness, ds.ssh, ds.bottomDepth, ds.minLevelCell,
        ds.maxLevelCell)
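# A loosely sketched call; the ``vertical_grid`` option names below are
# assumptions (placeholders), so consult the package's config documentation
# for the real ones.

from configparser import ConfigParser

import numpy as np
import xarray

config = ConfigParser()
config["vertical_grid"] = {   # assumed option names, for illustration only
    "grid_type": "uniform",
    "vert_levels": "10",
    "bottom_depth": "1000.0",
}
ds = xarray.Dataset({
    "bottomDepth": ("nCells", 1000.0 * np.ones(4)),
    "ssh": ("nCells", np.zeros(4)),
})
init_z_level_vertical_coord(config, ds)
print(ds.layerThickness.sizes)  # expect an nCells x nVertLevels field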
Example #11
def broadcast_lonlat(ds):
    ds.coords['lon'] = ds['lon'] * xr.ones_like(ds['lat'])
    ds.coords['lat'] = xr.ones_like(ds['lon']) * ds['lat']
    return ds
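# Minimal usage sketch: 1-D nominal lon/lat become 2-D arrays spanning both
# horizontal dimensions.

import numpy as np
import xarray as xr

ds = xr.Dataset(coords={"lon": ("x", np.arange(0.0, 360.0, 90.0)),
                        "lat": ("y", np.array([-30.0, 0.0, 30.0]))})
ds = broadcast_lonlat(ds)
print(ds.lon.dims, ds.lat.dims)  # ('x', 'y') ('x', 'y')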
Example #12
def test_replace_x_y_nominal_lat_lon(dask, nans):
    x = np.linspace(0, 720, 10)
    y = np.linspace(-200, 140, 5)
    lon = xr.DataArray(np.linspace(0, 360, len(x)), coords=[("x", x)])
    lat = xr.DataArray(np.linspace(-90, 90, len(y)), coords=[("y", y)])
    llon = lon * xr.ones_like(lat)
    llat = xr.ones_like(lon) * lat

    data = np.random.rand(len(x), len(y))
    ds = xr.DataArray(data, coords=[("x", x),
                                    ("y", y)]).to_dataset(name="data")
    ds.coords["lon"] = llon
    ds.coords["lat"] = llat

    if nans:
        lon = ds["lon"].load().data
        lon[0, :] = np.nan
        lon[-1, :] = np.nan
        lon[:, 0] = np.nan
        lon[:, -1] = np.nan
        lon[15:23, 23:26] = np.nan
        ds["lon"].data = lon

        # for lats put only some nans in the middle.
        # I currently have no way to interpolate lats at the edge.
        lat = ds["lat"].load().data
        lat[15:23, 23:26] = np.nan
        ds["lat"].data = lat

    if dask:
        ds = ds.chunk({"x": -1, "y": -1})
        ds.coords["lon"] = ds.coords["lon"].chunk({"x": -1, "y": -1})
        ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1})

    replaced_ds = replace_x_y_nominal_lat_lon(ds)

    assert all(~np.isnan(replaced_ds.x))
    assert all(~np.isnan(replaced_ds.y))

    assert all(replaced_ds.x.diff("x") > 0)
    assert all(replaced_ds.y.diff("y") > 0)
    assert len(replaced_ds.lon.shape) == 2
    assert len(replaced_ds.lat.shape) == 2
    assert set(replaced_ds.lon.dims) == set(["x", "y"])
    assert set(replaced_ds.lat.dims) == set(["x", "y"])
    assert all(~np.isnan(replaced_ds.x))
    assert all(~np.isnan(replaced_ds.y))

    # test a dataset that would result in duplicates with current method
    x = np.linspace(0, 720, 4)
    y = np.linspace(-200, 140, 3)
    llon = xr.DataArray(
        np.array([[0, 50, 100, 150], [0, 50, 100, 150], [0, 50, 100, 150]]),
        coords=[("y", y), ("x", x)],
    )
    llat = xr.DataArray(
        np.array([[0, 0, 10, 0], [10, 0, 0, 0], [20, 20, 20, 20]]),
        coords=[("y", y), ("x", x)],
    )
    data = np.random.rand(len(x), len(y))
    ds = xr.DataArray(data, coords=[("x", x),
                                    ("y", y)]).to_dataset(name="data")
    ds.coords["lon"] = llon
    ds.coords["lat"] = llat

    if dask:
        ds = ds.chunk({"x": -1, "y": -1})
        ds.coords["lon"] = ds.coords["lon"].chunk({"x": -1, "y": -1})
        ds.coords["lat"] = ds.coords["lat"].chunk({"x": -1, "y": -1})

    replaced_ds = replace_x_y_nominal_lat_lon(ds)
    assert all(~np.isnan(replaced_ds.x))
    assert all(~np.isnan(replaced_ds.y))
    assert len(replaced_ds.y) == len(np.unique(replaced_ds.y))
    assert len(replaced_ds.x) == len(np.unique(replaced_ds.x))
    # make sure values are sorted in ascending order
    assert all(replaced_ds.x.diff("x") > 0)
    assert all(replaced_ds.y.diff("y") > 0)
    assert len(replaced_ds.lon.shape) == 2
    assert len(replaced_ds.lat.shape) == 2
    assert set(replaced_ds.lon.dims) == set(["x", "y"])
    assert set(replaced_ds.lat.dims) == set(["x", "y"])
Example #13
    def _preprocess_single(
        self,
        shp_filepath: Path,
        reference_nc_filepath: Path,
        var_name: str,
        lookup_colname: str,
        save: bool = True,
    ) -> Optional[xr.Dataset]:
        """ Preprocess .shp admin boundary files into an `.nc`
        file with the same shape as reference_nc_filepath.

        Will create categorical .nc file which will specify
        which admin region each pixel is in.

        Arguments
        ----------
        shp_filepath: Path
            The path to the shapefile

        reference_nc_filepath: Path
            The path to the netcdf file with the shape
            (must have been run through Preprocessors prior to using)

        var_name: str
            the name of the Variable in the xr.Dataset and the name
            of the output filename - {var_name}_{self.country}.nc

        lookup_colname: str
            the column name to lookup in the shapefile
            (read in as geopandas.GeoDataFrame)
        """
        filename = self.get_filename(var_name)
        if (self.out_dir / filename).exists():
            print("** Data already preprocessed! **\nIf you need to "
                  "process again then move or delete existing file"
                  f" at: {(self.out_dir / filename).as_posix()}")
            return None

        assert "interim" in reference_nc_filepath.parts, (
            "Expected "
            "the target data to have been preprocessed by the pipeline")

        # MUST have a target dataset to create the same shape
        target_ds = xr.ones_like(xr.open_dataset(reference_nc_filepath))
        data_var = [d for d in target_ds.data_vars][0]
        da = target_ds[data_var]

        # turn the shapefile into a categorical variable (like landcover)
        shp_to_nc = SHPtoXarray()
        ds = shp_to_nc.shapefile_to_xarray(
            da=da,
            shp_path=shp_filepath,
            var_name=var_name,
            lookup_colname=lookup_colname,
        )

        # save the data
        if save:
            print(f"Saving to {self.out_dir}")

            if self.analysis is True:
                assert self.out_dir.parts[-2] == "analysis", (
                    "self.analysis should"
                    "be True and the output directory should be analysis")

            ds.to_netcdf(self.out_dir / filename)

            print(f"** {(self.out_dir / filename).as_posix()} saved! **")

            return None
        else:
            return ds
Example #14
def init(ds):
    return xr.ones_like(ds.isel(time=0))
Example #15
def broadcast_lonlat(ds):
    ds.coords["lon"] = ds["lon"] * xr.ones_like(ds["lat"])
    ds.coords["lat"] = xr.ones_like(ds["lon"]) * ds["lat"]
    return ds
Example #16
def compute_diagnostics(
    state: State, tendency: State, label: str, hydrostatic: bool
) -> Diagnostics:
    delp = state[DELP]
    temperature_tendency_name = "dQ1"
    humidity_tendency_name = "dQ2"

    temperature_tendency = tendency.get(temperature_tendency_name, xr.zeros_like(delp))
    humidity_tendency = tendency.get(humidity_tendency_name, xr.zeros_like(delp))

    # compute column-integrated diagnostics
    if hydrostatic:
        net_heating = vcm.column_integrated_heating_from_isobaric_transition(
            temperature_tendency, delp, "z"
        )
    else:
        net_heating = vcm.column_integrated_heating_from_isochoric_transition(
            temperature_tendency, delp, "z"
        )
    diags: Diagnostics = {
        f"net_moistening_due_to_{label}": vcm.mass_integrate(
            humidity_tendency, delp, dim="z"
        ).assign_attrs(
            units="kg/m^2/s",
            description=f"column integrated moisture tendency due to {label}",
        ),
        f"column_heating_due_to_{label}": net_heating.assign_attrs(
            units="W/m^2"
        ).assign_attrs(description=f"column integrated heating due to {label}"),
    }
    delp_tendency = STATE_NAME_TO_TENDENCY[DELP]
    if delp_tendency in tendency:
        net_mass_tendency = vcm.mass_integrate(
            xr.ones_like(tendency[delp_tendency]), tendency[delp_tendency], dim="z"
        ).assign_attrs(
            units="kg/m^2/s",
            description=f"column-integrated mass tendency due to {label}",
        )
        diags[f"net_mass_tendency_due_to_{label}"] = net_mass_tendency

    # add 3D tendencies to diagnostics
    if label == "nudging":
        diags_3d: Mapping[Hashable, xr.DataArray] = {
            f"{TENDENCY_TO_STATE_NAME[k]}_tendency_due_to_nudging": v
            for k, v in tendency.items()
        }
    elif label == "machine_learning":
        diags_3d = {
            "dQ1": temperature_tendency.assign_attrs(units="K/s").assign_attrs(
                description=f"air temperature tendency due to {label}"
            ),
            "dQ2": humidity_tendency.assign_attrs(units="kg/kg/s").assign_attrs(
                description=f"specific humidity tendency due to {label}"
            ),
        }
    diags.update(diags_3d)

    # add 3D state to diagnostics for backwards compatibility
    diags.update({TEMP: state[TEMP], SPHUM: state[SPHUM], DELP: state[DELP]})

    return diags
Example #17
def get_section_trsp(fldx, fldy, grid, left, right, nx=100, is_normal=True):
    """
    Interpolate a vector field to a section line, returning
    the normal component

    Note: DIRECTION NEEDS TO BE VERIFIED!

    Parameters
    ----------
    fldx, fldy : xarray DataArray
        Containing vector field to grab along section
    left, right : tuple or list of 2 floats
        Containing lon/lat bounding points
    nx : int, optional
        Number of interpolation points

    Returns
    -------
    q : xarray DataArray
        with interpolated vector field into section, dimension i
        and xc/yc as lon/lat along section, dim i
    """

    # Create x/y coords for line
    x = np.linspace(left[0], right[0], nx + 1)
    y = np.linspace(left[1], right[1], nx + 1)

    # interp to mid point
    # create an index variable: i
    # interpolated result will live along this coordinate
    xc = xr.DataArray(_mov_avg(x), dims='i')
    yc = xr.DataArray(_mov_avg(y), dims='i')

    # Look for a mask for valid points
    maskW = fldx.maskW if 'maskW' in fldx.coords else True * xr.ones_like(fldx)
    maskS = fldy.maskS if 'maskS' in fldy.coords else True * xr.ones_like(fldy)

    # interpolate U and V to this point
    uvel = grid.interp(fldx.where(maskW, np.NAN), 'X',
                       boundary='fill').interp(XC=xc, YC=yc)
    vvel = grid.interp(fldy.where(maskS, np.NAN), 'Y',
                       boundary='fill').interp(XC=xc, YC=yc)

    # get coordinate system tangent and normal to this line
    dxc = xr.DataArray(np.diff(x), dims='i')
    dyc = xr.DataArray(np.diff(y), dims='i')
    sin = dyc / np.sqrt(dxc**2 + dyc**2)
    cos = dxc / np.sqrt(dxc**2 + dyc**2)

    # This rotation uses a negative angle rotation:
    # https://en.wikipedia.org/wiki/Rotation_matrix#Direction
    # consider purely zonal flow: (u,v) = (1,0)
    # and a low angle rotation, say theta=15
    # i.e. xaxis from "->" to "/^", but less dramatic than I can draw here
    # then in the new coordinate system, this flow will
    # be mostly positive in the new zonal direction (close to one)
    # but the new v component will be slightly negative
    if is_normal:
        q = -sin * uvel + cos * vvel
    else:
        q = cos * uvel + sin * vvel
    myname = fldx.name[:-1]  #drop the W,S
    q.name = myname

    # add xc,yc
    q = q.to_dataset()
    q['xc'] = xc.copy()
    q['yc'] = yc.copy()
    q = q.set_coords('xc')
    q = q.set_coords('yc')

    return q[myname]
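# A quick numeric check of the rotation convention described in the comment
# above: purely zonal flow and a section tilted 15 degrees.

import numpy as np

theta = np.deg2rad(15.0)
sin, cos = np.sin(theta), np.cos(theta)
u, v = 1.0, 0.0                # purely zonal flow
along = cos * u + sin * v      # ~0.97, close to one
normal = -sin * u + cos * v    # ~-0.26, slightly negative
print(along, normal)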
Example #18
def _tseries_gen(varname, component, ensemble, entries, cluster_in):
    """
    generate a tseries for a particular ensemble member, return a Dataset object
    """
    print_timestamp(f"varname={varname}")
    varname_resolved = _varname_resolved(varname, component)
    fnames = entries.loc[entries["ensemble"] == ensemble].files.tolist()
    print(fnames)

    with open(var_specs_fname, mode="r") as fptr:
        var_specs_all = yaml.safe_load(fptr)

    if varname in var_specs_all[component]["vars"]:
        var_spec = var_specs_all[component]["vars"][varname]
    else:
        var_spec = {}

    # use var specific reduce_dims if it exists, otherwise use reduce_dims for component
    if "reduce_dims" in var_spec:
        reduce_dims = var_spec["reduce_dims"]
    else:
        reduce_dims = var_specs_all[component]["reduce_dims"]

    # get rank of varname from first file, used to set time chunksize
    # approximate number of time levels, assuming all files have same number
    # save time encoding from first file, to restore it in the multi-file case
    #     https://github.com/pydata/xarray/issues/2921
    with xr.open_dataset(fnames[0]) as ds0:
        vardims = ds0[varname_resolved].dims
        rank = len(vardims)
        vertlen = ds0.dims[vardims[1]] if rank > 3 else 0
        time_chunksize = 10 * 12 if rank < 4 else 6
        ds0.chunk(chunks={time_name: time_chunksize})
        time_encoding = ds0[time_name].encoding
        var_encoding = ds0[varname_resolved].encoding
        ds0_attrs = ds0.attrs
        ds0_encoding = ds0.encoding
        drop_var_names_loc = drop_var_names(component, ds0, varname_resolved)

    # instantiate cluster, if not provided via argument
    # ignore dashboard warnings when instantiating
    if cluster_in is None:
        if "ncar_jobqueue" in sys.modules:
            with warnings.catch_warnings():
                warnings.filterwarnings(action="ignore", module=".*dashboard")
                cluster = ncar_jobqueue.NCARCluster()
        else:
            raise ValueError(
                "cluster_in not provided and ncar_jobqueue did not load successfully"
            )
    else:
        cluster = cluster_in

    workers = 12
    if vertlen >= 20:
        workers *= 2
    if vertlen >= 60:
        workers *= 2
    workers = 2 * round(workers / 2)  # round to nearest multiple of 2
    print_timestamp(f"calling cluster.scale({workers})")
    cluster.scale(workers)

    print_timestamp(f"dashboard_link={cluster.dashboard_link}")

    # create dask distributed client, connecting to workers
    with dask.distributed.Client(cluster) as client:
        print_timestamp("client instantiated")

        # tool to help track down file inconsistencies that trigger errors in open_mfdataset
        # test_open_mfdataset(fnames, time_chunksize, varname)

        # data_vars = "minimal", to avoid introducing time dimension to time-invariant fields when there are multiple files
        # only chunk in time, because if you chunk over spatial dims, then sum results depend on chunksize
        #     https://github.com/pydata/xarray/issues/2902
        with xr.open_mfdataset(
            fnames,
            data_vars="minimal",
            coords="minimal",
            compat="override",
            combine="by_coords",
            chunks={time_name: time_chunksize},
            drop_variables=drop_var_names_loc,
        ) as ds_in:
            print_timestamp("open_mfdataset returned")

            # restore encoding for time from first file
            ds_in[time_name].encoding = time_encoding

            da_in_full = ds_in[varname_resolved]
            da_in_full.encoding = var_encoding

            var_units = clean_units(da_in_full.attrs["units"])
            if "unit_conv" in var_spec:
                var_units = f"({var_spec['unit_conv']})({var_units})"

            # construct averaging/integrating weight
            weight = get_weight(ds_in, component, reduce_dims)
            weight_attrs = weight.attrs
            weight = get_rmask(ds_in, component) * weight
            weight.attrs = weight_attrs
            print_timestamp("weight constructed")

            # compute regional sum of weights
            da_in_t0 = da_in_full.isel({time_name: 0}).drop(time_name)
            ones_masked_t0 = xr.ones_like(da_in_t0).where(da_in_t0.notnull())
            weight_sum = (ones_masked_t0 * weight).sum(dim=reduce_dims)
            weight_sum.name = f"weight_sum_{varname}"
            weight_sum.attrs = weight.attrs
            weight_sum.attrs[
                "long_name"
            ] = f"sum of weights used in tseries generation for {varname}"

            tlen = da_in_full.sizes[time_name]
            print_timestamp(f"tlen={tlen}")

            # use var specific tseries_op if it exists, otherwise use tseries_op for component
            if "tseries_op" in var_spec:
                tseries_op = var_spec["tseries_op"]
            else:
                tseries_op = var_specs_all[component]["tseries_op"]

            ds_out_list = []

            time_step_nominal = min(2 * workers * time_chunksize, tlen)
            time_step = math.ceil(tlen / (tlen // time_step_nominal))
            print_timestamp(f"time_step={time_step}")
            for time_ind0 in range(0, tlen, time_step):
                print_timestamp(f"time_ind={time_ind0}, {time_ind0 + time_step}")
                da_in = da_in_full.isel(
                    {time_name: slice(time_ind0, time_ind0 + time_step)}
                )

                if tseries_op == "integrate":
                    da_out = (da_in * weight).sum(dim=reduce_dims)
                    da_out.name = varname
                    da_out.attrs["long_name"] = "Integrated " + da_in.attrs["long_name"]
                    da_out.attrs["units"] = cf_units.Unit(
                        f"({weight.attrs['units']})({var_units})"
                    ).format()
                elif tseries_op == "average":
                    da_out = (da_in * weight).sum(dim=reduce_dims)
                    ones_masked = xr.ones_like(da_in).where(da_in.notnull())
                    denom = (ones_masked * weight).sum(dim=reduce_dims)
                    da_out /= denom
                    da_out.name = varname
                    da_out.attrs["long_name"] = "Averaged " + da_in.attrs["long_name"]
                    da_out.attrs["units"] = cf_units.Unit(var_units).format()
                else:
                    msg = f"tseries_op={tseries_op} not implemented"
                    raise NotImplementedError(msg)

                print_timestamp("da_out computation setup")

                # propagate some settings from da_in to da_out
                da_out.encoding["dtype"] = da_in.encoding["dtype"]
                copy_fill_settings(da_in, da_out)

                ds_out = da_out.to_dataset()

                print_timestamp("ds_out generated")

                # copy particular variables from ds_in
                copy_var_list = [time_name]
                if "bounds" in ds_in[time_name].attrs:
                    copy_var_list.append(ds_in[time_name].attrs["bounds"])
                copy_var_list.extend(copy_var_names(component))
                ds_out = xr.merge(
                    [
                        ds_out,
                        ds_in[copy_var_list].isel(
                            {time_name: slice(time_ind0, time_ind0 + time_step)}
                        ),
                    ]
                )

                print_timestamp("copy_var_names added")

                # force computation of ds_out, while resources of client are still available
                print_timestamp("calling ds_out.load")
                ds_out_list.append(ds_out.load())
                print_timestamp("returned from ds_out.load")

            print_timestamp("concatenating ds_out_list datasets")
            ds_out = xr.concat(
                ds_out_list,
                dim=time_name,
                data_vars=[varname],
                coords="minimal",
                compat="override",
            )

            # set ds_out.time to mid-interval values
            ds_out = time_set_mid(ds_out, time_name)

            print_timestamp("time_set_mid returned")

            # copy file attributes
            ds_out.attrs = ds0_attrs

            datestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
            msg = f"{datestamp}: created by {__file__}"
            if "history" in ds_out.attrs:
                ds_out.attrs["history"] = "\n".join([msg, ds_out.attrs["history"]])
            else:
                ds_out.attrs["history"] = msg

            ds_out.attrs["input_file_list"] = " ".join(fnames)

            for key in ["unlimited_dims"]:
                if key in ds0_encoding:
                    ds_out.encoding[key] = ds0_encoding[key]

            # restore encoding for time from first file
            ds_out[time_name].encoding = time_encoding

            # change output units, if specified in var_spec
            units_key = (
                "integral_display_units"
                if tseries_op == "integrate"
                else "display_units"
            )
            if units_key in var_spec:
                ds_out[varname] = conv_units(ds_out[varname], var_spec[units_key])
                print_timestamp("units converted")

            # add regional sum of weights
            ds_out[weight_sum.name] = weight_sum

    print_timestamp("ds_in and client closed")

    # if cluster was instantiated here, close it
    if cluster_in is None:
        cluster.close()

    return ds_out
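
# A minimal sketch of the two reductions used above ("integrate" and "average")
# on a toy field with made-up weights; dimension names here are illustrative only.
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(6.0).reshape(2, 3), dims=("y", "x"))
weight = xr.DataArray([1.0, 2.0], dims="y") * xr.ones_like(da)

# "integrate": weighted sum over the reduced dimensions
integral = (da * weight).sum(dim=("y", "x"))

# "average": weighted sum divided by the sum of weights over valid points
ones_masked = xr.ones_like(da).where(da.notnull())
average = (da * weight).sum(dim=("y", "x")) / (ones_masked * weight).sum(dim=("y", "x"))
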
Example #19
0
    def run(self):
        """
        Run this step of the test case
        """
        # create the mesh and graph.info
        super().run()

        config = self.config

        section = config['horizontal_advection']
        temperature = section.getfloat('temperature')
        salinity = section.getfloat('salinity')
        x_center = 1e3*section.getfloat('x_center')
        y_center = 1e3*section.getfloat('y_center')
        advect_x = section.getboolean('advect_x')
        advect_y = section.getboolean('advect_y')
        gaussian_width = 1e3*section.getfloat('gaussian_width')

        section = config['planar_convergence']
        duration = 3600.*section.getfloat('duration')
        dt_1km = section.getint('dt_1km')
        resolution = float(self.resolution)
        dt = dt_1km * resolution
        dc = resolution*1e3

        ds = xarray.open_dataset('mesh.nc')
        xCell = ds.xCell
        yCell = ds.yCell

        bottom_depth = config.getfloat('vertical_grid', 'bottom_depth')

        ds['bottomDepth'] = bottom_depth * xarray.ones_like(xCell)
        ds['ssh'] = xarray.zeros_like(xCell)

        init_vertical_coord(config, ds)

        if advect_x:
            x_vel = ds.attrs['x_period']/duration
            x_cfl = x_vel*dt/dc
            print(f'x_cfl: {x_cfl}')
        else:
            x_vel = 0.

        if advect_y:
            y_vel = ds.attrs['y_period']/duration
            y_cfl = y_vel*dt/dc
            print(f'y_cfl: {y_cfl}')
        else:
            y_vel = 0.

        temperature = temperature*xarray.ones_like(xCell)
        temperature, _ = xarray.broadcast(temperature, ds.refBottomDepth)
        temperature = temperature.transpose('nCells', 'nVertLevels')
        temperature = temperature.expand_dims(dim='Time', axis=0)

        salinity = salinity*xarray.ones_like(temperature)

        angleEdge = ds.angleEdge
        normalVelocity = (numpy.cos(angleEdge) * x_vel +
                          numpy.sin(angleEdge) * y_vel)
        normalVelocity, _ = xarray.broadcast(normalVelocity, ds.refBottomDepth)
        normalVelocity = normalVelocity.transpose('nEdges', 'nVertLevels')
        normalVelocity = normalVelocity.expand_dims(dim='Time', axis=0)

        dist_sq = (xCell - x_center)**2 + (yCell - y_center)**2

        tracer1 = numpy.exp(-0.5*dist_sq/gaussian_width**2)
        tracer1, _ = xarray.broadcast(tracer1, ds.refBottomDepth)
        tracer1 = tracer1.transpose('nCells', 'nVertLevels')
        tracer1 = tracer1.expand_dims(dim='Time', axis=0)

        ds['temperature'] = temperature
        ds['salinity'] = salinity * xarray.ones_like(temperature)
        ds['normalVelocity'] = normalVelocity
        ds['fCell'] = xarray.zeros_like(xCell)
        ds['fEdge'] = xarray.zeros_like(ds.xEdge)
        ds['fVertex'] = xarray.zeros_like(ds.xVertex)
        ds['tracer1'] = tracer1

        write_netcdf(ds, 'initial_state.nc')
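
# A standalone sketch (independent of the MPAS mesh file read above) of the
# broadcast / transpose / expand_dims pattern used to build (Time, nCells,
# nVertLevels) fields from a per-cell value; the sizes below are made up.
import numpy
import xarray

xCell = xarray.DataArray(numpy.linspace(0.0, 9e3, 10), dims="nCells")
refBottomDepth = xarray.DataArray(numpy.linspace(10.0, 100.0, 5), dims="nVertLevels")

field = 10.0 * xarray.ones_like(xCell)               # constant value on each cell
field, _ = xarray.broadcast(field, refBottomDepth)   # add the vertical dimension
field = field.transpose("nCells", "nVertLevels")
field = field.expand_dims(dim="Time", axis=0)        # final shape: (1, 10, 5)
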
Example #20
0
import numpy as np
import xarray as xr

airds = xr.tutorial.open_dataset("air_temperature").isel(time=slice(4),
                                                         lon=slice(50))
airds.air.attrs["cell_measures"] = "area: cell_area"
airds.air.attrs["standard_name"] = "air_temperature"
airds.coords["cell_area"] = (xr.DataArray(np.cos(airds.lat * np.pi / 180)) *
                             xr.ones_like(airds.lon) * 105e3 * 110e3)
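
# Rough sanity check of the cell_area approximation above: each box is treated
# as dx * dy * cos(lat) with dx ~ 105e3 m and dy ~ 110e3 m, so at 45N a cell
# covers roughly 105e3 * 110e3 * cos(pi / 4) ~ 8.2e9 m^2.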

ds_no_attrs = airds.copy(deep=True)
for variable in ds_no_attrs.variables:
    ds_no_attrs[variable].attrs = {}

popds = xr.Dataset()
popds.coords["TLONG"] = (
    ("nlat", "nlon"),
    np.ones((20, 30)),
    {
        "units": "degrees_east"
    },
)
popds.coords["TLAT"] = (
    ("nlat", "nlon"),
    2 * np.ones((20, 30)),
    {
        "units": "degrees_north"
    },
)
popds.coords["ULONG"] = (
    ("nlat", "nlon"),
Example #21
0
def change_significance(
    fut: Union[xr.DataArray, xr.Dataset],
    ref: Union[xr.DataArray, xr.Dataset] = None,
    test: str = "ttest",
    **kwargs,
) -> Tuple[Union[xr.DataArray, xr.Dataset], Union[xr.DataArray, xr.Dataset]]:
    """Robustness statistics qualifying how the members of an ensemble agree on the existence of change and on its sign.

    Parameters
    ----------
    fut : Union[xr.DataArray, xr.Dataset]
      Future period values along 'realization' and 'time' (..., nr, nt1)
      or if `ref` is None, Delta values along `realization` (..., nr).
    ref : Union[xr.DataArray, xr.Dataset], optional
      Reference period values along 'realization' and 'time' (..., nt2, nr).
      The size of the 'time' axis does not need to match the one of `fut`.
      But their 'realization' axes must be identical.
      If `None` (default), values of `fut` are assumed to be deltas instead of
      a distribution across the future period.
      `fut` and `ref` must be of the same type (Dataset or DataArray). If they are
      Dataset, they must have the same variables (name and coords).
    test : {'ttest', 'welch-ttest', 'threshold', None}
      Name of the statistical test used to determine if there was significant change. See notes.
    kwargs
      Other arguments specific to the statistical test.

      For 'ttest' and 'welch-ttest':
        p_change : float (default : 0.05)
          p-value threshold for rejecting the hypothesis of no significant change.
      For 'threshold': (Only one of those must be given.)
        abs_thresh : float (no default)
          Threshold for the (absolute) change to be considered significant.
        rel_thresh : float (no default, in [0, 1])
          Threshold for the relative change (with respect to ref) to be significant.
          Only valid if `ref` is given.

    Returns
    -------
    change_frac
      The fraction of members that show significant change [0, 1].
      Passing `test=None` yields change_frac = 1 everywhere. Same type as `fut`.
    pos_frac
      The fraction of members showing significant change that show a positive change ]0, 1].
      Null values are returned where no members show significant change.

      The table below shows the coefficient needed to retrieve the number of members
      that have the indicated characteristics, by multiplying it by the total
      number of members (`fut.realization.size`).

      +-----------------+------------------------------+------------------------+
      |                 | Significant change           | Non-significant change |
      +-----------------+------------------------------+------------------------+
      | Any direction   | change_frac                  | 1 - change_frac        |
      +-----------------+------------------------------+------------------------+
      | Positive change | pos_frac * change_frac       | N.A.                   |
      +-----------------+------------------------------+                        |
      | Negative change | (1 - pos_frac) * change_frac |                        |
      +-----------------+------------------------------+------------------------+

    Notes
    -----
    Available statistical tests are:

      'ttest' :
        Single sample T-test. Same test as used by [tebaldi2011]_. The future
        values are compared against the reference mean (over 'time'). Change is qualified
        as 'significant' when the test's p-value is below the user-provided `p_change`
        value.
      'welch-ttest' :
        Two-sided T-test, without assuming equal population variance. Same
        significance criterion as 'ttest'.
      'threshold' :
        Change is considered significant if the delta exceeds a given
        threshold (absolute or relative).
      None :
        Significant change is not tested and, thus, members showing no change are
        included in the `pos_frac` output.

    References
    ----------
    .. [tebaldi2011] Tebaldi C., Arblaster, J.M. and Knutti, R. (2011) Mapping model agreement on future climate projections. GRL. doi:10.1029/2011GL049863


    Example
    -------
    This example computes the mean temperature in an ensemble and compares two time
    periods, qualifying significant change through a single sample T-test.

    >>> from xclim import ensembles
    >>> ens = ensembles.create_ensemble(temperature_datasets)
    >>> tgmean = xclim.atmos.tg_mean(tas=ens.tas, freq='YS')
    >>> fut = tgmean.sel(time=slice('2020', '2050'))
    >>> ref = tgmean.sel(time=slice('1990', '2020'))
    >>> chng_f, pos_f = ensembles.change_significance(fut, ref, test='ttest')

    If the deltas were already computed beforehand, the 'threshold' test can still
    be used, here with a 2 K threshold.

    >>> delta = fut.mean('time') - ref.mean('time')
    >>> chng_f, pos_f = ensembles.change_significance(delta, test='threshold', abs_thresh=2)
    """
    test_params = {
        "ttest": ["p_change"],
        "welch-ttest": ["p_change"],
        "threshold": ["abs_thresh", "rel_thresh"],
    }
    changed = None
    if ref is None:
        delta = fut
        n_valid_real = delta.notnull().sum("realization")
        if test not in ["threshold", None]:
            raise ValueError(
                "When deltas are given (ref=None), 'test' must be one of ['threshold', None]"
            )
    else:
        delta = fut.mean("time") - ref.mean("time")
        n_valid_real = fut.notnull().all("time").sum("realization")

    if test == "ttest":
        p_change = kwargs.setdefault("p_change", 0.05)

        # Test hypothesis of no significant change
        pvals = xr.apply_ufunc(
            lambda f, r: spstats.ttest_1samp(f, r, axis=-1, nan_policy="omit")[
                1],
            fut,
            ref.mean("time"),
            input_core_dims=[["realization", "time"], ["realization"]],
            output_core_dims=[["realization"]],
            vectorize=True,
            dask="parallelized",
            output_dtypes=[float],
        )
        # When p < p_change, the hypothesis of no significant change is rejected.
        changed = pvals < p_change
    elif test == "welch-ttest":
        p_change = kwargs.setdefault("p_change", 0.05)

        # Test hypothesis of no significant change
        # equal_var=False -> Welch's T-test
        pvals = xr.apply_ufunc(
            lambda f, r: spstats.ttest_ind(
                f, r, axis=-1, equal_var=False, nan_policy="omit")[1],
            fut,
            ref,
            input_core_dims=[["realization", "time"], ["realization", "time"]],
            output_core_dims=[["realization"]],
            exclude_dims={"time"},
            vectorize=True,
            dask="parallelized",
            output_dtypes=[float],
        )

        # When p < p_change, the hypothesis of no significant change is rejected.
        changed = pvals < p_change
    elif test == "threshold":
        if "abs_thresh" in kwargs and "rel_thresh" not in kwargs:
            changed = abs(delta) > kwargs["abs_thresh"]
        elif "rel_thresh" in kwargs and "abs_thresh" not in kwargs and ref is not None:
            changed = abs(delta / ref.mean("time")) > kwargs["rel_thresh"]
        else:
            raise ValueError(
                "Invalid argument combination for test='threshold'.")
    elif test is not None:
        raise ValueError(
            f"Statistical test {test} must be one of {', '.join(test_params.keys())}."
        )

    if test is not None:
        delta_chng = delta.where(changed)
        change_frac = changed.sum("realization") / n_valid_real
    else:
        delta_chng = delta
        change_frac = xr.ones_like(delta.isel(realization=0))

    # Test that models agree on the sign of the change
    # This returns NaN (because of 0 / 0) where no member shows significant change.
    pos_frac = (delta_chng > 0).sum("realization") / (change_frac *
                                                      n_valid_real)

    # Metadata
    # `.get` guards against test=None, which has no entry in test_params
    kwargs_str = ", ".join(
        [f"{k}: {v}" for k, v in kwargs.items() if k in test_params.get(test, [])])
    test_str = (
        f"Significant change was tested with test {test} with parameters {kwargs_str}."
    )
    das = {"fut": fut} if ref is None else {"fut": fut, "ref": ref}
    pos_frac.attrs.update(
        description=
        "Fraction of members showing significant change that agree on a positive change. "
        + test_str,
        units="",
        test=str(test),
        history=update_history(
            f"pos_frac from change_significance(fut=fut, ref=ref, test={test}, {kwargs_str})",
            **das,
        ),
    )
    change_frac.attrs.update(
        description="Fraction of members showing significant change. " +
        test_str,
        units="",
        test=str(test),
        history=update_history(
            f"change_frac from change_significance(fut=fut, ref=ref, test={test}, {kwargs_str})",
            **das,
        ),
    )
    return change_frac, pos_frac
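
# A small sketch of how the docstring table above can be used to recover member
# counts from the two returned fractions; the numbers below are made up.
n_members = 10          # fut.realization.size
change_frac_val = 0.7   # fraction of members with significant change
pos_frac_val = 0.8      # of those, fraction with a positive change

n_changed = change_frac_val * n_members                     # 7 members changed
n_positive = pos_frac_val * change_frac_val * n_members     # ~6 changed positively
n_negative = (1 - pos_frac_val) * change_frac_val * n_members
n_unchanged = (1 - change_frac_val) * n_members             # 3 members unchanged
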
Example #22
0
def truncate_dataarray(dataarray,
                       quantile_dims,
                       replace_with_mean=False,
                       mean_dims=None,
                       weights=None,
                       quantiles=None,
                       extra_dim=None):
    r"""Truncates the dataarray over the given dimensions, meaning that data
    outside the upper and lower quantiles, which are taken across the
    dimensions ``quantile_dims``, are replaced either with:
    1. the upper and lower quantiles themselves.
    2. or with the mean of the inlier data, which is taken across the
       dimensions given by ``mean_dims``.

    **Note**: If weights are given, then weighted-quantiles and weighted-means
    are taken, otherwise the quantiles and means are unweighted.

    Args:
        dataarray (xarray.DataArray):
            dataarray that has at least the dimensions given by
            ``quantile_dims``, and if ``replace_with_mean`` is True, then also
            ``mean_dims``.
        replace_with_mean (bool, optional):
            If True, then replace values outside of the upper and lower
            quantiles with the mean across the dimensions given by
            `mean_dims`; if False, then replace with the upper and lower bounds
            themselves.
        mean_dims (list[str], optional):
            dimensions to take mean within the bounds over
        quantile_dims (list[str]):
            dimensions to take quantiles over -- the quantiles are
            used to make the bounds.
        weights (xarray.DataArray, optional):
            Must have one dimension and can have up to two dimensions.
        quantiles (tuple[float, float] | list[float, float], optional):
            The tuple of two floats representing the quantiles to take.
        extra_dim (str):
            Extra dimension that exists in `weights` and `dataarray`. It should
            not be in `quantile_dims`.
    Returns:
        (xarray.DataArray):
            Same shape as the original array, but with truncated values.
    Raises:
        (ValueError):
            If `replace_with_mean` is True, and `mean_dims` is not list of
            strings.
    """
    LOGGER.debug("Entering the `truncate_dataarray` function")

    LOGGER.debug("quantile_dims:{}".format(quantile_dims))
    LOGGER.debug("replace_with_mean:{}".format(replace_with_mean))
    LOGGER.debug("mean_dims:{}".format(mean_dims))
    LOGGER.debug("weights:{}".format(weights))
    LOGGER.debug("quantiles:{}".format(quantiles))
    LOGGER.debug("extra_dim:{}".format(extra_dim))

    if replace_with_mean and not mean_dims:
        mean_dims_err_msg = (
            "If `replace_with_mean` is True, then `mean_dims` "
            "must be a list of strings")
        LOGGER.error(mean_dims_err_msg)
        raise ValueError(mean_dims_err_msg)
    else:
        pass  # otherwise `mean_dims` may legitimately be None

    quantiles = (Quantiles(
        *sorted(quantiles)) if quantiles else Quantiles(0.05, 0.95))

    if weights is not None:
        quantile_values = weighted_quantile_with_extra_dim(
            dataarray, quantiles, list(quantile_dims), weights, extra_dim)
    else:
        quantile_values = dataarray.quantile(quantiles,
                                             dim=list(quantile_dims))
    lower_da = quantile_values.sel(quantile=quantiles.lower)
    upper_da = quantile_values.sel(quantile=quantiles.upper)

    if replace_with_mean:
        good_indexes = (dataarray >= lower_da) & (dataarray <= upper_da)
        inside_da = dataarray.where(good_indexes)
        outside_da = dataarray.where(~good_indexes)

        if weights is not None:
            inside_mean_da = weighted_mean_with_extra_dim(
                inside_da, mean_dims, weights, extra_dim)
        else:
            inside_mean_da = inside_da.mean(mean_dims)

        truncated_da = (inside_da.combine_first(
            xr.ones_like(outside_da) * inside_mean_da))
    else:
        expanded_lower_da, _ = xr.broadcast(lower_da, dataarray)
        expanded_lower_da = expanded_lower_da.transpose(*dataarray.coords.dims)

        expanded_upper_da, _ = xr.broadcast(upper_da, dataarray)
        expanded_upper_da = expanded_upper_da.transpose(*dataarray.coords.dims)

        truncated_da = dataarray.clip(min=expanded_lower_da,
                                      max=expanded_upper_da)
    LOGGER.debug("Leaving the `truncate_dataarray` function")
    return truncated_da
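
# A minimal sketch of the quantile-clipping branch above (replace_with_mean=False)
# using plain xarray on a toy 1-D array; dimension and variable names are made up.
import numpy as np
import xarray as xr

da = xr.DataArray(np.random.randn(100), dims="draw")
lower = da.quantile(0.05, dim="draw")
upper = da.quantile(0.95, dim="draw")
clipped = da.clip(min=lower, max=upper)   # values outside the quantiles hit the bounds
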
Example #23
0
def region(xds,
           name='region1',
           ra=None,
           dec=None,
           pixels=None,
           pol=-1,
           channels=-1):
    """
    Create a new region Data variable in the Dataset \n
    
    .. note:: This function currently only supports rectangles and integer pixel boundaries
    
    Parameters
    ----------
    xds : xarray.core.dataset.Dataset
        input image dataset
    name : str
        dataset variable name for region, overwrites if already present
    ra : list
        right ascension coordinate range in the form of [min, max]. Default None means all
    dec : list
        declination coordinate range in the form of [min, max]. Default None means all
    pixels : array_like
        array of shape (N,2) containing pixel box. OR'd with ra/dec
    pol : int or list
        polarization dimension(s) to include in region.  Default of -1 means all
    channels : int or list
        channel dimension(s) to include in region.  Default of -1 means all
        
    Returns
    -------
    xarray.core.dataset.Dataset
        New Dataset
    """
    import numpy as np
    import dask.array as da
    import xarray as xr

    # type checking/conversion
    if not name.strip(): name = 'regionX'
    if ra is None: ra = [0.0, 0.0]
    if dec is None: dec = [0.0, 0.0]
    if pixels is None: pixels = np.zeros((1, 2), dtype=int) - 1
    pixels = np.array(pixels, dtype=int)
    if (pixels.ndim != 2) or (pixels.shape[1] != 2):
        print('ERROR: pixels parameter not a (N,2) array')
        return None
    pol = np.array(np.atleast_1d(pol), dtype=int)
    if pol[0] == -1: pol = list(range(len(xds['pol'])))
    channels = np.array(np.atleast_1d(channels), dtype=int)
    if channels[0] == -1: channels = list(range(len(xds['chan'])))

    # TBD: allow arbitrary pixels, not just rectangles
    #ind_x = xr.DataArray(list(pixels[:,0]), dims=['d0'])
    #ind_y = xr.DataArray(list(pixels[:,1]), dims=['d1'])
    #region = xds.image[ind_x, ind_y]

    # TESTING only
    # ra = [2.88788, 2.88793]
    # dec = [-0.60573, -0.60568]
    # pixels = np.array([[20,40],[80,500]])

    # define region within ra/dec range
    region = xr.ones_like(xds.image, dtype=bool).where(
        (xds.right_ascension > np.min(ra)) & (xds.right_ascension < np.max(ra))
        & (xds.declination > np.min(dec)) & (xds.declination < np.max(dec)),
        False)

    # OR pixel values with ra/dec values
    #region = region | xr.ones_like(xds.image,dtype=bool).where(xds.d0.isin(pixels[:,0]) &
    #                                                           xds.d1.isin(pixels[:,1]), False)
    region = region | xr.ones_like(xds.image, dtype=bool).where(
        (xds.d0 > np.min(pixels[:, 0])) & (xds.d0 < np.max(pixels[:, 0])) &
        (xds.d1 > np.min(pixels[:, 1])) &
        (xds.d1 < np.max(pixels[:, 1])), False)

    # apply polarization and channels selections
    region = region.where(xds.pol.isin(xds.pol[pol]), False)
    region = region.where(xds.chan.isin(xds.chan[channels]), False)

    # assign the region variable to the image dataset
    xds = xds.assign(dict([(name, region)]))
    return xds
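
# A standalone sketch of the ones_like(...).where(..., False) masking pattern used
# above, on a toy 2-D image; the coordinate names d0/d1 mirror the ones assumed above.
import numpy as np
import xarray as xr

img = xr.DataArray(np.random.rand(4, 5), dims=("d0", "d1"),
                   coords={"d0": np.arange(4), "d1": np.arange(5)})
box = xr.ones_like(img, dtype=bool).where(
    (img.d0 >= 1) & (img.d0 <= 2) & (img.d1 >= 1) & (img.d1 <= 3), False)
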
Example #24
0
def cutout(
    od,
    varList=None,
    YRange=None,
    XRange=None,
    add_Hbdr=False,
    mask_outside=False,
    ZRange=None,
    add_Vbdr=False,
    timeRange=None,
    timeFreq=None,
    sampMethod="snapshot",
    dropAxes=False,
    transformation=False,
    centered="Atlantic",
):
    """
    Cutout the original dataset in space and time
    preserving the original grid structure.

    Parameters
    ----------
    od: OceanDataset
        oceandataset to subsample
    varList: 1D array_like, str, or None
        List of variables (strings).
    YRange: 1D array_like, scalar, or None
        Y axis limits (e.g., latitudes).
        If len(YRange)>2, max and min values are used.
    XRange: 1D array_like, scalar, or None
        X axis limits (e.g., longitudes).
        If len(XRange)>2, max and min values are used.
    add_Hbdr: bool or scalar
        If scalar, expand the horizontal ranges by adding and subtracting
        `add_Hbdr`.
        If True, automatically estimate add_Hbdr.
        If False, add_Hbdr is set to zero.
    mask_outside: bool
        If True, set all values in areas outside specified (Y,X)ranges to NaNs.
        (Useful for curvilinear grids).
    ZRange: 1D array_like, scalar, or None
        Z axis limits.
        If len(ZRange)>2, max and min values are used.
    add_Vbdr: bool or scalar
        If scalar, expand the vertical range by adding and subtracting
        `add_Vbdr`.
        If True, automatically estimate add_Vbdr.
        If False, add_Vbdr is set to zero.
    timeRange: 1D array_like, numpy.ScalarType, or None
        time axis limits.
        If len(timeRange)>2, max and min values are used.
    timeFreq: str or None
        Time frequency.
        Available options are pandas Offset Aliases (e.g., '6H'):
        http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    sampMethod: {'snapshot', 'mean'}
        Downsampling method (only if timeFreq is not None).
    dropAxes: 1D array_like, str, or bool
        List of axes to remove from the Grid object
        if only one point falls within the range.
        If True, set dropAxes=od.grid_coords.
        If False, preserve original grid.
    transformation: str or bool
        Name of the transformation of the llc grid into a new one in which
        face is no longer a dimension. Default is `False`. If a transformation
        is given, `centered` defines how the data will be centered.
    centered: str or bool
        Default is `Atlantic`; the other option is `Pacific`. This sets which
        ocean appears centered in the data.

    Returns
    -------
    od: OceanDataset
        Subsampled oceandataset

    Notes
    -----
    If any of the horizontal ranges is not None,
    the horizontal dimensions of the cutout will have
    len(Xp1)>len(X) and len(Yp1)>len(Y)
    even if the original oceandataset had
    len(Xp1)==len(X) or len(Yp1)==len(Y).
    """

    # Checks
    unsupported_dims = ["mooring", "particle", "station"]
    check1 = XRange is not None or YRange is not None
    if check1 and any([dim in unsupported_dims for dim in od._ds.dims]):
        _warnings.warn(
            "\nHorizontal cutout not supported"
            "for moorings, surveys, and particles",
            stacklevel=2,
        )
        XRange = None
        YRange = None

    _check_instance(
        {
            "od": od,
            "add_Hbdr": add_Hbdr,
            "mask_outside": mask_outside,
            "timeFreq": timeFreq,
        },
        {
            "od": "oceanspy.OceanDataset",
            "add_Hbdr": "(float, int, bool)",
            "mask_outside": "bool",
            "timeFreq": ["type(None)", "str"],
        },
    )
    varList = _check_list_of_string(varList, "varList")
    YRange = _check_range(od, YRange, "YRange")
    XRange = _check_range(od, XRange, "XRange")
    ZRange = _check_range(od, ZRange, "ZRange")
    timeRange = _check_range(od, timeRange, "timeRange")
    sampMethod_list = ["snapshot", "mean"]

    if sampMethod not in sampMethod_list:
        raise ValueError("`sampMethod` [{}] is not supported."
                         "\nAvailable options: {}"
                         "".format(sampMethod, sampMethod_list))

    if not isinstance(dropAxes, bool):
        dropAxes = _check_list_of_string(dropAxes, "dropAxes")
        axes_warn = [axis for axis in dropAxes if axis not in od.grid_coords]
        if len(axes_warn) != 0:
            _warnings.warn(
                "\n{} are not axes of the oceandataset"
                "".format(axes_warn),
                stacklevel=2,
            )
            dropAxes = list(set(dropAxes) - set(axes_warn))
        dropAxes = {d: od.grid_coords[d] for d in dropAxes}
    elif dropAxes is True:
        dropAxes = od.grid_coords
        if YRange is None:
            dropAxes.pop("Y", None)
        if XRange is None:
            dropAxes.pop("X", None)
        if ZRange is None:
            dropAxes.pop("Z", None)
        if timeRange is None:
            dropAxes.pop("time", None)
    else:
        dropAxes = {}

    # Message
    print("Cutting out the oceandataset.")

    # Copy
    od = _copy.copy(od)

    # list for coord variables
    co_list = [var for var in od._ds.coords if var not in od._ds.dims]
    # Drop variables
    if varList is not None:
        # Make sure it's a list
        varList = list(varList)
        varList = varList + co_list
        varList = _rename_aliased(od, varList)

        # Compute missing variables
        od = _compute._add_missing_variables(od, varList)
        # Drop useless
        nvarlist = [v for v in od._ds.data_vars if v not in varList]
        od._ds = od._ds.drop_vars(nvarlist)
    else:  # this way, if applicable, llc_transf gets applied to all vars
        varList = [var for var in od._ds.reset_coords().data_vars]

    # Unpack
    ds = od._ds
    periodic = od.grid_periodic

    # ---------------------------
    # Time CUTOUT
    # ---------------------------
    # Initialize time mask
    maskT = _xr.ones_like(ds["time"]).astype("int")

    if timeRange is not None:

        # Use arrays
        timeRange = _np.asarray([_np.min(timeRange),
                                 _np.max(timeRange)]).astype(ds["time"].dtype)

        # Get the closest
        for i, time in enumerate(timeRange):
            if _np.issubdtype(ds["time"].dtype, _np.datetime64):
                diff = _np.fabs(ds["time"].astype("float64") -
                                time.astype("float64"))
            else:
                diff = _np.fabs(ds["time"] - time)
            timeRange[i] = ds["time"].where(diff == diff.min(),
                                            drop=True).min().values
        maskT = maskT.where(
            _np.logical_and(ds["time"] >= timeRange[0],
                            ds["time"] <= timeRange[-1]), 0)

        # Find time indexes
        maskT = maskT.assign_coords(time=_np.arange(len(maskT["time"])))
        dmaskT = maskT.where(maskT, drop=True)
        dtime = dmaskT["time"].values
        iT = [min(dtime), max(dtime)]
        maskT["time"] = ds["time"]

        # Indexes
        if iT[0] == iT[1]:
            if "time" not in dropAxes:
                if iT[0] > 0:
                    iT[0] = iT[0] - 1
                else:
                    iT[1] = iT[1] + 1
        else:
            dropAxes.pop("time", None)

        # Cutout
        ds = ds.isel(time=slice(iT[0], iT[1] + 1))
        if "time_midp" in ds.dims:
            if "time" in dropAxes:
                if iT[0] == len(ds["time_midp"]):
                    iT[0] = iT[0] - 1
                    iT[1] = iT[1] - 1
                ds = ds.isel(time_midp=slice(iT[0], iT[1] + 1))
            else:
                ds = ds.isel(time_midp=slice(iT[0], iT[1]))

    # ---------------------------
    # Vertical CUTOUT
    # ---------------------------
    # Initialize vertical mask
    maskV = _xr.ones_like(ds["Zp1"])

    if ZRange is not None:
        # Use arrays
        ZRange = _np.asarray(
            [_np.min(ZRange) - add_Vbdr,
             _np.max(ZRange) + add_Vbdr])
        ZRange = ZRange.astype(ds["Zp1"].dtype)

        # Get the closest
        for i, Z in enumerate(ZRange):
            diff = _np.fabs(ds["Zp1"] - Z)
            ZRange[i] = ds["Zp1"].where(diff == diff.min()).min().values
        maskV = maskV.where(
            _np.logical_and(ds["Zp1"] >= ZRange[0], ds["Zp1"] <= ZRange[-1]),
            0)

        # Find vertical indexes
        maskV = maskV.assign_coords(Zp1=_np.arange(len(maskV["Zp1"])))
        dmaskV = maskV.where(maskV, drop=True)
        dZp1 = dmaskV["Zp1"].values
        iZ = [_np.min(dZp1), _np.max(dZp1)]
        maskV["Zp1"] = ds["Zp1"]

        # Indexes
        if iZ[0] == iZ[1]:
            if "Z" not in dropAxes:
                if iZ[0] > 0:
                    iZ[0] = iZ[0] - 1
                else:
                    iZ[1] = iZ[1] + 1
        else:
            dropAxes.pop("Z", None)

        # Cutout
        ds = ds.isel(Zp1=slice(iZ[0], iZ[1] + 1))
        if "Z" in dropAxes:
            if iZ[0] == len(ds["Z"]):
                iZ[0] = iZ[0] - 1
                iZ[1] = iZ[1] - 1
            ds = ds.isel(Z=slice(iZ[0], iZ[1] + 1))
        else:
            ds = ds.isel(Z=slice(iZ[0], iZ[1]))

        if len(ds["Zp1"]) == 1:
            if "Zu" in ds.dims and len(ds["Zu"]) > 1:
                ds = ds.sel(Zu=ds["Zp1"].values, method="nearest")
            if "Zl" in ds.dims and len(ds["Zl"]) > 1:
                ds = ds.sel(Zl=ds["Zp1"].values, method="nearest")
        else:
            if "Zu" in ds.dims and len(ds["Zu"]) > 1:
                ds = ds.isel(Zu=slice(iZ[0], iZ[1]))
            if "Zl" in ds.dims and len(ds["Zl"]) > 1:
                ds = ds.isel(Zl=slice(iZ[0], iZ[1]))

    # ---------------------------
    # Horizontal CUTOUT (part I, split into two to avoid repeated code)
    # ---------------------------
    if add_Hbdr is True:
        add_Hbdr = _np.mean([
            _np.fabs(od._ds["XG"].max() - od._ds["XG"].min()),
            _np.fabs(od._ds["YG"].max() - od._ds["YG"].min()),
        ])
        add_Hbdr = add_Hbdr / _np.mean([len(od._ds["X"]), len(od._ds["Y"])])
    elif add_Hbdr is False:
        add_Hbdr = 0

    if add_Vbdr is True:
        add_Vbdr = _np.fabs(od._ds["Zp1"].diff("Zp1")).max().values
    elif add_Vbdr is False:
        add_Vbdr = 0

    # Initialize horizontal mask
    if XRange is not None or YRange is not None:

        maskH, dmaskH, XRange, YRange = get_maskH(ds, add_Hbdr, add_Vbdr,
                                                  XRange, YRange)

    if transformation is not False and "face" in ds.dims:
        if XRange is None and YRange is None:
            faces = "all"
        else:
            faces = dmaskH["face"].values  # gets faces that survives cutout
        _transf_list = ["arctic_crown", "arctic_centered"]
        if transformation in _transf_list:
            arg = {
                "ds": ds,
                "varlist": varList,  # vars and grid coords to transform
                "centered": centered,
                "faces": faces,
                "drop": True,  # required to calculate U-V grid points
            }
            if transformation == "arctic_crown":
                _transformation = _llc_trans.arctic_crown
            elif transformation == "arctic_centered":
                _transformation = _llc_trans.arctic_centered
            dsnew = _transformation(**arg)
            dsnew = dsnew.set_coords(co_list)
            grid_coords = od.grid_coords
            od._ds = dsnew
            manipulate_coords = {"coordsUVfromG": True}
            od = od.manipulate_coords(**manipulate_coords)
            if len(grid_coords["time"]) > 1:
                grid_coords["time"].pop("time_midp", None)
                grid_coords = {"add_midp": True, "grid_coords": grid_coords}
            od = od.set_grid_coords(**grid_coords, overwrite=True)
            od._ds.attrs["OceanSpy_description"] = "Cutout of"
            "simulation, with simple topology (face not a dimension)"
            # Unpack the new dataset without face as dimension
            ds = od._ds
            maskH, dmaskH, XRange, YRange = get_maskH(ds, add_Hbdr, add_Vbdr,
                                                      XRange, YRange)
        elif transformation not in _transf_list:
            raise ValueError("transformation not supported")
    elif transformation is False and "face" in ds.dims:
        raise ValueError("Must define a transformation to remove complex"
                         "topology of dataset.")

    # ---------------------------
    # Horizontal CUTOUT part II (continuation of original code)
    # ---------------------------

    if XRange is not None or YRange is not None:
        dYp1 = dmaskH["Yp1"].values
        dXp1 = dmaskH["Xp1"].values
        iY = [_np.min(dYp1), _np.max(dYp1)]
        iX = [_np.min(dXp1), _np.max(dXp1)]
        maskH["Yp1"] = ds["Yp1"]
        maskH["Xp1"] = ds["Xp1"]

        # Original length
        lenY = len(ds["Yp1"])
        lenX = len(ds["Xp1"])

        # Indexes
        if iY[0] == iY[1]:
            if "Y" not in dropAxes:
                if iY[0] > 0:
                    iY[0] = iY[0] - 1
                else:
                    iY[1] = iY[1] + 1
        else:
            dropAxes.pop("Y", None)

        if iX[0] == iX[1]:
            if "X" not in dropAxes:
                if iX[0] > 0:
                    iX[0] = iX[0] - 1
                else:
                    iX[1] = iX[1] + 1
        else:
            dropAxes.pop("X", None)

        ds = ds.isel(Yp1=slice(iY[0], iY[1] + 1), Xp1=slice(iX[0], iX[1] + 1))

        Xcoords = od._grid.axes["X"].coords
        if "X" in dropAxes:
            if iX[0] == len(ds["X"]):
                iX[0] = iX[0] - 1
                iX[1] = iX[1] - 1
            ds = ds.isel(X=slice(iX[0], iX[1] + 1))
        elif ("outer" in Xcoords
              and Xcoords["outer"] == "Xp1") or ("left" in Xcoords
                                                 and Xcoords["left"] == "Xp1"):
            ds = ds.isel(X=slice(iX[0], iX[1]))
        elif "right" in Xcoords and Xcoords["right"] == "Xp1":
            ds = ds.isel(X=slice(iX[0] + 1, iX[1] + 1))

        Ycoords = od._grid.axes["Y"].coords
        if "Y" in dropAxes:
            if iY[0] == len(ds["Y"]):
                iY[0] = iY[0] - 1
                iY[1] = iY[1] - 1
            ds = ds.isel(Y=slice(iY[0], iY[1] + 1))
        elif ("outer" in Ycoords
              and Ycoords["outer"] == "Yp1") or ("left" in Ycoords
                                                 and Ycoords["left"] == "Yp1"):
            ds = ds.isel(Y=slice(iY[0], iY[1]))
        elif "right" in Ycoords and Ycoords["right"] == "Yp1":
            ds = ds.isel(Y=slice(iY[0] + 1, iY[1] + 1))

        # Cut axis can't be periodic
        if (len(ds["Yp1"]) < lenY or "Y" in dropAxes) and "Y" in periodic:
            periodic.remove("Y")
        if (len(ds["Xp1"]) < lenX or "X" in dropAxes) and "X" in periodic:
            periodic.remove("X")

    # ---------------------------
    # Horizontal MASK
    # ---------------------------

    if mask_outside and (YRange is not None or XRange is not None):
        if YRange is not None:
            minY = YRange[0]
            maxY = YRange[1]
        else:
            minY = ds["YG"].min().values
            maxY = ds["YG"].max().values
        if XRange is not None:
            minX = XRange[0]
            maxX = XRange[1]
        else:
            minX = ds["XG"].min().values
            maxX = ds["XG"].max().values

        maskC = _xr.where(
            _np.logical_and(
                _np.logical_and(ds["YC"] >= minY, ds["YC"] <= maxY),
                _np.logical_and(ds["XC"] >= minX, ds["XC"] <= maxX),
            ),
            1,
            0,
        ).persist()
        maskG = _xr.where(
            _np.logical_and(
                _np.logical_and(ds["YG"] >= minY, ds["YG"] <= maxY),
                _np.logical_and(ds["XG"] >= minX, ds["XG"] <= maxX),
            ),
            1,
            0,
        ).persist()
        maskU = _xr.where(
            _np.logical_and(
                _np.logical_and(ds["YU"] >= minY, ds["YU"] <= maxY),
                _np.logical_and(ds["XU"] >= minX, ds["XU"] <= maxX),
            ),
            1,
            0,
        ).persist()
        maskV = _xr.where(
            _np.logical_and(
                _np.logical_and(ds["YV"] >= minY, ds["YV"] <= maxY),
                _np.logical_and(ds["XV"] >= minX, ds["XV"] <= maxX),
            ),
            1,
            0,
        ).persist()

        for var in ds.data_vars:
            if set(["X", "Y"]).issubset(ds[var].dims):
                ds[var] = ds[var].where(maskC, drop=True)
            elif set(["Xp1", "Yp1"]).issubset(ds[var].dims):
                ds[var] = ds[var].where(maskG, drop=True)
            elif set(["Xp1", "Y"]).issubset(ds[var].dims):
                ds[var] = ds[var].where(maskU, drop=True)
            elif set(["X", "Yp1"]).issubset(ds[var].dims):
                ds[var] = ds[var].where(maskV, drop=True)

    # ---------------------------
    # TIME RESAMPLING
    # ---------------------------
    # Resample in time
    if timeFreq:

        # Infer original frequency
        inFreq = _pd.infer_freq(ds.time.values)
        if timeFreq[0].isdigit() and not inFreq[0].isdigit():
            inFreq = "1" + inFreq

        # Same frequency: Skip
        if timeFreq == inFreq:
            _warnings.warn(
                "\nInput time freq:"
                "[{}] = Output time frequency: [{}]:"
                "\nSkip time resampling."
                "".format(inFreq, timeFreq),
                stacklevel=2,
            )

        else:

            # Remove time_midp and warn
            vars2drop = [
                var for var in ds.variables if "time_midp" in ds[var].dims
            ]
            if vars2drop:
                _warnings.warn(
                    "\nTime resampling drops variables"
                    " on `time_midp` dimension."
                    "\nDropped variables: {}."
                    "".format(vars2drop),
                    stacklevel=2,
                )
                ds = ds.drop_vars(vars2drop)

            # Snapshot
            if sampMethod == "snapshot":
                # Find new times
                time2sel = ds["time"].resample(time=timeFreq).first()
                newtime = ds["time"].sel(time=time2sel)

                # Use slice when possible
                inds = [
                    i for i, t in enumerate(ds["time"].values)
                    if t in newtime.values
                ]
                inds_diff = _np.diff(inds)
                if all(inds_diff == inds_diff[0]):
                    ds = ds.isel(time=slice(inds[0], inds[-1] +
                                            1, inds_diff[0]))
                else:
                    attrs = ds.attrs
                    ds = _xr.concat(
                        [ds.sel(time=time) for i, time in enumerate(newtime)],
                        dim="time",
                    )
                    ds.attrs = attrs

            else:
                # Mean
                # Separate time and timeless
                attrs = ds.attrs
                ds_dims = ds.drop_vars(
                    [var for var in ds.variables if var not in ds.dims])
                ds_time = ds.drop_vars([
                    var for var in ds.variables if "time" not in ds[var].dims
                ])
                ds_timeless = ds.drop_vars(
                    [var for var in ds.variables if "time" in ds[var].dims])

                # Resample
                ds_time = ds_time.resample(time=timeFreq).mean("time")

                # Add all dimensions to ds, and fix attributes
                for dim in ds_time.dims:
                    if dim == "time":
                        ds_time[dim].attrs = ds_dims[dim].attrs
                    else:
                        ds_time[dim] = ds_dims[dim]

                # Merge
                ds = _xr.merge([ds_time, ds_timeless])
                ds.attrs = attrs

    # Update oceandataset
    od._ds = ds

    # Add time midp
    if timeFreq and "time" not in dropAxes:
        od = od.set_grid_coords({
            **od.grid_coords, "time": {
                "time": -0.5
            }
        },
                                add_midp=True,
                                overwrite=True)

    # Drop axes
    grid_coords = od.grid_coords
    for coord in list(grid_coords):
        if coord in dropAxes:
            grid_coords.pop(coord, None)
    od = od.set_grid_coords(grid_coords, overwrite=True)

    # Cut axis can't be periodic
    od = od.set_grid_periodic(periodic)

    return od
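
# A standalone sketch of the index-window pattern used repeatedly above: build a
# 0/1 mask on a coordinate, find the surviving indexes, then cut with isel/slice.
# The toy vertical grid below is made up.
import numpy as np
import xarray as xr

ds_toy = xr.Dataset({"v": ("Zp1", np.arange(10.0))},
                    coords={"Zp1": np.linspace(0.0, -90.0, 10)})
mask = xr.ones_like(ds_toy["Zp1"]).where(
    (ds_toy["Zp1"] <= -20.0) & (ds_toy["Zp1"] >= -60.0), 0)
idx = np.where(mask.values)[0]
ds_cut = ds_toy.isel(Zp1=slice(idx.min(), idx.max() + 1))
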
Example #25
0
def roc(
    observations,
    forecasts,
    bin_edges="continuous",
    dim=None,
    drop_intermediate=False,
    return_results="area",
):
    """Computes the relative operating characteristic for a range of thresholds.

    Parameters
    ----------
    observations : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
        If ``bin_edges=='continuous'``, observations are binary.
    forecasts : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
        If ``bin_edges=='continuous'``, forecasts are probabilities.
    bin_edges : array_like, str, default='continuous'
        Bin edges for categorising observations and forecasts. Similar to np.histogram, \
        all but the last (righthand-most) bin include the left edge and exclude the \
        right edge. The last bin includes both edges. ``bin_edges`` will be sorted in \
        ascending order. If ``bin_edges=='continuous'``, calculate ``bin_edges`` from \
        forecasts, equal to ``sklearn.metrics.roc_curve(f_boolean, o_prob)``.
    dim : str, list
        The dimension(s) over which to compute the contingency table
    drop_intermediate : bool, default=False
        Whether to drop some suboptimal thresholds which would not appear on a plotted
        ROC curve. This is useful in order to create lighter ROC curves.
        Defaults to ``True`` in ``sklearn.metrics.roc_curve``.
    return_results: str, default='area'
        Specify how the return value is structured:

            - 'area': return only the ``area under curve`` of ROC

            - 'all_as_tuple': return ``true positive rate`` and ``false positive rate``
              at each bin and area under the curve of ROC as tuple

            - 'all_as_metric_dim': return ``true positive rate`` and
              ``false positive rate`` at each bin and ``area under curve`` of ROC
              concatenated into a new ``metric`` dimension

    Returns
    -------
    xarray.Dataset or xarray.DataArray :
        reduced by dimensions ``dim``, see ``return_results`` parameter.
        ``true positive rate`` and ``false positive rate`` contain
        ``probability_bin`` dimension with ascending ``bin_edges`` as coordinates.


    Examples
    --------
    >>> f = xr.DataArray(
    ...     np.random.normal(size=(1000)),
    ...     coords=[('time', np.arange(1000))]
    ... )
    >>> o = xr.DataArray(
    ...    np.random.normal(size=(1000)),
    ...    coords=[('time', np.arange(1000))]
    ... )
    >>> category_edges = np.linspace(-2, 2, 5)
    >>> roc(o, f, category_edges, dim=['time'])
    <xarray.DataArray 'histogram_observations_forecasts' ()>
    array(0.46812223)

    See also
    --------
    xskillscore.Contingency
    sklearn.metrics.roc_curve

    References
    ----------
    http://www.cawcr.gov.au/projects/verification/
    """

    if dim is None:
        dim = list(forecasts.dims)
    if isinstance(dim, str):
        dim = [dim]

    continuous = False
    if isinstance(bin_edges, str):
        if bin_edges == "continuous":
            continuous = True
            # check that observations are binary
            if isinstance(observations, xr.Dataset):
                o_check = observations.to_array()
            else:
                o_check = observations
            if str(o_check.dtype) != "bool":
                if not ((o_check == 0) | (o_check == 1)).all():
                    raise ValueError(
                        'Input "observations" must represent logical (True/False) outcomes',
                        o_check,
                    )

            # continuous bin edges only work for single-variable Datasets
            if isinstance(forecasts, xr.Dataset):
                varlist = list(forecasts.data_vars)
                if len(varlist) == 1:
                    v = varlist[0]
                else:
                    raise ValueError(
                        "Only works for `xr.Dataset` with one variable, found"
                        f"{forecasts.data_vars}. Considering looping over `data_vars`"
                        "or `.to_array()`.")
                f_bin = forecasts[v]
            else:
                f_bin = forecasts
            f_bin = f_bin.stack(ndim=forecasts.dims)
            f_bin = f_bin.sortby(-f_bin)
            bin_edges = np.append(f_bin[0] + 1, f_bin)
            bin_edges = np.unique(bin_edges)  # ensure that in ascending order
        else:
            raise ValueError("If bin_edges is str, it can only be continuous.")
    else:
        bin_edges = np.sort(bin_edges)  # ensure that in ascending order

    # loop over each bin_edge and get true positive rate and false positive rate
    # from contingency
    tpr, fpr = [], []
    for i in bin_edges:
        dichotomous_category_edges = np.array(
            [-np.inf, i, np.inf])  # "dichotomous" means two-category
        dichotomous_contingency = Contingency(
            observations,
            forecasts,
            dichotomous_category_edges,
            dichotomous_category_edges,
            dim=dim,
        )
        fpr.append(dichotomous_contingency.false_alarm_rate())
        tpr.append(dichotomous_contingency.hit_rate())
    tpr = xr.concat(tpr, "probability_bin")
    fpr = xr.concat(fpr, "probability_bin")
    tpr["probability_bin"] = bin_edges
    fpr["probability_bin"] = bin_edges

    fpr = fpr.fillna(1.0)
    tpr = tpr.fillna(0.0)

    # pad (0,0) and (1,1)
    fpr_pad = xr.concat(
        [
            xr.ones_like(fpr.isel(probability_bin=0, drop=False)),
            fpr,
            xr.zeros_like(fpr.isel(probability_bin=-1, drop=False)),
        ],
        "probability_bin",
    )
    tpr_pad = xr.concat(
        [
            xr.ones_like(tpr.isel(probability_bin=0, drop=False)),
            tpr,
            xr.zeros_like(tpr.isel(probability_bin=-1, drop=False)),
        ],
        "probability_bin",
    )

    if drop_intermediate and fpr.probability_bin.size > 2:

        fpr, tpr = _drop_intermediate(fpr, tpr)
        fpr_pad, tpr_pad = _drop_intermediate(fpr_pad, tpr_pad)

    area = _auc(fpr_pad, tpr_pad)

    if continuous:
        # sklearn returns in reversed order
        fpr = fpr.sortby(-fpr.probability_bin)
        tpr = tpr.sortby(-fpr.probability_bin)

    # mask always nan
    def _keep_masked(new, ori, dim):
        """Keep mask from `ori` deprived of dimensions from `dim` in input `new`."""
        isel_dim = {d: 0 for d in forecasts.dims if d in dim}
        mask = ori.isel(isel_dim, drop=True)
        new_masked = new.where(mask.notnull())
        return new_masked

    fpr = _keep_masked(fpr, forecasts, dim=dim)
    tpr = _keep_masked(tpr, forecasts, dim=dim)
    area = _keep_masked(area, forecasts, dim=dim)

    if return_results == "area":
        return area
    elif return_results == "all_as_metric_dim":
        results = xr.concat([fpr, tpr, area], "metric", coords="minimal")
        results["metric"] = [
            "false positive rate",
            "true positive rate",
            "area under curve",
        ]
        return results
    elif return_results == "all_as_tuple":
        return fpr, tpr, area
    else:
        raise NotImplementedError(
            "Expected `return_results` to be one of "
            f"['all_as_tuple', 'area', 'all_as_metric_dim'], found {return_results}")
Example #26
0
def add_bias_column(x_data: xa.DataArray) -> xa.DataArray:
    """Append a column of ones (a bias/intercept term) along the second dimension."""
    return xa.concat([x_data, xa.ones_like(x_data[:, 0])], x_data.dims[1])
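# Hypothetical usage of `add_bias_column`; the dimension names "sample" and
# "feature" are invented for illustration. The function appends a column of ones
# along the second dimension, e.g. as an intercept term for a linear model.
import numpy as np
import xarray as xa

x = xa.DataArray(np.random.rand(5, 3), dims=["sample", "feature"])
x_with_bias = add_bias_column(x)
print(x_with_bias.sizes["feature"])         # 4 (was 3)
print(x_with_bias.isel(feature=-1).values)  # [1. 1. 1. 1. 1.]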
Example #27
0
def rps(
    observations,
    forecasts,
    category_edges,
    dim=None,
    fair=False,
    weights=None,
    keep_attrs=False,
    member_dim="member",
):
    """Calculate Ranked Probability Score.

    .. math::
        RPS = \\sum_{m=1}^{M}[(\\sum_{k=1}^{m} y_k) - (\\sum_{k=1}^{m} o_k)]^{2}

    where ``y`` and ``o`` are forecast and observation probabilities in ``M``
    categories.

    .. note::
        Takes the sum over all categories as in Weigel et al. 2007 and not the mean as
        in https://www.cawcr.gov.au/projects/verification/verif_web_page.html#RPS.
        Therefore RPS has no upper boundary.

    Parameters
    ----------
    observations : xarray.Dataset or xarray.DataArray
        The observations of the event.
        Further requirements are specified based on ``category_edges``.
    forecasts : xarray.Dataset or xarray.DataArray
        The forecast of the event with dimension specified by ``member_dim``.
        Further requirements are specified based on ``category_edges``.
    category_edges : array_like, xr.Dataset, xr.DataArray, None
        Edges (left-edge inclusive) of the bins used to calculate the cumulative
        density function (cdf). Note that here the bins have to include the full range
        of observations and forecasts data. Effectively, negative infinity is appended
        to the left side of category_edges, and positive infinity is appended to the
        right side. Thus, N category edges produce N+1 bins. For example, specifying
        category_edges = [0,1] will compute the RPS for bins [-inf, 0), [0, 1) and
        [1, inf), which results in CDF bins [-inf, 0), [-inf, 1) and [-inf, inf).
        Note that the edges are right-edge exclusive.
        Forecasts, observations and category_edges must be given consistently,
        i.e. either all in absolute units or all as probabilities.

        - np.array (1d): will be internally converted and broadcast against observations.
          Use this if you wish to use the same category edges for all elements of both
          forecasts and observations.

        - xr.Dataset/xr.DataArray: edges of the categories provided
          as dimension ``category_edge`` with optional category labels as
          ``category_edge`` coordinate. Use this if your category edges are
          multi-dimensional and vary across dimensions of forecasts and
          observations, but are the same for both.

        - tuple of np.array/xr.Dataset/xr.DataArray: same as above, where the
          first item is taken as ``category_edges`` for observations and the
          second item as ``category_edges`` for forecasts. Use this if your
          category edges vary across dimensions and differ between forecasts
          and observations.

        - None: expects that observations and forecasts are already CDFs
          containing a ``category_edge`` dimension. Use this if you have
          already discretized observations and forecasts into cumulative
          distributions yourself.

    dim : str or list of str, optional
        Dimension(s) over which to average after computing ``rps``. This represents a
        mean over multiple forecasts-observations pairs. Defaults to None, implying
        averaging over all dimensions.
    fair : bool, optional
        Apply ensemble member-size adjustment for unbiased, fair metric; see Ferro (2013).
    weights : xr.DataArray with dimensions from dim, optional
        Weights for `weighted.mean(dim)`. Defaults to None, such that no weighting is
        applied.
    keep_attrs : bool
        If True, the attributes (attrs) will be copied from the first input to the new
        one. If False (default), the new object will be returned without attributes.
    member_dim : str, optional
        Name of ensemble member dimension. By default, 'member'.

    Returns
    -------
    xarray.Dataset or xarray.DataArray:
        ranked probability score with coords ``forecasts_category_edge`` and
        ``observations_category_edge`` as str


    Examples
    --------
    >>> observations = xr.DataArray(np.random.random(size=(3, 3)),
    ...                             coords=[('x', np.arange(3)),
    ...                                     ('y', np.arange(3))])
    >>> forecasts = xr.DataArray(np.random.random(size=(3, 3, 3)),
    ...                          coords=[('x', np.arange(3)),
    ...                                  ('y', np.arange(3)),
    ...                                  ('member', np.arange(3))])
    >>> category_edges = np.array([.33, .66])
    >>> xs.rps(observations, forecasts, category_edges, dim='x')
    <xarray.DataArray (y: 3)>
    array([0.14814815, 0.7037037 , 1.51851852])
    Coordinates:
      * y                           (y) int64 0 1 2
        forecasts_category_edge     <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]'
        observations_category_edge  <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]'


    You can also define multi-dimensional ``category_edges``, e.g. with xr.quantile.
    However, you still need to ensure that ``category_edges`` covers the forecasts
    and observations distributions.

    >>> category_edges = observations.quantile(
    ...     q=[.33, .66]).rename({'quantile': 'category_edge'})
    >>> xs.rps(observations, forecasts, category_edges, dim='x')
    <xarray.DataArray (y: 3)>
    array([1.18518519, 0.85185185, 0.40740741])
    Coordinates:
      * y                           (y) int64 0 1 2
        forecasts_category_edge     <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]'
        observations_category_edge  <U38 '[-np.inf, 0.33), [0.33, 0.66), [0.66, np.inf]'

    References
    ----------
    * Weigel, A. P., Liniger, M. A., & Appenzeller, C. (2007). The Discrete Brier and
      Ranked Probability Skill Scores. Monthly Weather Review, 135(1), 118–124.
      doi: 10/b59qz5
    * C. A. T. Ferro. Fair scores for ensemble forecasts. Q. J. R. Meteorol. Soc., 140:
      1917–1923, 2013. doi: 10.1002/qj.2270.
    * https://www-miklip.dkrz.de/about/problems/

    """
    bin_dim = "category_edge"
    if member_dim not in forecasts.dims:
        raise ValueError(
            f"Expect to find {member_dim} in forecasts dimensions, found"
            f"{forecasts.dims}.")
    if fair:
        M = forecasts[member_dim].size

    forecasts = _bool_to_int(forecasts)

    _check_identical_xr_types(observations, forecasts)

    # different entry point of calculating RPS based on category_edges
    # category_edges tuple of two: use for obs and forecast category_edges separately
    if isinstance(category_edges,
                  (tuple, np.ndarray, xr.DataArray, xr.Dataset)):
        if isinstance(category_edges, tuple):
            assert isinstance(category_edges[0], type(category_edges[1]))
            observations_edges, forecasts_edges = category_edges
        else:  # category_edges only given once, so use for both obs and forecasts
            observations_edges, forecasts_edges = category_edges, category_edges

        if isinstance(observations_edges, np.ndarray):
            # convert category_edges as xr object
            observations_edges = xr.DataArray(
                observations_edges,
                dims="category_edge",
                coords={"category_edge": observations_edges},
            )
            observations_edges = xr.ones_like(
                observations) * observations_edges

            forecasts_edges = xr.DataArray(
                forecasts_edges,
                dims="category_edge",
                coords={"category_edge": forecasts_edges},
            )
            forecasts_edges = (
                xr.ones_like(forecasts if member_dim not in forecasts.dims else
                             forecasts.isel({member_dim: 0}, drop=True)) *
                forecasts_edges)

        _check_identical_xr_types(forecasts_edges, forecasts)
        _check_identical_xr_types(observations_edges, forecasts)

        # cumulative probability functions
        # lowest category is [-np.inf, category_edges.isel(category_edge=0)]
        # ignores the right-most edge. The effective right-most edge is np.inf.
        # therefore the CDFs Fc and Oc both reach 1 for the right-most edge.
        # < makes edges right-edge exclusive
        Fc = (forecasts < forecasts_edges).mean(member_dim)
        Oc = (observations < observations_edges).astype("int")

    elif category_edges is None:  # expect CDFs already as inputs
        if member_dim in forecasts.dims:
            forecasts = forecasts.mean(member_dim)
        Fc = forecasts
        Oc = observations
    else:
        raise ValueError(
            "category_edges must be xr.DataArray, xr.Dataset, tuple of xr.objects, "
            f" None or array-like, found {type(category_edges)}")

    # RPS formulas
    if fair:  # for ensemble member adjustment, see Ferro 2013
        Ec = Fc * M
        res = ((Ec / M - Oc)**2 - Ec * (M - Ec) / (M**2 *
                                                   (M - 1))).sum(bin_dim)
    else:  # normal formula
        res = ((Fc - Oc)**2).sum(bin_dim)

    # add category_edge as str into coords
    if category_edges is not None:
        res = _assign_rps_category_bounds(res, observations_edges,
                                          "observations")
        res = _assign_rps_category_bounds(res, forecasts_edges, "forecasts")
    if weights is not None:
        res = res.weighted(weights)
    # combine many forecasts-observations pairs
    res = res.mean(dim)
    # keep nans and prevent 0 for all nan grids
    res = _keep_nans_masked(observations, res, dim, ignore=["category_edge"])
    if keep_attrs:  # attach by hand
        res.attrs.update(observations.attrs)
        res.attrs.update(forecasts.attrs)
        if isinstance(res, xr.Dataset):
            for v in res.data_vars:
                res[v].attrs.update(observations[v].attrs)
                res[v].attrs.update(forecasts[v].attrs)
    return res
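# A worked illustration of the RPS formula from the docstring above with
# hand-picked numbers (plain NumPy, not a call into the function itself):
# three categories, forecast probabilities y = [0.2, 0.5, 0.3], and an
# observation falling in the middle category.
import numpy as np

y = np.array([0.2, 0.5, 0.3])   # forecast probability per category
o = np.array([0.0, 1.0, 0.0])   # one-hot observed outcome per category
rps_value = ((np.cumsum(y) - np.cumsum(o)) ** 2).sum()
print(rps_value)  # (0.2 - 0)**2 + (0.7 - 1)**2 + (1.0 - 1)**2 = 0.13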
Example #28
0
def merged_mask(
    basins, ds, lon_name="lon", lat_name="lat", merge_dict=None, verbose=False
):
    """Combine geographical basins (from regionmask) to larger ocean basins.

    Parameters
    ----------
    basins : regionmask.core.regions.Regions object
        Loaded basin data from regionmask, e.g. `import regionmask; basins = regionmask.defined_regions.natural_earth.ocean_basins_50`
    ds : xr.Dataset
        Input dataset on which to construct the mask
    lon_name : str, optional
        Name of the longitude coordinate in `ds`, defaults to `lon`
    lat_name : str, optional
        Name of the latitude coordinate in `ds`, defaults to `lat`
    merge_dict : dict, optional
        Dictionary defining new aggregated regions (as keys) and the regions to be merged into that region as values (a list of names).
        Defaults to large scale ocean basins defined by `cmip6_preprocessing.regionmask.default_merge_dict`
    verbose : bool, optional
       Prints more output, e.g. the regions in `basins` that were not used in the merging step. Defaults to False.

    Returns
    -------
    mask : xr.DataArray
        The mask contains an ascending numeric value for each key (merged region) in `merge_dict`.
        When the default is used, the numeric values correspond to the following regions:
        * 0: North Atlantic

        * 1: South Atlantic

        * 2: North Pacific

        * 3: South Pacific

        * 4: Maritime Continent

        * 5: Indian Ocean

        * 6: Arctic Ocean

        * 7: Southern Ocean

        * 8: Black Sea

        * 9: Mediterranean Sea

        * 10: Red Sea

        * 11: Caspian Sea

    """
    mask = basins.mask(ds, lon_name=lon_name, lat_name=lat_name)

    def find_mask_index(name):
        target_value = [
            ri for ri in range(len(basins.regions)) if basins.regions[ri].name == name
        ]
        if len(target_value) > 1:
            warnings.warn(f"Found more than one matching region for {name}")
            return target_value[0]
        elif len(target_value) == 1:
            return target_value[0]
        else:
            return None

    if merge_dict is None:
        merge_dict = _default_merge_dict()

    dict_keys = list(merge_dict.keys())
    number_dict = {k: None for k in dict_keys}
    merged_basins = []
    for ocean, small_basins in merge_dict.items():
        #         ocean_idx = find_mask_index(ocean)
        try:
            ocean_idx = basins.map_keys(ocean)
        except KeyError:
            # The ocean key is new and cannot be found among the existing basin
            # keys (e.g. for the full Atlantic or the Maritime Continent)
            ocean_idx = mask.max().data + 1
        number_dict[ocean] = ocean_idx
        if small_basins:
            for sb in small_basins:
                sb_idx = basins.map_keys(sb)
                # set the index of each small basin to the ocean value
                mask = mask.where(mask != sb_idx, ocean_idx)
                merged_basins.append(sb)

    if verbose:
        remaining_basins = [
            str(basins.regions[ri].name)
            for ri in range(len(basins.regions))
            if (basins.regions[ri].name not in merged_basins)
            and (basins.regions[ri].name not in list(merge_dict.keys()))
        ]
        print(remaining_basins)

    # reset the mask indices to the order of the passed dictionary keys
    mask_reordered = xr.ones_like(mask.copy()) * np.nan
    for new_idx, k in enumerate(dict_keys):
        old_idx = number_dict[k]
        mask_reordered = mask_reordered.where(mask != old_idx, new_idx)

    return mask_reordered
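# A hedged usage sketch for `merged_mask`, following its docstring. It assumes a
# regionmask version that still exposes `defined_regions.natural_earth.ocean_basins_50`
# and accepts the `lon_name`/`lat_name` keywords (newer releases name the accessor
# differently, e.g. `natural_earth_v5_0_0`), and that the module-level helpers
# used above (e.g. `_default_merge_dict`) are importable.
import numpy as np
import regionmask
import xarray as xr

grid = xr.Dataset(
    coords={
        "lon": ("lon", np.arange(0.5, 360.0, 1.0)),
        "lat": ("lat", np.arange(-89.5, 90.0, 1.0)),
    }
)
basins = regionmask.defined_regions.natural_earth.ocean_basins_50
basin_mask = merged_mask(basins, grid)  # default merge_dict -> 12 merged regions
# merged region indices present in the mask (0 ... 11 with the default merge_dict)
print(np.unique(basin_mask.values[~np.isnan(basin_mask.values)]))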
Example #29
0
def augment_static_data(
    dynamic_ds: xr.Dataset,
    static_ds: xr.Dataset,
    test_year: Optional[List[str]] = None,
    dynamic_ignore_vars: Optional[List[str]] = None,
    global_means: bool = True,
    spatial_means: bool = True,
) -> xr.Dataset:
    """Create our own aggregations from the dynamic data.

    NOTE: unnecessary for CAMELS because this data can
    just be taken from pre-computed means
    """
    # get the minimum test_year
    if isinstance(test_year, Iterable):
        test_year = min(test_year)

    # PREVENT temporal leakage of information
    min_test_date = pd.to_datetime(f"{test_year}-01-01")
    max_train_date = min_test_date - Day(1)
    min_train_date = pd.to_datetime(dynamic_ds.time.min().values)
    dynamic_ds = dynamic_ds.sel(time=slice(min_train_date, max_train_date))

    # augment the static data with the variables from dynamic_ds
    original_vars = list(dynamic_ds.data_vars)
    if dynamic_ignore_vars is not None:
        vars_list = [v for v in original_vars if v not in dynamic_ignore_vars]
    else:
        vars_list = original_vars

    print("Augmenting the static data with"
          f" {'global_means' if global_means else ''}"
          f" {'spatial_means' if spatial_means else ''}"
          f" for variables: {vars_list}")

    # check they have the same coords and dtypes
    reference_coord = list(static_ds.coords)[0]
    assert reference_coord in list(
        dynamic_ds.coords), (f"Static: {list(static_ds.coords)}"
                             f" Dynamic: {list(dynamic_ds.coords)}")
    assert static_ds[reference_coord].dtype == dynamic_ds[
        reference_coord].dtype, (
            f"Static: {static_ds[reference_coord].dtype}"
            f" Dynamic: {dynamic_ds[reference_coord].dtype}")

    # calculate ones same shape as the static data
    first_var = list(static_ds.data_vars)[0]
    ones = xr.ones_like(static_ds[first_var])

    # for each NON-IGNORED dynamic variable calculate global_mean / spatial_mean
    list_data_arrays: List[xr.DataArray] = []
    for var in vars_list:
        if global_means:
            # GLOBAL mean (over all dimensions)
            global_mean_values = dynamic_ds[var].mean()
            global_mean_da = (global_mean_values *
                              ones).rename(f"{var}_global_mean")
            list_data_arrays.append(global_mean_da)
        if spatial_means:
            # SPATIAL mean (over the time dimension)
            spatial_mean_values = dynamic_ds[var].mean(dim="time")
            spatial_mean_da = (spatial_mean_values *
                               ones).rename(f"{var}_spatial_mean")
            list_data_arrays.append(spatial_mean_da)

    if list_data_arrays:
        # join these new calculated variables into the original
        ds = xr.combine_by_coords(list_data_arrays)
        static_ds = static_ds.merge(ds)

    return static_ds
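# Hypothetical call of `augment_static_data` on toy data. The "station"/"time"
# coordinates and the "precip"/"elevation" variable names are invented, and the
# function's own module-level dependencies (xr, pd, Day, Iterable, ...) must be
# importable for this sketch to run.
import numpy as np
import pandas as pd
import xarray as xr

stations = ["a", "b", "c"]
times = pd.date_range("2000-01-01", "2005-12-31", freq="D")
dynamic_ds = xr.Dataset(
    {"precip": (("station", "time"), np.random.rand(len(stations), len(times)))},
    coords={"station": stations, "time": times},
)
static_ds = xr.Dataset(
    {"elevation": ("station", np.array([10.0, 250.0, 1200.0]))},
    coords={"station": stations},
)
augmented = augment_static_data(dynamic_ds, static_ds, test_year=["2005"])
print(sorted(augmented.data_vars))
# ['elevation', 'precip_global_mean', 'precip_spatial_mean']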
Example #30
0
def arct_connect(ds, varName, faces="all"):

    arc_cap = 6
    Nx_ac_nrot = []
    Ny_ac_nrot = []
    Nx_ac_rot = []
    Ny_ac_rot = []
    ARCT = [0, 0, 0, 0]  # initialize the list.
    arc_faces = [0, 0, 0, 0]
    metrics = [
        "dxC",
        "dyC",
        "dxG",
        "dyG",
        "hFacW",
        "hFacS",
    ]  # metric variables defined at vector points

    if isinstance(faces, str):
        if faces == "all":
            faces = list(range(13))

    if arc_cap in faces:
        for k in faces:
            if k == 2:
                fac = 1
                arc_faces[0] = k
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                dtr = list(dims)[::-1]
                dtr[-1], dtr[-2] = dtr[-2], dtr[-1]
                mask2 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                # TODO: Eval where, define argument outside
                mask2 = mask2.where(
                    _np.logical_and(
                        ds[dims.X] < ds[dims.Y],
                        ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = 0, int(len(ds[dims.Y]) / 2)  # TODO: CHECK here!
                y0, yf = 0, int(len(ds[dims.X]))
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_nrot.append(0)
                Ny_ac_nrot.append(len(ds[dims.Y][y0:yf]))
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                if len(dims.X) + len(dims.Y) == 4:
                    if len(dims.Y) == 3 and _varName not in metrics:
                        fac = -1
                arct = fac * ds[_varName].isel(**da_arg)
                Mask = mask2.isel(**mask_arg)
                arct = arct * Mask
                ARCT[0] = arct

            elif k == 5:
                fac = 1
                arc_faces[1] = k
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                mask5 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                mask5 = mask5.where(
                    _np.logical_and(
                        ds[dims.X] > ds[dims.Y],
                        ds[dims.X] < len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = 0, int(len(ds[dims.X]))
                y0, yf = 0, int(len(ds[dims.Y]) / 2)
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_nrot.append(0)
                Ny_ac_nrot.append(len(ds[dims.X][y0:yf]))
                if len(dims.X) + len(dims.Y) == 4:
                    if len(dims.Y) == 1 and _varName not in metrics:
                        fac = -1
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                arct = ds[_varName].isel(**da_arg)
                Mask = mask5.isel(**mask_arg)
                arct = arct * Mask
                ARCT[1] = arct

            elif k == 7:
                fac = 1
                arc_faces[2] = k
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                dtr = list(dims)[::-1]
                dtr[-1], dtr[-2] = dtr[-2], dtr[-1]
                mask7 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                mask7 = mask7.where(
                    _np.logical_and(
                        ds[dims.X] > ds[dims.Y],
                        ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y]))
                y0, yf = 0, int(len(ds[dims.X]))
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_rot.append(len(ds[dims.Y][x0:xf]))
                Ny_ac_rot.append(0)
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                arct = fac * ds[_varName].isel(**da_arg)
                Mask = mask7.isel(**mask_arg)
                arct = arct * Mask
                ARCT[2] = arct

            elif k == 10:
                fac = 1
                _varName = varName
                DIMS = [dim for dim in ds[_varName].dims if dim != "face"]
                dims = Dims(DIMS[::-1])
                dtr = list(dims)[::-1]
                dtr[-1], dtr[-2] = dtr[-2], dtr[-1]
                arc_faces[3] = k
                mask10 = _xr.ones_like(ds[_varName].isel(face=arc_cap))
                mask10 = mask10.where(
                    _np.logical_and(
                        ds[dims.X] < ds[dims.Y],
                        ds[dims.X] > len(ds[dims.Y]) - ds[dims.Y],
                    ))
                x0, xf = 0, int(len(ds[dims.X]))
                y0, yf = int(len(ds[dims.Y]) / 2), int(len(ds[dims.Y]))
                xslice = slice(x0, xf)
                yslice = slice(y0, yf)
                Nx_ac_rot.append(0)
                Ny_ac_rot.append(len(ds[dims.Y][y0:yf]))
                da_arg = {"face": arc_cap, dims.X: xslice, dims.Y: yslice}
                mask_arg = {dims.X: xslice, dims.Y: yslice}
                arct = fac * ds[_varName].isel(**da_arg)
                Mask = mask10.isel(**mask_arg)
                arct = (arct * Mask).transpose(*dtr)
                ARCT[3] = arct

    return arc_faces, Nx_ac_nrot, Ny_ac_nrot, Nx_ac_rot, Ny_ac_rot, ARCT
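# Illustration of the triangular-mask trick used on the Arctic cap face above:
# comparing the two index coordinates of a square face splits it into four
# triangular quadrants. The dimension names "i"/"j" and the 8x8 size are invented
# for this sketch; the k == 2 branch above keeps points with X < Y and X < N - Y.
import numpy as np
import xarray as xr

n = 8
face = xr.DataArray(
    np.zeros((n, n)),
    coords={"j": np.arange(n), "i": np.arange(n)},
    dims=["j", "i"],
)
ones = xr.ones_like(face)
quadrant = ones.where((face["i"] < face["j"]) & (face["i"] < n - face["j"]))
print(int(quadrant.count()))  # number of points kept in that triangular quadrant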
Example #31
0
for i in range(len(ds["time"])):
    T = T + [np.datetime64(t0) + np.timedelta64(int(i * step * 1.0e3), "ms")]
ds["time"] = np.array(T, dtype="datetime64")
T = []
for i in range(len(ds["time_midp"])):
    T = T + [np.datetime64(t0) + np.timedelta64(int(i * step * 1.0e3), "ms")]
ds["time_midp"] = np.array(T, dtype="datetime64") + np.timedelta64(
    int(0.5 * step * 1.0e3), "ms")

# deltas
for var in ["drF", "dxC", "dyC", "dxF", "dyF", "dxG", "dyG", "dxV", "dyU"]:
    ds[var] = xr.full_like(ds[var], step)
for var in ["rA", "rAw", "rAs", "rAz"]:
    ds[var] = xr.full_like(ds[var], step**2)
for var in ["HFacC", "HFacW", "HFacS"]:
    ds[var] = xr.ones_like(ds[var])

# Recreate oceandataset
od4calc = OceanDataset(ds)

# Gradient
sinX = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin(od4calc.dataset["XC"])
sinY = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin(od4calc.dataset["YC"])
sinZ = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin(od4calc.dataset["Z"])
sintime = xr.zeros_like(od4calc.dataset["Temp"]) + np.sin(
    (od4calc.dataset["time"] - od4calc.dataset["time"][0]) /
    np.timedelta64(1, "s"))

sintime.attrs = od4calc.dataset["time"].attrs
cosX = xr.zeros_like(od4calc.dataset["U"]) + np.cos(od4calc.dataset["XU"])
cosY = xr.zeros_like(od4calc.dataset["V"]) + np.cos(od4calc.dataset["YV"])