def low_q_freq(da: DataArray, coord: str = "date", threshold: float = 0.2) -> float:
    # determine the date of the first January 1st in the data period
    first_date = da.coords[coord][0].values.astype("datetime64[s]").astype(datetime)
    last_date = da.coords[coord][-1].values.astype("datetime64[s]").astype(datetime)

    if first_date == datetime.strptime(f"{first_date.year}-01-01", "%Y-%m-%d"):
        start_date = first_date
    else:
        start_date = datetime.strptime(f"{first_date.year + 1}-01-01", "%Y-%m-%d")

    # end date of the first full year period
    end_date = start_date + relativedelta(years=1) - relativedelta(days=1)

    # determine the mean flow over the entire period
    mean_flow = da.mean(skipna=True)

    lqfs = []
    while end_date < last_date:
        data = da.sel({coord: slice(start_date, end_date)})

        # number of days with discharge lower than threshold * mean flow in a one-year period
        n_days = (data < (threshold * mean_flow)).sum()

        lqfs.append(float(n_days))

        start_date += relativedelta(years=1)
        end_date += relativedelta(years=1)

    return np.mean(lqfs)
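# A minimal usage sketch for `low_q_freq` above (hypothetical data; assumes the
# function and its imports -- numpy, `datetime`, `relativedelta` -- are in scope):
import numpy as np
import pandas as pd
from xarray import DataArray

dates = pd.date_range("2000-01-01", "2003-12-31", freq="D")
flow = DataArray(np.random.rand(dates.size), coords={"date": dates}, dims=["date"])
print(low_q_freq(flow))  # mean number of days per full year below 0.2 * mean flow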
def test_date_range_like_errors():
    src = date_range("1899-02-03", periods=20, freq="D", use_cftime=False)
    src = src[np.arange(20) != 10]  # Remove 1 day so the frequency is not inferable.

    with pytest.raises(
        ValueError,
        match="`date_range_like` was unable to generate a range as the source frequency was not inferable.",
    ):
        date_range_like(src, "gregorian")

    src = DataArray(
        np.array(
            [["1999-01-01", "1999-01-02"], ["1999-01-03", "1999-01-04"]],
            dtype=np.datetime64,
        ),
        dims=("x", "y"),
    )
    with pytest.raises(
        ValueError,
        match="'source' must be a 1D array of datetime objects for inferring its range.",
    ):
        date_range_like(src, "noleap")

    da = DataArray([1, 2, 3, 4], dims=("time",))
    with pytest.raises(
        ValueError,
        match="'source' must be a 1D array of datetime objects for inferring its range.",
    ):
        date_range_like(da, "noleap")
def _mask_valid(obs: DataArray, sim: DataArray) -> Tuple[DataArray, DataArray]:
    # mask of invalid entries. NaNs in simulations can happen during validation/testing
    idx = (~sim.isnull()) & (~obs.isnull())

    obs = obs[idx]
    sim = sim[idx]

    return obs, sim
def beta_kge(obs: DataArray, sim: DataArray) -> float:
    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    return float(sim.mean() / obs.mean())
def alpha_nse(obs: DataArray, sim: DataArray) -> float:
    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    return float(sim.std() / obs.std())
def kge(obs: DataArray, sim: DataArray, weights: List[float] = [1., 1., 1.]) -> float:
    r"""Calculate the Kling-Gupta Efficiency [#]_

    .. math::
        \text{KGE} = 1 - \sqrt{[ s_r (r - 1)]^2 + [s_\alpha ( \alpha - 1)]^2 + [s_\beta(\beta_{\text{KGE}} - 1)]^2},

    where :math:`r` is the correlation coefficient, :math:`\alpha` the :math:`\alpha`-NSE decomposition,
    :math:`\beta_{\text{KGE}}` the fraction of the means and :math:`s_r, s_\alpha, s_\beta` the corresponding weights
    (here the three float values in the `weights` parameter).

    Parameters
    ----------
    obs : DataArray
        Observed time series.
    sim : DataArray
        Simulated time series.
    weights : List[float]
        Weighting factors of the 3 KGE parts, by default each part has a weight of 1.

    Returns
    -------
    float
        Kling-Gupta Efficiency

    References
    ----------
    .. [#] Gupta, H. V., Kling, H., Yilmaz, K. K., & Martinez, G. F. (2009). Decomposition of the mean squared
        error and NSE performance criteria: Implications for improving hydrological modelling. Journal of
        hydrology, 377(1-2), 80-91.
    """
    if len(weights) != 3:
        raise ValueError("Weights of the KGE must be a list of three values")

    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    if len(obs) < 2:
        return np.nan

    r, _ = stats.pearsonr(obs.values, sim.values)

    alpha = sim.std() / obs.std()
    beta = sim.mean() / obs.mean()

    value = (weights[0] * (r - 1)**2 + weights[1] * (alpha - 1)**2 + weights[2] * (beta - 1)**2)

    return 1 - np.sqrt(float(value))
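# A quick sanity check for `kge` above (hypothetical data; assumes `kge` and
# its helpers `_validate_inputs` and `_mask_valid` are in scope and accept two
# equally shaped 1D arrays). A perfect simulation must score exactly 1.0:
import numpy as np
import pandas as pd
from xarray import DataArray

dates = pd.date_range("2000-01-01", periods=365, freq="D")
obs = DataArray(np.random.rand(dates.size), coords={"date": dates}, dims=["date"])

assert np.isclose(kge(obs, obs.copy()), 1.0)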
def normalize_labels(da: DataArray):
    # ...infer what axis the "time" axis actually is
    da = da.rename({"time": "E"})
    # ...reshape flattened axes ...
    # ...replace the "time" axis' coords with more meaningful values
    da = da.assign_coords({"E": range(da.shape[0])})
    return da
def runoff_ratio(da: DataArray, prcp: DataArray) -> float:
    # get precip coordinate name (to avoid problems with 'index' or 'date')
    coord_name = list(prcp.coords.keys())[0]

    # slice prcp to the same time window as the discharge
    prcp = prcp.sel({coord_name: slice(da.coords["date"][0], da.coords["date"][-1])})

    # calculate runoff ratio
    value = da.mean() / prcp.mean()

    return float(value)
def build_bootstrap_year_da(da: DataArray, groups: Dict[Any, slice], label: Any, dim: str = "time") -> DataArray:
    """Return an array where a group in the original is replaced by every other group along a new dimension.

    Parameters
    ----------
    da : DataArray
        Original input array over reference period.
    groups : dict
        Output of grouping functions, such as `DataArrayResample.groups`.
    label : Any
        Key identifying the group item to replace.
    dim : str
        Dimension recognized as time. Default: `time`.

    Returns
    -------
    DataArray:
        Array where one group is replaced by values from every other group along the `bootstrap` dimension.
    """
    gr = groups.copy()

    # Location along dim that must be replaced
    bloc = da[dim][gr.pop(label)]

    # Initialize output array with new bootstrap dimension
    out = da.expand_dims({BOOTSTRAP_DIM: np.arange(len(gr))}).copy(deep=True)

    # With dask, mutating the views of out is not working, thus the accumulator
    out_accumulator = []

    # Replace `bloc` by every other group
    for i, (key, group_slice) in enumerate(gr.items()):
        source = da.isel({dim: group_slice})
        out_view = out.loc[{BOOTSTRAP_DIM: i}]
        if len(source[dim]) < 360 and len(source[dim]) < len(bloc):
            # This happens when the sampling frequency is anchored, thus
            # source[dim] would be only a few months on the first and last year
            pass
        elif len(source[dim]) == len(bloc):
            out_view.loc[{dim: bloc}] = source.data
        elif len(bloc) == 365:
            out_view.loc[{dim: bloc}] = convert_calendar(source, "365_day").data
        elif len(bloc) == 366:
            out_view.loc[{dim: bloc}] = convert_calendar(source, "366_day", missing=np.NAN).data
        elif len(bloc) < 365:
            # 360-day calendar case, or anchored years for both source[dim] and bloc
            out_view.loc[{dim: bloc}] = source.data[:len(bloc)]
        else:
            raise NotImplementedError
        out_accumulator.append(out_view)
    return xr.concat(out_accumulator, dim=BOOTSTRAP_DIM)
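# A hedged usage sketch for `build_bootstrap_year_da` (hypothetical data;
# assumes the helper above, its module-level `BOOTSTRAP_DIM` name and
# `convert_calendar` are available, and that `DataArrayResample.groups`
# yields slices, as the docstring suggests):
import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range("2000-01-01", "2004-12-31", freq="D")
da = xr.DataArray(np.random.rand(time.size), coords={"time": time}, dims=["time"])

groups = da.resample(time="YS").groups  # maps each year start to a slice
boot = build_bootstrap_year_da(da, groups, label=list(groups)[0])
print(boot.dims)  # original dims plus the new bootstrap dimension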
def _is_all_nan(obs: DataArray, sim: DataArray) -> bool:
    """Check if all observations or simulations are NaN and log a warning if this is the case."""
    all_nan = False
    if all(obs.isnull()):
        LOGGER.warning("All observed values are NaN, thus metrics will be NaN, too.")
        all_nan = True
    if all(sim.isnull()):
        LOGGER.warning("All simulated values are NaN, thus metrics will be NaN, too.")
        all_nan = True

    return all_nan
def fractional_abundance_setup(element: str, t: LabeledArray) -> DataArray:
    """Calculate and output Fractional abundance at t=infinity for calculating
    the mean charge in test_impurity_concentration()

    Parameters
    ----------
    element
        String of the symbol of the element per ADAS notation,
        e.g. be for Beryllium
    t
        Times at which to define input_Ne and input_Te (also used for the output)

    Returns
    -------
    F_z_tinf
        Fractional abundance of the ionisation stages of the element at t=infinity.
    """
    ADAS_file = ADASReader()

    SCD = ADAS_file.get_adf11("scd", element, "89")
    ACD = ADAS_file.get_adf11("acd", element, "89")

    t = np.linspace(75.0, 80.0, 5)
    rho_profile = np.array([0.0, 0.4, 0.8, 0.95, 1.0], dtype=float)

    input_Te = DataArray(
        data=np.array([3.0e3, 1.5e3, 0.5e3, 0.2e3, 0.1e3]),
        coords={"rho_poloidal": rho_profile},
        dims=["rho_poloidal"],
    )
    input_Ne = DataArray(
        data=np.array([5.0e19, 4e19, 3.0e19, 2.0e19, 1.0e19]),
        coords={"rho_poloidal": rho_profile},
        dims=["rho_poloidal"],
    )

    example_frac_abundance = FractionalAbundance(
        SCD,
        ACD,
    )

    example_frac_abundance.interpolate_rates(Ne=input_Ne, Te=input_Te)
    example_frac_abundance.calc_ionisation_balance_matrix(Ne=input_Ne)

    F_z_tinf = example_frac_abundance.calc_F_z_tinf()

    # ignore with mypy since this is testing and inputs are known
    F_z_tinf = F_z_tinf.expand_dims({"t": t.size}, axis=-1)  # type: ignore

    return F_z_tinf
def _check_all_nan(obs: DataArray, sim: DataArray):
    """Check if all observations or simulations are NaN and raise an exception if this is the case.

    Raises
    ------
    AllNaNError
        If all observations or all simulations are NaN.
    """
    if all(obs.isnull()):
        raise AllNaNError("All observed values are NaN, thus metrics will be NaN, too.")
    if all(sim.isnull()):
        raise AllNaNError("All simulated values are NaN, thus metrics will be NaN, too.")
def hfd_mean(da: DataArray, coord: str = "date") -> float:
    # determine the date of the first October 1st in the data period
    first_date = da.coords[coord][0].values.astype("datetime64[s]").astype(datetime)
    last_date = da.coords[coord][-1].values.astype("datetime64[s]").astype(datetime)

    if first_date > datetime.strptime(f"{first_date.year}-10-01", "%Y-%m-%d"):
        start_date = datetime.strptime(f"{first_date.year + 1}-10-01", "%Y-%m-%d")
    else:
        start_date = datetime.strptime(f"{first_date.year}-10-01", "%Y-%m-%d")

    end_date = start_date + relativedelta(years=1) - relativedelta(days=1)

    doys = []
    while end_date < last_date:
        # compute cumulative sum for the selected period
        data = da.sel({coord: slice(start_date, end_date)})
        cs = data.cumsum(skipna=True)

        # find days with more cumulative discharge than the half annual sum
        days = np.where(~np.isnan(cs.where(cs > data.sum(skipna=True) / 2).values))[0]

        # ignore years without discharge
        if len(days) > 0:
            # store the first day in the result array
            doys.append(days[0])

        start_date += relativedelta(years=1)
        end_date += relativedelta(years=1)

    return np.mean(doys)
def low_q_dur(da: DataArray, threshold: float = 0.2) -> float:
    """Calculate low-flow duration.

    Average duration of low-flow events (number of consecutive steps <`threshold` times the mean flow) [#]_,
    [#]_ (Table 2).

    Parameters
    ----------
    da : DataArray
        Array of flow values.
    threshold : float, optional
        Low-flow threshold. Values below ``threshold * mean`` are considered low flows.

    Returns
    -------
    float
        Low-flow duration

    References
    ----------
    .. [#] Olden, J. D. and Poff, N. L.: Redundancy and the choice of hydrologic indices for characterizing
        streamflow regimes. River Research and Applications, 2003, 19, 101--121, doi:10.1002/rra.700
    .. [#] Westerberg, I. K. and McMillan, H. K.: Uncertainty in hydrological signatures.
        Hydrology and Earth System Sciences, 2015, 19, 3951--3968, doi:10.5194/hess-19-3951-2015
    """
    mean_flow = float(da.mean())
    idx = np.where(da.values < threshold * mean_flow)[0]
    if len(idx) > 0:
        periods = _split_list(idx)
        lqd = np.mean([len(p) for p in periods])
    else:
        lqd = np.nan
    return lqd
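# `_split_list` is used above but not defined in this excerpt. A minimal
# sketch of what it plausibly does, assuming it splits a sorted index array
# into runs of consecutive values (one run per low-/high-flow event):
def _split_list(a_list: list) -> list:
    new_list = []
    start = 0
    for index, value in enumerate(a_list):
        if index < len(a_list) - 1:
            if a_list[index + 1] > value + 1:
                # a gap in the indices ends the current run
                new_list.append(a_list[start:index + 1])
                start = index + 1
        else:
            # the last element closes the final run
            new_list.append(a_list[start:len(a_list)])
    return new_list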
def hfd_mean(da: DataArray, datetime_coord: str = None) -> float:
    """Calculate mean half-flow date.

    Mean half-flow date (step on which the cumulative discharge since October 1st reaches half of the annual
    discharge) [#]_.

    Parameters
    ----------
    da : DataArray
        Array of flow values.
    datetime_coord : str, optional
        Datetime coordinate in the passed DataArray. Inferred automatically if not specified.

    Returns
    -------
    float
        Mean half-flow date.

    References
    ----------
    .. [#] Court, A.: Measures of streamflow timing. Journal of Geophysical Research (1896-1977), 1962, 67,
        4335--4339, doi:10.1029/JZ067i011p04335
    """
    if datetime_coord is None:
        datetime_coord = utils.infer_datetime_coord(da)

    # determine the date of the first October 1st in the data period
    first_date = da.coords[datetime_coord][0].values.astype('datetime64[s]').astype(datetime)
    last_date = da.coords[datetime_coord][-1].values.astype('datetime64[s]').astype(datetime)

    if first_date > datetime.strptime(f'{first_date.year}-10-01', '%Y-%m-%d'):
        start_date = datetime.strptime(f'{first_date.year + 1}-10-01', '%Y-%m-%d')
    else:
        start_date = datetime.strptime(f'{first_date.year}-10-01', '%Y-%m-%d')

    end_date = start_date + relativedelta(years=1) - relativedelta(seconds=1)

    doys = []
    while end_date < last_date:
        # compute cumulative sum for the selected period
        data = da.sel({datetime_coord: slice(start_date, end_date)})
        cs = data.cumsum(skipna=True)

        # find steps with more cumulative discharge than the half annual sum
        hf_steps = np.where(~np.isnan(cs.where(cs > data.sum(skipna=True) / 2).values))[0]

        # ignore years without discharge
        if len(hf_steps) > 0:
            # store the first step in the result array
            doys.append(hf_steps[0])

        start_date += relativedelta(years=1)
        end_date += relativedelta(years=1)

    return np.mean(doys)
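# Usage sketch for `hfd_mean` (hypothetical data; the coordinate name is
# passed explicitly so the `utils.infer_datetime_coord` helper is not needed):
import numpy as np
import pandas as pd
from xarray import DataArray

dates = pd.date_range("1999-10-01", "2002-09-30", freq="D")
flow = DataArray(np.random.rand(dates.size), coords={"date": dates}, dims=["date"])
print(hfd_mean(flow, datetime_coord="date"))  # mean step index within the hydrological year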
def high_q_dur(da: DataArray, threshold: float = 9.) -> float:
    """Calculate high-flow duration.

    Average duration of high-flow events (number of consecutive steps >`threshold` times the median flow) [#]_,
    [#]_ (Table 2).

    Parameters
    ----------
    da : DataArray
        Array of flow values.
    threshold : float, optional
        High-flow threshold. Values larger than ``threshold * median`` are considered high flows.

    Returns
    -------
    float
        High-flow duration

    References
    ----------
    .. [#] Clausen, B. and Biggs, B. J. F.: Flow variables for ecological studies in temperate streams:
        groupings based on covariance. Journal of Hydrology, 2000, 237, 184--197,
        doi:10.1016/S0022-1694(00)00306-1
    .. [#] Westerberg, I. K. and McMillan, H. K.: Uncertainty in hydrological signatures.
        Hydrology and Earth System Sciences, 2015, 19, 3951--3968, doi:10.5194/hess-19-3951-2015
    """
    median_flow = float(da.median())
    idx = np.where(da.values > threshold * median_flow)[0]
    if len(idx) > 0:
        periods = _split_list(idx)
        hqd = np.mean([len(p) for p in periods])
    else:
        hqd = np.nan
    return hqd
def parse_longitudinal(data: np.ndarray) -> DataArray:
    """Parse raw data into a longitudinal table.

    Parameters
    ----------
    data : np.ndarray
        The raw data for a longitudinal table.

    Returns
    -------
    table : DataArray
        The created xarray.DataArray
    """
    # load the data table into an xarray DataArray
    table = DataArray(
        data[:, 1:],
        dims=["level", "quantity"],
        coords={
            "level": np.arange(data.shape[0]),
            "quantity": ["depth", "mean", "rms", "stdev", "min", "max"],
        },
    )

    # and we are done
    return table
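# Usage sketch for `parse_longitudinal` (hypothetical data): column 0 of the
# raw array is dropped as a level index, and the remaining six columns are
# mapped onto the named quantities.
import numpy as np

raw = np.random.rand(4, 7)  # 4 levels; 1 index column + 6 quantity columns
table = parse_longitudinal(raw)
print(table.sel(quantity="mean").values)  # the per-level means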
def _mask_valid(obs: DataArray, sim: DataArray) -> Tuple[DataArray, DataArray]:
    # mask of invalid entries
    idx = (obs >= 0) & (~obs.isnull())

    obs = obs[idx]
    sim = sim[idx]

    return obs, sim
def low_q_dur(da: DataArray, threshold: float = 0.2) -> float:
    mean_flow = float(da.mean())
    idx = np.where(da.values < threshold * mean_flow)[0]
    if len(idx) > 0:
        periods = _split_list(idx)
        lqd = np.mean([len(p) for p in periods])
    else:
        lqd = np.nan
    return lqd
def high_q_dur(da: DataArray, threshold: float = 9.) -> float:
    median_flow = float(da.median())
    idx = np.where(da.values > threshold * median_flow)[0]
    if len(idx) > 0:
        periods = _split_list(idx)
        hqd = np.mean([len(p) for p in periods])
    else:
        hqd = np.nan
    return hqd
def runoff_ratio(da: DataArray, prcp: DataArray, datetime_coord: str = None) -> float:
    """Calculate runoff ratio.

    Runoff ratio (ratio of mean discharge to mean precipitation) [#]_ (Eq. 2).

    Parameters
    ----------
    da : DataArray
        Array of flow values.
    prcp : DataArray
        Array of precipitation values.
    datetime_coord : str, optional
        Datetime coordinate in the passed DataArray. Inferred automatically if not specified.

    Returns
    -------
    float
        Runoff ratio.

    References
    ----------
    .. [#] Sawicz, K., Wagener, T., Sivapalan, M., Troch, P. A., and Carrillo, G.: Catchment classification:
        empirical analysis of hydrologic similarity based on catchment function in the eastern USA.
        Hydrology and Earth System Sciences, 2011, 15, 2895--2911, doi:10.5194/hess-15-2895-2011
    """
    if datetime_coord is None:
        datetime_coord = utils.infer_datetime_coord(da)

    # rename precip coordinate name (to avoid problems with 'index' or 'date')
    prcp = prcp.rename({list(prcp.coords.keys())[0]: datetime_coord})

    # slice prcp to the same time window as the discharge
    prcp = prcp.sel({datetime_coord: slice(da.coords[datetime_coord][0], da.coords[datetime_coord][-1])})

    # calculate runoff ratio
    value = da.mean() / prcp.mean()

    return float(value)
def kge(obs: DataArray, sim: DataArray, weights: list = [1, 1, 1]) -> float:
    if len(weights) != 3:
        raise ValueError("Weights of the KGE must be a list of three values")

    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    r, _ = stats.pearsonr(obs.values, sim.values)

    alpha = sim.std() / obs.std()
    beta = sim.mean() / obs.mean()

    value = (weights[0] * (r - 1)**2 + weights[1] * (alpha - 1)**2 + weights[2] * (beta - 1)**2)

    return 1 - np.sqrt(float(value))
def fdc_flv(obs: DataArray, sim: DataArray, l: float = 0.3) -> float:
    """Low flow bias derived from the flow duration curve.

    Reference:
    Yilmaz, K. K., Gupta, H. V., and Wagener, T. (2008), A process-based diagnostic approach to model
    evaluation: Application to the NWS distributed hydrologic model, Water Resour. Res., 44, W09417,
    doi:10.1029/2007WR006716.
    """
    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    if (l <= 0) or (l >= 1):
        raise ValueError("l has to be in range ]0,1[. Consider small values, e.g. 0.3 for 30% low flows")

    # get arrays of sorted (descending) discharges
    obs = _get_fdc(obs)
    sim = _get_fdc(sim)

    # for numerical reasons change 0s to 1e-6. Simulations can still contain negatives, so also reset those.
    sim[sim <= 0] = 1e-6
    obs[obs == 0] = 1e-6

    obs = obs[-np.round(l * len(obs)).astype(int):]
    sim = sim[-np.round(l * len(sim)).astype(int):]

    # transform values to log scale
    obs = np.log(obs)
    sim = np.log(sim)

    # calculate flv part by part
    qsl = np.sum(sim - sim.min())
    qol = np.sum(obs - obs.min())

    flv = -1 * (qsl - qol) / (qol + 1e-6)

    return flv * 100
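# `_get_fdc` is used above but not defined in this excerpt. A minimal sketch,
# assuming it returns the discharges sorted in descending order (i.e. the
# flow duration curve) as a numpy array:
import numpy as np
from xarray import DataArray

def _get_fdc(da: DataArray) -> np.ndarray:
    return np.sort(da.values)[::-1]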
def nse(obs: DataArray, sim: DataArray) -> float:
    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    denominator = ((obs - obs.mean()) ** 2).sum()
    numerator = ((sim - obs) ** 2).sum()

    value = 1 - numerator / denominator

    return float(value)
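# Quick sanity check for `nse` above (hypothetical data; assumes `nse` and
# its helpers are in scope): a perfect simulation scores 1, and simulating
# the observed mean everywhere scores exactly 0.
import numpy as np
from xarray import DataArray

obs = DataArray(np.arange(10, dtype=float), dims=["date"])
sim_mean = DataArray(np.full(10, float(obs.mean())), dims=["date"])

assert np.isclose(nse(obs, obs.copy()), 1.0)
assert np.isclose(nse(obs, sim_mean), 0.0)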
def q_mean(da: DataArray) -> float:
    """Calculate mean discharge.

    Parameters
    ----------
    da : DataArray
        Array of flow values.

    Returns
    -------
    float
        Mean discharge.
    """
    return float(da.mean())
def q95(da: DataArray) -> float:
    """Calculate 95th flow quantile.

    Parameters
    ----------
    da : DataArray
        Array of flow values.

    Returns
    -------
    float
        95th flow quantile.
    """
    return float(da.quantile(0.95))
def beta_nse(obs: DataArray, sim: DataArray) -> float:
    r"""Calculate the beta NSE decomposition [#]_

    The beta NSE decomposition is the difference of the mean simulation and mean observation divided by the
    standard deviation of the observations.

    .. math::
        \beta = \frac{\mu_s - \mu_o}{\sigma_o},

    where :math:`\mu_s` is the mean of the simulations (here, `sim`), :math:`\mu_o` is the mean of the
    observations (here, `obs`) and :math:`\sigma_o` the standard deviation of the observations.

    Parameters
    ----------
    obs : DataArray
        Observed time series.
    sim : DataArray
        Simulated time series.

    Returns
    -------
    float
        Beta NSE decomposition.

    References
    ----------
    .. [#] Gupta, H. V., Kling, H., Yilmaz, K. K., & Martinez, G. F. (2009). Decomposition of the mean squared
        error and NSE performance criteria: Implications for improving hydrological modelling. Journal of
        hydrology, 377(1-2), 80-91.
    """
    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    return float((sim.mean() - obs.mean()) / obs.std())
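# Quick check for `beta_nse` (hypothetical data): a perfect simulation gives
# 0, and shifting the simulation up by one observed standard deviation gives 1.
import numpy as np
from xarray import DataArray

obs = DataArray(np.random.rand(100), dims=["date"])
assert np.isclose(beta_nse(obs, obs.copy()), 0.0)
assert np.isclose(beta_nse(obs, obs + obs.std()), 1.0)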
def slope_fdc(da: DataArray, lower_quantile: float = 0.33, upper_quantile: float = 0.66) -> float:
    # sort discharge by descending order
    fdc = da.sortby(da, ascending=False)

    # get idx of lower and upper quantile
    idx_lower = np.round(lower_quantile * len(fdc)).astype(int)
    idx_upper = np.round(upper_quantile * len(fdc)).astype(int)

    # note the parentheses: the difference of the log flows is divided by the quantile distance
    value = (np.log(fdc[idx_lower].values + 1e-8) -
             np.log(fdc[idx_upper].values + 1e-8)) / (upper_quantile - lower_quantile)

    return value
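# Usage sketch for `slope_fdc` (hypothetical data). With the corrected
# parenthesisation, the value is the slope of the log-transformed flow
# duration curve between the 33% and 66% exceedance quantiles:
import numpy as np
from xarray import DataArray

flow = DataArray(np.random.lognormal(size=1000), dims=["date"])
print(slope_fdc(flow))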
def beta_kge(obs: DataArray, sim: DataArray) -> float:
    r"""Calculate the beta KGE term [#]_

    The beta term of the Kling-Gupta Efficiency is defined as the fraction of the means.

    .. math::
        \beta_{\text{KGE}} = \frac{\mu_s}{\mu_o},

    where :math:`\mu_s` is the mean of the simulations (here, `sim`) and :math:`\mu_o` is the mean of the
    observations (here, `obs`).

    Parameters
    ----------
    obs : DataArray
        Observed time series.
    sim : DataArray
        Simulated time series.

    Returns
    -------
    float
        Beta KGE term.

    References
    ----------
    .. [#] Gupta, H. V., Kling, H., Yilmaz, K. K., & Martinez, G. F. (2009). Decomposition of the mean squared
        error and NSE performance criteria: Implications for improving hydrological modelling. Journal of
        hydrology, 377(1-2), 80-91.
    """
    # verify inputs
    _validate_inputs(obs, sim)

    # get time series with only valid observations
    obs, sim = _mask_valid(obs, sim)

    return float(sim.mean() / obs.mean())
def baseflow_index(da: DataArray, alpha: float = 0.98, warmup: int = 30) -> Tuple[float, DataArray]:
    """Currently only implemented for daily flows (i.e. 3 passes, see Section 2.3 of Ladson et al. 2013)."""
    # create numpy array from streamflow and add the mirrored discharge of length 'warmup' to the start and end
    streamflow = np.zeros((da.size + 2 * warmup))
    streamflow[warmup:-warmup] = da.values
    streamflow[:warmup] = da.values[1:warmup + 1][::-1]
    streamflow[-warmup:] = da.values[-warmup - 1:-1][::-1]

    # call jit compiled function to calculate baseflow
    bf_index, baseflow = _baseflow_index_jit(streamflow, alpha, warmup)

    # parse baseflow as a DataArray using the coordinates of the streamflow array
    da_baseflow = da.copy()
    da_baseflow.data = baseflow

    return bf_index, da_baseflow
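# `_baseflow_index_jit` is used above but not shown. For illustration only, a
# single forward pass of the Lyne-Hollick digital filter it plausibly wraps
# (the docstring mentions three passes, so this is a simplified sketch, not
# the real helper):
import numpy as np

def _lyne_hollick_pass(streamflow: np.ndarray, alpha: float = 0.98) -> np.ndarray:
    quickflow = np.zeros_like(streamflow)
    baseflow = np.zeros_like(streamflow)
    baseflow[0] = streamflow[0]
    for t in range(1, len(streamflow)):
        quickflow[t] = alpha * quickflow[t - 1] + 0.5 * (1 + alpha) * (streamflow[t] - streamflow[t - 1])
        # baseflow is what remains after removing the (non-negative) quickflow
        baseflow[t] = streamflow[t] - max(quickflow[t], 0.0)
    return np.minimum(baseflow, streamflow)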