Example #1
def test_infer_frequency():
    """Test the logic to infer frequencies. """
    assert infer_frequency(pd.date_range('2000-01-01', '2000-01-10', freq='D')) == '1D'
    assert infer_frequency(pd.date_range('2000-01-01', '2000-01-10', freq='48H')) == '2D'
    assert infer_frequency(pd.date_range('2000-01-01', '2000-03-01', freq='W-MON')) == '1W-MON'

    # just a single date
    pytest.raises(ValueError, infer_frequency, [pd.to_datetime('2000-01-01')])
    # frequency of zero
    pytest.raises(ValueError, infer_frequency,
                  [pd.to_datetime('2000-01-01'),
                   pd.to_datetime('2000-01-01'),
                   pd.to_datetime('2000-01-01')])
    # irregular dates
    pytest.raises(ValueError, infer_frequency,
                  [pd.to_datetime('2000-01-01'),
                   pd.to_datetime('2000-01-02'),
                   pd.to_datetime('2000-01-04')])
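The tests above pin down the expected behaviour of `infer_frequency`: return a frequency string with an explicit multiplier (e.g. '1D', '2D', '1W-MON') for evenly spaced dates, and raise a ValueError for a single date, a spacing of zero, or irregular spacing. A minimal sketch consistent with these tests (not necessarily the library's actual implementation) could look like this:

import numpy as np
import pandas as pd


def infer_frequency(datetimes) -> str:
    """Infer a frequency string such as '1D' or '1W-MON' (illustrative sketch)."""
    index = pd.DatetimeIndex(datetimes)
    if len(index) < 2:
        raise ValueError('Need at least two dates to infer a frequency.')
    if len(np.unique(np.diff(index.values))) > 1:
        raise ValueError('Dates are irregularly spaced, cannot infer a frequency.')
    if index[1] - index[0] == pd.Timedelta(0):
        raise ValueError('Inferred frequency is zero.')

    freq = pd.infer_freq(index)  # e.g. 'D', '2D', 'W-MON'
    if freq is None:
        raise ValueError('pandas could not infer a frequency.')
    # prepend an explicit multiplier so that '1D' and '2D' are handled uniformly downstream
    return freq if freq[0].isdigit() else f'1{freq}'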
Example #2
def baseflow_index(da: DataArray,
                   alpha: float = 0.98,
                   warmup: int = 30,
                   n_passes: int = None,
                   datetime_coord: str = None) -> Tuple[float, DataArray]:
    """Calculate baseflow index.

    Ratio of mean baseflow to mean discharge [#]_. If `da` contains NaN values, the baseflow is calculated for each
    consecutive segment of more than `warmup` non-NaN values.

    Parameters
    ----------
    da : DataArray
        Array of flow values.
    alpha : float, optional
        alpha filter parameter.
    warmup : int, optional
        Number of warmup steps.
    n_passes : int, optional
        Number of passes (alternating forward and backward) to perform. Should be an odd number. If None, will use
        3 for daily and 9 for hourly data and fail for all other input frequencies.
    datetime_coord : str, optional
        Datetime coordinate in the passed DataArray. Inferred automatically if not specified. Used to infer the
        frequency if `n_passes` is None.

    Returns
    -------
    Tuple[float, DataArray]
        Baseflow index and baseflow array. The baseflow array contains NaNs wherever no baseflow was
        calculated due to NaNs in `da`.

    Raises
    ------
    ValueError
        If `da` has a frequency other than daily or hourly and `n_passes` is None.

    References
    ----------
    .. [#] Ladson, T. R., Brown, R., Neal, B., and Nathan, R.: A Standard Approach to Baseflow Separation Using The
        Lyne and Hollick Filter. Australasian Journal of Water Resources, Taylor & Francis, 2013, 17, 25--34,
        doi:10.7158/13241583.2013.11465417
    """
    if datetime_coord is None:
        datetime_coord = utils.infer_datetime_coord(da)

    if n_passes is None:
        freq = utils.infer_frequency(da[datetime_coord].values)
        if freq == '1D':
            n_passes = 3
        elif freq == '1H':
            n_passes = 9
        else:
            raise ValueError(
                'For frequencies other than daily or hourly, n_passes must be specified.'
            )
    if n_passes % 2 != 1:
        warnings.warn(
            'n_passes should be an odd number. The returned baseflow will be reversed.'
        )

    # call jit compiled function to calculate baseflow
    bf_index, baseflow = _baseflow_index_jit(da.values, alpha, warmup,
                                             n_passes)

    # parse baseflow as a DataArray using the coordinates of the streamflow array
    da_baseflow = da.copy()
    da_baseflow.data = baseflow

    return bf_index, da_baseflow
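The jit-compiled helper `_baseflow_index_jit` is not shown in this example. Based on the Ladson et al. (2013) reference in the docstring, one forward pass of the Lyne and Hollick filter can be sketched as follows; this is a simplified, non-jitted illustration that omits the NaN-segment handling, the warmup cutoff, and the alternating forward/backward passes of the real helper:

import numpy as np


def _lyne_hollick_pass(streamflow: np.ndarray, alpha: float = 0.98) -> np.ndarray:
    """Single forward pass of the Lyne-Hollick filter (illustrative sketch)."""
    quickflow = np.zeros_like(streamflow)
    baseflow = np.zeros_like(streamflow)
    baseflow[0] = streamflow[0]
    for t in range(1, len(streamflow)):
        # filtered quickflow response (Ladson et al., 2013)
        quickflow[t] = alpha * quickflow[t - 1] + 0.5 * (1 + alpha) * (streamflow[t] - streamflow[t - 1])
        # the remainder is baseflow; where the quickflow is non-positive, all flow is treated as baseflow
        baseflow[t] = streamflow[t] - quickflow[t] if quickflow[t] > 0 else streamflow[t]
    return baseflow

The baseflow index itself would then be the ratio of the means, roughly `baseflow.mean() / streamflow.mean()`, computed over the steps after the warmup period.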
Example #3
    def _load_or_create_xarray_dataset(self) -> xarray.Dataset:
        # if no netCDF file is passed, data set is created from raw basin files
        if (self.cfg.train_data_file is None) or (not self.is_train):
            data_list = []

            # list of columns to keep, everything else will be removed to reduce memory footprint
            keep_cols = self.cfg.target_variables + self.cfg.evolving_attributes
            if isinstance(self.cfg.dynamic_inputs, list):
                keep_cols += self.cfg.dynamic_inputs
            else:
                # keep all frequencies' dynamic inputs
                keep_cols += [
                    i for inputs in self.cfg.dynamic_inputs.values()
                    for i in inputs
                ]
            # make sure that even inputs that are used in multiple frequencies occur only once in the df
            keep_cols = list(sorted(set(keep_cols)))

            if not self._disable_pbar:
                LOGGER.info("Loading basin data into xarray data set.")
            for basin in tqdm(self.basins,
                              disable=self._disable_pbar,
                              file=sys.stdout):
                df = self._load_basin_data(basin)

                # add columns from dataframes passed as additional data files
                df = pd.concat(
                    [df, *[d[basin] for d in self.additional_features]],
                    axis=1)

                # check if any feature should be duplicated
                df = self._duplicate_features(df)

                # check if a shifted copy of a feature should be added
                df = self._add_lagged_features(df)

                # remove unnecessary columns
                df = df[keep_cols]

                # make end_date the last second of the specified day, such that the
                # dataset will include all hours of the last day, not just 00:00.
                start_dates = self.dates[basin]["start_dates"]
                end_dates = [
                    date + pd.Timedelta(days=1, seconds=-1)
                    for date in self.dates[basin]["end_dates"]
                ]

                native_frequency = utils.infer_frequency(df.index)
                if not self.frequencies:
                    self.frequencies = [
                        native_frequency
                    ]  # use df's native resolution by default
                if any([
                        pd.to_timedelta(freq) <
                        pd.to_timedelta(native_frequency)
                        for freq in self.frequencies
                ]):
                    raise ValueError(
                        f'Frequency is higher than native data frequency {native_frequency}.'
                    )

                # get maximum warmup-offset across all frequencies
                offset = max([(self.seq_len[i] - self._predict_last_n[i]) *
                              pd.to_timedelta(freq)
                              for i, freq in enumerate(self.frequencies)])

                # create xarray data set for each period slice of the specific basin
                for i, (start_date,
                        end_date) in enumerate(zip(start_dates, end_dates)):
                    # add a warmup period so that we can make predictions at the first time step specified by the period
                    warmup_start_date = start_date - offset
                    df_sub = df[warmup_start_date:end_date]

                    # make sure the df covers the full date range from warmup_start_date to end_date, filling any gaps
                    # with NaNs. This may increase runtime, but is a very robust way to make sure dates and predictions
                    # stay in sync. In training, the introduced NaNs will be discarded, so this only affects evaluation.
                    full_range = pd.date_range(start=warmup_start_date,
                                               end=end_date,
                                               freq=native_frequency)
                    df_sub = df_sub.reindex(
                        pd.DatetimeIndex(full_range, name=df_sub.index.name))

                    # as a double-check, set all targets before the period start to NaN
                    df_sub.loc[df_sub.index < start_date,
                               self.cfg.target_variables] = np.nan

                    # For multiple slices per basin, a number is added to the basin string starting from the 2nd slice
                    xr = xarray.Dataset.from_dataframe(df_sub)
                    basin_str = basin if i == 0 else f"{basin}_period{i}"
                    xr = xr.assign_coords({'basin': basin_str})
                    data_list.append(xr.astype(np.float32))

            # create one large dataset that has two coordinates: datetime and basin
            xr = xarray.concat(data_list, dim="basin")

            if self.is_train and self.cfg.save_train_data:
                self._save_xarray_dataset(xr)

        else:
            with self.cfg.train_data_file.open("rb") as fp:
                d = pickle.load(fp)
            xr = xarray.Dataset.from_dict(d)
            if not self.frequencies:
                native_frequency = utils.infer_frequency(xr["date"].values)
                self.frequencies = [native_frequency]

        return xr
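Example #3 compares the requested frequencies against the native one with `pd.to_timedelta` and computes a warmup offset of `(seq_len - predict_last_n) * timedelta` per frequency, keeping the maximum. A small illustration of that arithmetic with hypothetical settings (the numbers below are not taken from any actual configuration):

import pandas as pd

# hypothetical setup: hourly native data, used at daily and hourly resolution
native_frequency = '1H'
frequencies = ['1D', '1H']
seq_len = [365, 336]          # e.g. 365 daily steps and 336 (14 days of) hourly steps
predict_last_n = [1, 24]

# a requested frequency is invalid if its step is shorter than the native step
assert not any(pd.to_timedelta(f) < pd.to_timedelta(native_frequency) for f in frequencies)

# warmup offset per frequency; the largest one determines how far back the warmup period starts
offsets = [(seq_len[i] - predict_last_n[i]) * pd.to_timedelta(f) for i, f in enumerate(frequencies)]
print(max(offsets))  # 364 days, the daily sequence dominates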
Example #4
    def _load_or_create_xarray_dataset(self) -> xarray.Dataset:
        # if no netCDF file is passed, data set is created from raw basin files
        if (self.cfg.train_data_file is None) or (not self.is_train):
            data_list = []

            # list of columns to keep, everything else will be removed to reduce memory footprint
            keep_cols = self.cfg.target_variables + self.cfg.evolving_attributes + self.cfg.mass_inputs

            if isinstance(self.cfg.dynamic_inputs, list):
                keep_cols += self.cfg.dynamic_inputs
            else:
                # keep all frequencies' dynamic inputs
                keep_cols += [i for inputs in self.cfg.dynamic_inputs.values() for i in inputs]
            # make sure that even inputs that are used in multiple frequencies occur only once in the df
            keep_cols = list(sorted(set(keep_cols)))

            if not self._disable_pbar:
                LOGGER.info("Loading basin data into xarray data set.")
            for basin in tqdm(self.basins, disable=self._disable_pbar, file=sys.stdout):
                df = self._load_basin_data(basin)

                # add columns from dataframes passed as additional data files
                df = pd.concat([df, *[d[basin] for d in self.additional_features]], axis=1)

                # check if any feature should be duplicated
                df = self._duplicate_features(df)

                # check if a shifted copy of a feature should be added
                df = self._add_lagged_features(df)

                # remove unnecessary columns
                try:
                    df = df[keep_cols]
                except KeyError:
                    not_available_columns = [x for x in keep_cols if x not in df.columns]
                    msg = [
                        f"The following features are not available in the data: {not_available_columns}. ",
                        f"These are the available features: {df.columns.tolist()}"
                    ]
                    raise KeyError("".join(msg))

                # make end_date the last second of the specified day, such that the
                # dataset will include all hours of the last day, not just 00:00.
                start_dates = self.dates[basin]["start_dates"]
                end_dates = [date + pd.Timedelta(days=1, seconds=-1) for date in self.dates[basin]["end_dates"]]

                native_frequency = utils.infer_frequency(df.index)
                if not self.frequencies:
                    self.frequencies = [native_frequency]  # use df's native resolution by default

                # Assert that the used frequencies are lower or equal than the native frequency. There may be cases
                # where our logic cannot determine whether this is the case, because pandas might return an exotic
                # native frequency. In this case, all we can do is print a warning and let the user check themselves.
                try:
                    freq_vs_native = [utils.compare_frequencies(freq, native_frequency) for freq in self.frequencies]
                except ValueError:
                    LOGGER.warning('Cannot compare provided frequencies with native frequency. '
                                   'Make sure the frequencies are not higher than the native frequency.')
                    freq_vs_native = []
                if any(comparison > 1 for comparison in freq_vs_native):
                    raise ValueError(f'Frequency is higher than native data frequency {native_frequency}.')

                # used to get the maximum warmup-offset across all frequencies. We don't use to_timedelta because it
                # does not support all frequency strings. We can't calculate the maximum offset here, because to
                # compare offsets, they need to be anchored to a specific date (here, the start date).
                offsets = [(self.seq_len[i] - self._predict_last_n[i]) * to_offset(freq)
                           for i, freq in enumerate(self.frequencies)]

                # create xarray data set for each period slice of the specific basin
                for i, (start_date, end_date) in enumerate(zip(start_dates, end_dates)):
                    # if the start date is not aligned with the frequency, the resulting datetime indices will be off
                    if not all(to_offset(freq).is_on_offset(start_date) for freq in self.frequencies):
                        misaligned = [freq for freq in self.frequencies if not to_offset(freq).is_on_offset(start_date)]
                        raise ValueError(f'start date {start_date} is not aligned with frequencies {misaligned}.')
                    # add a warmup period so that we can make predictions at the first time step specified by the period.
                    # offsets has the warmup offset needed for each frequency; the overall warmup starts with the
                    # earliest date, i.e., the largest offset across all frequencies.
                    warmup_start_date = min(start_date - offset for offset in offsets)
                    df_sub = df[warmup_start_date:end_date]

                    # make sure the df covers the full date range from warmup_start_date to end_date, filling any gaps
                    # with NaNs. This may increase runtime, but is a very robust way to make sure dates and predictions
                    # stay in sync. In training, the introduced NaNs will be discarded, so this only affects evaluation.
                    full_range = pd.date_range(start=warmup_start_date, end=end_date, freq=native_frequency)
                    df_sub = df_sub.reindex(pd.DatetimeIndex(full_range, name=df_sub.index.name))

                    # as a double-check, set all targets before the period start to NaN
                    df_sub.loc[df_sub.index < start_date, self.cfg.target_variables] = np.nan

                    # For multiple slices per basin, a number is added to the basin string starting from the 2nd slice
                    xr = xarray.Dataset.from_dataframe(df_sub)
                    basin_str = basin if i == 0 else f"{basin}_period{i}"
                    xr = xr.assign_coords({'basin': basin_str})
                    data_list.append(xr.astype(np.float32))

            # create one large dataset that has two coordinates: datetime and basin
            xr = xarray.concat(data_list, dim="basin")

            if self.is_train and self.cfg.save_train_data:
                self._save_xarray_dataset(xr)

        else:
            with self.cfg.train_data_file.open("rb") as fp:
                d = pickle.load(fp)
            xr = xarray.Dataset.from_dict(d)
            if not self.frequencies:
                native_frequency = utils.infer_frequency(xr["date"].values)
                self.frequencies = [native_frequency]

        return xr
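Compared to Example #3, this version uses `to_offset` instead of `pd.to_timedelta` because, as its comment notes, `to_timedelta` does not support all frequency strings; anchored frequencies such as 'W-MON' have no fixed length. A short check of that distinction, assuming `to_offset` comes from `pandas.tseries.frequencies` (the import is not shown in the example):

import pandas as pd
from pandas.tseries.frequencies import to_offset

pd.to_timedelta('1D')        # works: a day is a fixed-length step
# pd.to_timedelta('1W-MON')  # raises ValueError: anchored weekly offsets are not fixed-length

# offsets can be scaled and subtracted once anchored to a concrete date
start_date = pd.Timestamp('2000-01-03')      # a Monday
print(start_date - 10 * to_offset('W-MON'))  # ten anchored weeks earlier

# is_on_offset is what the example uses to verify start dates are aligned with each frequency
print(to_offset('W-MON').is_on_offset(start_date))  # True, 2000-01-03 is a Monday
print(to_offset('1D').is_on_offset(start_date))     # True, daily offsets accept any timestamp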