Beispiel #1
0
    def _load_basin_data(self, basin: str) -> pd.DataFrame:
        """Assemble the forcing and target data of one basin into a single DataFrame.

        Every entry of ``self.cfg.forcings`` is loaded: names ending in ``_hourly`` via ``self.load_hourly_data``,
        all others via ``load_camels_us_forcings`` followed by a forward-filled resampling to '1H'. If more than one
        forcing set is configured, all columns without 'qobs' in their name get the forcing name appended as suffix.
        Negative values in 'qobs' columns are replaced by NaN. Depending on ``self.cfg.target_variables``, the columns
        'gauge_height_m' and 'synthetic_qobs_stage_meters' are added.

        Parameters
        ----------
        basin : str
            Id of the basin to load.

        Returns
        -------
        pd.DataFrame
            Column-wise concatenation of all forcing sets, plus the optional stage columns.

        Raises
        ------
        ValueError
            If none of the configured forcings ends in '_hourly'.
        """
        forcing_names = self.cfg.forcings
        if all(not name.endswith('_hourly') for name in forcing_names):
            raise ValueError('Forcings include no hourly forcings set.')

        # columns only need a disambiguating suffix when several forcing sets are combined
        needs_suffix = len(forcing_names) > 1

        frames = []
        for forcing in forcing_names:
            if forcing.endswith('_hourly'):
                frame = self.load_hourly_data(basin, forcing)
            else:
                # non-hourly forcings are loaded with the CAMELS loader and forward-filled onto a '1H' index
                frame, _ = load_camels_us_forcings(self.cfg.data_dir, basin, forcing)
                frame = frame.resample('1H').ffill()
            if needs_suffix:
                # discharge ('qobs') columns keep their original name
                renames = {}
                for col in frame.columns:
                    if 'qobs' not in col.lower():
                        renames[col] = f"{col}_{forcing}"
                frame = frame.rename(columns=renames)
            frames.append(frame)
        df = pd.concat(frames, axis=1)

        # negative discharge is treated as invalid and replaced by NaN
        for col in [name for name in df.columns if 'qobs' in name.lower()]:
            negative = df[col] < 0
            df.loc[negative, col] = np.nan

        # optional target: observed stage, negative values are replaced by NaN
        if 'gauge_height_m' in self.cfg.target_variables:
            stage = load_hourly_us_stage(self.cfg.data_dir, basin)
            df = df.join(stage)
            negative = df['gauge_height_m'] < 0
            df.loc[negative, 'gauge_height_m'] = np.nan

        if 'synthetic_qobs_stage_meters' not in self.cfg.target_variables:
            return df

        # optional target: 'synthetic' stage, derived from discharge through the basin's rating curve
        attributes = load_camels_us_attributes(data_dir=self.cfg.data_dir, basins=[basin])
        # NOTE(review): unpickling executes arbitrary code, so the rating curve file must come from a trusted source
        with open(self.cfg.rating_curve_file, 'rb') as curve_file:
            rating_curves = pickle.load(curve_file)

        # stays NaN for basins without a rating curve
        df['synthetic_qobs_stage_meters'] = np.nan
        if basin in rating_curves.keys():
            qobs_mm_per_hour = df['qobs_mm_per_hour'].values
            basin_area = attributes.area_gages2[basin]
            # mm/h -> m/h, times area (the 1e6 factor presumably converts km^2 to m^2), per-hour -> per-second
            discharge_m3s = qobs_mm_per_hour / 1000 * basin_area * 1e6 / 60**2
            curve = rating_curves[basin]
            df['synthetic_qobs_stage_meters'] = curve.discharge_to_stage(discharge_m3s)

        return df
def calculate_camels_us_dyn_climate_indices(data_dir: Path,
                                         basins: List[str],
                                         window_length: int,
                                         forcings: str,
                                         variable_names: Dict[str, str] = None,
                                         output_file: Path = None) -> Dict[str, pd.DataFrame]:
    """Calculate dynamic climate indices for the CAMELS US dataset.
    
    Compared to the long-term static climate indices included in the CAMELS US data set, this function computes the same
    climate indices by a moving window approach over the entire data set. That is, for each time step, the climate 
    indices are re-computed from the last `window_length` time steps. The resulting dictionary of DataFrames can be
    used with the `additional_feature_files` argument.
    Unlike in CAMELS, the '_freq' indices will be fractions, not number of days. To compare the values to the ones in
    CAMELS, they need to be multiplied by 365.25.
    
    Parameters
    ----------
    data_dir : Path
        Path to the CAMELS US directory.
    basins : List[str]
        List of basin ids.
    window_length : int
        Look-back period to use to compute the climate indices.
    forcings : str
        Can be e.g. 'daymet' or 'nldas', etc. Must match the folder names in the 'basin_mean_forcing' directory.
    variable_names : Dict[str, str], optional
        Mapping of the forcings' variable names, needed if forcings other than DayMet, Maurer, or NLDAS are used.
        If provided, this must be a dictionary that maps the keys 'prcp', 'tmin', 'tmax', 'srad' to the forcings'
        respective variable names.
    output_file : Path, optional
        If specified, stores the resulting dictionary of DataFrames to this location as a pickle dump.

    Returns
    -------
    Dict[str, pd.DataFrame]
        Dictionary with one time-indexed DataFrame per basin. By definition, the climate indices for a given day in the
        DataFrame are computed from the `window_length` previous time steps (including the given day).

    Raises
    ------
    ValueError
        If `variable_names` is not provided and `forcings` has no predefined variable mapping. This is checked before
        any data is loaded.
    ValueError
        If the computed climate indices of a basin contain NaN values.
    """
    # Resolve the column mapping first, so that an unsupported forcing product fails before any file is read.
    if variable_names is None:
        if forcings.startswith('nldas'):
            variable_names = {'prcp': 'PRCP(mm/day)', 'tmin': 'Tmin(C)', 'tmax': 'Tmax(C)', 'srad': 'SRAD(W/m2)'}
        elif forcings.startswith(('daymet', 'maurer')):
            variable_names = {'prcp': 'prcp(mm/day)', 'tmin': 'tmin(C)', 'tmax': 'tmax(C)', 'srad': 'srad(W/m2)'}
        else:
            raise ValueError(f'No predefined variable mapping for {forcings} forcings. Provide one in variable_names.')

    camels_attributes = load_camels_us_attributes(data_dir=data_dir, basins=basins)
    additional_features = {}

    for basin in tqdm(basins, file=sys.stdout):
        df, _ = load_camels_us_forcings(data_dir=data_dir, basin=basin, forcings=forcings)

        # latitude and mean elevation of the basin are inputs of the PET computation
        basin_mask = camels_attributes.index == basin
        lat = camels_attributes.loc[basin_mask, 'gauge_lat'].values
        elev = camels_attributes.loc[basin_mask, 'elev_mean'].values
        df["PET(mm/d)"] = pet.get_priestley_taylor_pet(t_min=df[variable_names['tmin']].values,
                                                       t_max=df[variable_names['tmax']].values,
                                                       s_rad=df[variable_names['srad']].values,
                                                       lat=lat,
                                                       elev=elev,
                                                       doy=df.index.dayofyear.values)

        clim_indices = calculate_dyn_climate_indices(df[variable_names['prcp']],
                                                     df[variable_names['tmax']],
                                                     df[variable_names['tmin']],
                                                     df['PET(mm/d)'],
                                                     window_length=window_length)

        # check before the reindex below, which introduces NaN rows on purpose
        if np.any(clim_indices.isna()):
            raise ValueError(f"NaN in new features of basin {basin}")

        clim_indices = clim_indices.reindex(df.index)  # add NaN rows for the first window_length - 1 entries
        additional_features[basin] = clim_indices

    if output_file is not None:
        with output_file.open("wb") as fp:
            pickle.dump(additional_features, fp)
        LOGGER.info("Precalculated features successfully stored at %s", output_file)

    return additional_features
    def _load_basin_data(self, basin: str) -> pd.DataFrame:
        """Assemble the forcing and target data of one basin into a single DataFrame.

        Every entry of ``self.cfg.forcings`` is loaded: names ending in ``_hourly`` via ``self.load_hourly_data``,
        all others via ``camelsus.load_camels_us_forcings`` followed by a forward-filled resampling to '1H'. If more
        than one forcing set is configured, all columns without 'qobs' in their name get the forcing name appended as
        suffix. If any target variable or dynamic input starts with 'QObs(mm/d)', the CAMELS discharge is added as
        forward-filled '1H' column 'QObs(mm/d)'. Negative values in 'qobs' columns are replaced by NaN. Depending on
        ``self.cfg.target_variables``, the columns 'gauge_height_m' and 'synthetic_qobs_stage_meters' are added.

        As a side effect, ``self._warn_slow_loading`` is set to False.

        Parameters
        ----------
        basin : str
            Id of the basin to load.

        Returns
        -------
        pd.DataFrame
            Column-wise concatenation of all forcing sets, plus the optional discharge and stage columns.

        Raises
        ------
        ValueError
            If none of the configured forcings ends in '_hourly'.
        """
        forcing_names = self.cfg.forcings
        if all(not name.endswith('_hourly') for name in forcing_names):
            raise ValueError('Forcings include no hourly forcings set.')

        # columns only need a disambiguating suffix when several forcing sets are combined
        needs_suffix = len(forcing_names) > 1

        frames = []
        for forcing in forcing_names:
            if forcing.endswith('_hourly'):
                frame = self.load_hourly_data(basin, forcing)
            else:
                # non-hourly forcings are loaded with the CAMELS loader and forward-filled onto a '1H' index
                frame, _ = camelsus.load_camels_us_forcings(self.cfg.data_dir, basin, forcing)
                frame = frame.resample('1H').ffill()
            if needs_suffix:
                # discharge ('qobs') columns keep their original name
                renames = {}
                for col in frame.columns:
                    if 'qobs' not in col.lower():
                        renames[col] = f"{col}_{forcing}"
                frame = frame.rename(columns=renames)
            frames.append(frame)
        df = pd.concat(frames, axis=1)

        # gather targets and dynamic inputs (list, or dict of lists) in one flat list of feature names
        dynamic_inputs = self.cfg.dynamic_inputs
        feature_names = self.cfg.target_variables
        if isinstance(dynamic_inputs, dict):
            for group in dynamic_inputs.values():
                feature_names = feature_names + group
        elif isinstance(dynamic_inputs, list):
            feature_names = feature_names + dynamic_inputs

        # prefix match, so that derived names such as QObs(mm/d)_shiftX or _copyX are caught as well
        if any(name.startswith("QObs(mm/d)") for name in feature_names):
            # the daymet forcings are loaded only to obtain the basin area needed by the discharge loader
            _, area = camelsus.load_camels_us_forcings(self.cfg.data_dir, basin, "daymet")
            daily_discharge = camelsus.load_camels_us_discharge(self.cfg.data_dir, basin, area)
            df["QObs(mm/d)"] = daily_discharge.resample('1H').ffill()

        # only warn for missing netcdf files once for each forcing product
        self._warn_slow_loading = False

        # negative discharge is treated as invalid and replaced by NaN
        for col in [name for name in df.columns if 'qobs' in name.lower()]:
            negative = df[col] < 0
            df.loc[negative, col] = np.nan

        # optional target: observed stage, negative values are replaced by NaN
        if 'gauge_height_m' in self.cfg.target_variables:
            stage = load_hourly_us_stage(self.cfg.data_dir, basin)
            df = df.join(stage)
            negative = df['gauge_height_m'] < 0
            df.loc[negative, 'gauge_height_m'] = np.nan

        if 'synthetic_qobs_stage_meters' not in self.cfg.target_variables:
            return df

        # optional target: 'synthetic' stage, derived from discharge through the basin's rating curve
        attributes = camelsus.load_camels_us_attributes(data_dir=self.cfg.data_dir, basins=[basin])
        # NOTE(review): unpickling executes arbitrary code, so the rating curve file must come from a trusted source
        with open(self.cfg.rating_curve_file, 'rb') as curve_file:
            rating_curves = pickle.load(curve_file)

        # stays NaN for basins without a rating curve
        df['synthetic_qobs_stage_meters'] = np.nan
        if basin in rating_curves.keys():
            qobs_mm_per_hour = df['qobs_mm_per_hour'].values
            basin_area = attributes.area_gages2[basin]
            # mm/h -> m/h, times area (the 1e6 factor presumably converts km^2 to m^2), per-hour -> per-second
            discharge_m3s = qobs_mm_per_hour / 1000 * basin_area * 1e6 / 60**2
            curve = rating_curves[basin]
            df['synthetic_qobs_stage_meters'] = curve.discharge_to_stage(discharge_m3s)

        return df