def test_error_column_missing(self):
     """Test that an error is raised if dataframe does not contain one of
     REPRESENTATION_COLUMNS."""
     msg = "None of the columns"
     with self.assertRaisesRegex(ValueError, msg):
         forecast_and_truth_dataframes_to_cubes(
             self.forecast_df.drop(columns=["percentile"]),
             self.truth_subset_df,
             self.cycletime,
             self.forecast_period,
             self.training_length,
         )
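
# These tests rely on fixtures built in the test class's setUp (not shown
# on this page). A minimal sketch of the two input DataFrames, with
# hypothetical values; the column sets are taken from the process()
# docstring further down.
import pandas as pd

forecast_df_sketch = pd.DataFrame(
    {
        "forecast": [277.1],
        "blend_time": [pd.Timestamp("2017-01-08T00:00Z")],
        "forecast_period": [pd.Timedelta(6, unit="hours")],
        "forecast_reference_time": [pd.Timestamp("2017-01-08T00:00Z")],
        "time": [pd.Timestamp("2017-01-08T06:00Z")],
        "wmo_id": ["03002"],
        "percentile": [50.0],
        "diagnostic": ["air_temperature"],
        "latitude": [60.7],
        "longitude": [11.1],
        "period": [pd.Timedelta(1, unit="hours")],
        "height": [1.5],
        "cf_name": ["air_temperature"],
        "units": ["K"],
    }
)
truth_df_sketch = pd.DataFrame(
    {
        "ob_value": [276.4],
        "time": [pd.Timestamp("2017-01-08T06:00Z")],
        "wmo_id": ["03002"],
        "diagnostic": ["air_temperature"],
        "latitude": [60.7],
        "longitude": [11.1],
        "altitude": [10.0],
    }
)
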
 def test_not_quantiles(self):
     """Test if the percentiles can not be considered to be quantiles."""
     forecast_df = self.forecast_df.copy()
     forecast_df = forecast_df.replace({"percentile": self.percentiles[0]},
                                       10.0)
     msg = "The forecast percentiles can not be considered as quantiles"
     with self.assertRaisesRegex(ValueError, msg):
         forecast_and_truth_dataframes_to_cubes(
             forecast_df,
             self.truth_subset_df,
             self.cycletime,
             self.forecast_period,
             self.training_length,
         )
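
# A sketch of the "equally spaced quantiles" condition the test above
# exercises (a plausible reconstruction, not necessarily improver's exact
# check): n percentiles behave as quantiles when they sit at
# 100 * i / (n + 1) for i = 1..n, e.g. [25, 50, 75] for n = 3.
import numpy as np

def equally_spaced_quantiles(percentiles):
    n = len(percentiles)
    expected = 100 * np.arange(1, n + 1) / (n + 1)
    return np.allclose(np.sort(percentiles), expected)

# Replacing the first percentile with 10.0, as the test does, breaks the
# spacing and should trigger the ValueError.
assert equally_spaced_quantiles([25.0, 50.0, 75.0])
assert not equally_spaced_quantiles([10.0, 50.0, 75.0])
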
 def test_truth_missing_compulsory_columns(self):
     """Test if there are missing compulsory columns in the truth
     dataframe."""
     df = self.truth_subset_df.copy()
     df = df.rename(columns={"diagnostic": "diag"})
     msg = "The following compulsory column\\(s\\) are missing"
     with self.assertRaisesRegex(ValueError, msg):
         forecast_and_truth_dataframes_to_cubes(
             self.forecast_df,
             df,
             self.cycletime,
             self.forecast_period,
             self.training_length,
         )
 def test_error_multiple_columns(self):
     """Test that an error is raised if dataframe contains more than one of
     REPRESENTATION_COLUMNS."""
     msg = "More than one column"
     df = self.forecast_df.copy()
     df["realization"] = 0
     with self.assertRaisesRegex(ValueError, msg):
         forecast_and_truth_dataframes_to_cubes(
             df,
             self.truth_subset_df,
             self.cycletime,
             self.forecast_period,
             self.training_length,
         )
 def test_error_multiple_experiment_values(self):
     """Test an error is raised if multiple experiment values are in
     the dataframe."""
     experiment2 = self.forecast_df.copy()
     experiment2["experiment"] = "threshold"
     forecast_df = pd.concat([self.forecast_df, experiment2])
     msg = "More than one value for the experiment column found"
     with self.assertRaisesRegex(ValueError, msg):
         forecast_and_truth_dataframes_to_cubes(
             forecast_df,
             self.truth_subset_df,
             self.cycletime,
             self.forecast_period,
             self.training_length,
         )
 def test_forecast_missing_columns_and_additional_columns(self):
     """Test if there are missing compulsory columns in the forecast
     dataframe and there are additional non-compulsory columns."""
     df = self.forecast_df.copy()
     df["station_id"] = "11111"
     df = df.rename(columns={"diagnostic": "diag"})
     msg = "The following compulsory column\\(s\\) are missing"
     with self.assertRaisesRegex(ValueError, msg):
         forecast_and_truth_dataframes_to_cubes(
             df,
             self.truth_subset_df,
             self.cycletime,
             self.forecast_period,
             self.training_length,
         )
    def test_duplicate_row_truths(self):
        """Test that a truth cube is still produced if duplicated
        truths for a given validity time are provided."""
        # Duplicate first row twice.
        truth_df_with_duplicates = pd.concat(
            [
                self.truth_subset_df,
                self.truth_subset_df.iloc[[0]],
                self.truth_subset_df.iloc[[0]],
            ],
            ignore_index=True,
        )
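        # Alter the original row and the first duplicate so that only the
        # values carried by the last duplicate can match the expected cube.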
        truth_df_with_duplicates.at[0, "ob_value"] = 6.0
        truth_df_with_duplicates.at[9, "ob_value"] = 8.0
        result = forecast_and_truth_dataframes_to_cubes(
            self.forecast_df,
            truth_df_with_duplicates,
            self.cycletime,
            self.forecast_period,
            self.training_length,
        )

        self.assertEqual(len(result), 2)
        self.assertCubeEqual(result[0], self.expected_period_forecast)
        self.assertCubeEqual(result[1], self.expected_period_truth)
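
# The expected truth cube above is unchanged, which implies duplicates are
# resolved by keeping the last occurrence. A self-contained sketch of that
# behaviour, with hypothetical key columns:
import pandas as pd

df = pd.DataFrame(
    {
        "time": ["t0", "t0", "t0"],
        "wmo_id": ["03002", "03002", "03002"],
        "ob_value": [6.0, 8.0, 5.0],
    }
)
deduped = df.drop_duplicates(subset=["time", "wmo_id"], keep="last")
# Only the final row (ob_value == 5.0) survives.
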
 def test_basic(self):
     """Test the expected cubes are generated from the input dataframes."""
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df,
         self.truth_subset_df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast)
     self.assertCubeEqual(result[1], self.expected_period_truth)
 def test_station_id_and_wmo_id(self):
     """Test that when station_id is present in both forecast and truth dataframes,
     output cubes contain station_id coordinate."""
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df_station_id,
         self.truth_df_station_id,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_forecast_station_id)
     self.assertCubeEqual(result[1], self.expected_truth_station_id)
 def test_truth_additional_columns_present(self):
     """Test that if there are additional columns present
     in the truth dataframe, these have no impact."""
     df = self.truth_subset_df.copy()
     df["station_id"] = "11111"
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df,
         df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
 def test_no_forecasts_for_a_time(self):
     """Test for a time point having no forecasts."""
     forecast_df = self.forecast_df[self.forecast_df["time"].isin(
         [self.time2, self.time3])]
     result = forecast_and_truth_dataframes_to_cubes(
         forecast_df,
         self.truth_subset_df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast[:, 1:])
     self.assertCubeEqual(result[1], self.expected_period_truth[1:])
 def test_duplicate_cycle_truths(self):
     """Test that a truth cube is still produced if duplicate
     truths for a given validity time are provided."""
     truth_df_with_duplicates = pd.concat(
         [self.truth_subset_df, self.truth_subset_df.iloc[:3]],
         ignore_index=True)
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df,
         truth_df_with_duplicates,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast)
     self.assertCubeEqual(result[1], self.expected_period_truth)
 def test_duplicate_cycle_forecasts(self):
     """Test that a forecast cube is still produced if a duplicated
     cycle of forecasts is provided."""
     forecast_df_with_duplicates = pd.concat(
         [self.forecast_df, self.forecast_df.iloc[:9]],
         ignore_index=True)
     result = forecast_and_truth_dataframes_to_cubes(
         forecast_df_with_duplicates,
         self.truth_subset_df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast)
     self.assertCubeEqual(result[1], self.expected_period_truth)
 def test_missing_observation_at_end(self):
     """Test a truth DataFrame with one missing observation
     within the last row of the dataframe is converted correctly
     into an iris Cube."""
     df = self.truth_subset_df.drop(8)
     self.expected_period_truth.data[-1, -1] = np.nan
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df,
         df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast)
     self.assertCubeEqual(result[1], self.expected_period_truth)
 def test_site_absent_from_truth(self):
     """Test for when a site is absent from the truth dataframe."""
     df = self.truth_subset_df.copy()
     df = df.loc[df["wmo_id"].isin(self.wmo_ids[:-1])]
     expected_forecast = self.expected_period_forecast[:, :, :-1]
     expected_truth = self.expected_period_truth[:, :-1]
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df,
         df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], expected_forecast)
     self.assertCubeEqual(result[1], expected_truth)
 def test_site_coord_mismatch(self):
     """Test for a mismatch in the location of a site between the truths
     and forecasts. In this case, the position (lat/lon/alt) from the
     forecast will be used."""
     df = self.truth_subset_df.copy()
     df.at[::3, "altitude"] = 45
     df.at[::3, "latitude"] = 52
     df.at[::3, "longitude"] = -12
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df,
         df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast)
     self.assertCubeEqual(result[1], self.expected_period_truth)
 def test_units_in_truth(self):
     """Test that if truth_df contains a units column, it is used
     for units of truth output cube."""
     truth_df = self.truth_subset_df.copy()
     truth_df["units"] = "Fahrenheit"
     truth_df["ob_value"] = truth_df["ob_value"] + 30
     expected_truth = self.expected_period_truth.copy()
     expected_truth.units = "Fahrenheit"
     expected_truth.data = expected_truth.data + 30
     result = forecast_and_truth_dataframes_to_cubes(
         self.forecast_df,
         truth_df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast)
     self.assertCubeEqual(result[1], expected_truth)
 def test_select_single_experiment_value(self):
     """Test selecting a single experiment value from the dataframe"""
     experiment2 = self.forecast_df.copy()
     experiment2["experiment"] = "threshold"
     # Set original data to different values to make sure the correct experiment
     # is picked up
     self.forecast_df["forecast"] = 0.0
     forecast_df = pd.concat([self.forecast_df, experiment2])
     result = forecast_and_truth_dataframes_to_cubes(
         forecast_df,
         self.truth_subset_df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
         experiment="threshold",
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], self.expected_period_forecast)
     self.assertCubeEqual(result[1], self.expected_period_truth)
 def test_station_id_dummy_wmo_id(self):
     """Test that when station_id is present and wmo_id contains dummy data,
     station_id is used to match forecast and truth cubes."""
     forecast_df = self.forecast_df_station_id.copy()
     forecast_df["wmo_id"] = "00000"
     truth_df = self.truth_df_station_id.copy()
     truth_df["wmo_id"] = "00000"
     expected_truth = self.expected_truth_station_id.copy()
     expected_truth.coord("wmo_id").points = ["00000"]
     expected_forecast = self.expected_forecast_station_id.copy()
     expected_forecast.coord("wmo_id").points = ["00000"]
     result = forecast_and_truth_dataframes_to_cubes(
         forecast_df,
         truth_df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], expected_forecast)
     self.assertCubeEqual(result[1], expected_truth)
 def test_percentile_extract(self):
     """Test the desired percentiles are extracted."""
     expected_period_forecast = self.expected_period_forecast[::2]
     expected_period_forecast.coord("realization").points = np.array(
         [0, 1], dtype=np.int32)
     forecast_df = self.forecast_df.copy()
     forecast_df = forecast_df.replace({"percentile": self.percentiles[0]},
                                       100 / 3)
     forecast_df = forecast_df.replace({"percentile": self.percentiles[2]},
                                       (2 / 3) * 100)
     result = forecast_and_truth_dataframes_to_cubes(
         forecast_df,
         self.truth_subset_df,
         self.cycletime,
         self.forecast_period,
         self.training_length,
         percentiles=["33.333333", "66.666666"],
     )
     self.assertEqual(len(result), 2)
     self.assertCubeEqual(result[0], expected_period_forecast)
     self.assertCubeEqual(result[1], self.expected_period_truth)
    def test_multiday_forecast_period(self):
        """Test for a multi-day forecast period to ensure that the
        validity times within the training dataset are always in
        the past, relative to the cycletime."""
        forecast_period = 30 * 3600
        forecast_df = self.forecast_df.copy()
        forecast_df["forecast_period"] = np.timedelta64(
            forecast_period, "s").astype("timedelta64[ns]")
        for coord in ["forecast_reference_time", "blend_time"]:
            forecast_df[coord] = forecast_df[coord].replace(
                to_replace={
                    self.frt1: self.frt1 - pd.Timedelta(1, unit="days"),
                    self.frt2: self.frt2 - pd.Timedelta(1, unit="days"),
                    self.frt3: self.frt3 - pd.Timedelta(1, unit="days"),
                })

        fp_int = pd.Timedelta(forecast_period, "s").total_seconds()
        self.expected_period_forecast.coord(
            "forecast_period").points = np.array(
                fp_int, dtype=TIME_COORDS["forecast_period"].dtype)
        self.expected_period_forecast.coord(
            "forecast_period").bounds = np.array(
                [fp_int - self.period.total_seconds(), fp_int],
                dtype=TIME_COORDS["forecast_period"].dtype,
            )

        result = forecast_and_truth_dataframes_to_cubes(
            forecast_df,
            self.truth_subset_df,
            self.cycletime,
            forecast_period,
            self.training_length,
        )
        self.assertEqual(len(result), 2)
        self.assertCubeEqual(result[0], self.expected_period_forecast)
        self.assertCubeEqual(result[1], self.expected_period_truth)
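
# A worked example of how a multi-day forecast period keeps validity times
# in the past: the CLI below ends the training window at
# cycletime - 1 day - floor(forecast_period, "D"). With a hypothetical
# cycletime of 2017-01-09 00:00Z and a 30 h forecast period:
import pandas as pd

cycletime = pd.Timestamp("2017-01-09T00:00Z")
fp = pd.Timedelta(30 * 3600, unit="s")
end = cycletime - pd.Timedelta(1, unit="days") - fp.floor("D")
# end == 2017-01-07 00:00Z, so the latest validity time is
# end + fp == 2017-01-08 06:00Z, safely before the cycletime.
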
    def test_duplicate_row_forecasts(self):
        """Test that a forecast cube is still produced if duplicated
        forecasts are provided."""
        # Use results from the first realization,
        # equivalent to the 50th percentile.
        expected_period_forecast = self.expected_period_forecast[1, :, :]
        expected_period_forecast.coord("realization").points = np.array(
            [0], np.int32)

        # Use 50th percentile only
        forecast_subset_df = self.forecast_df[self.forecast_df["percentile"] ==
                                              50.0]

        # Duplicate first row twice.
        forecast_df_with_duplicates = pd.concat(
            [
                forecast_subset_df,
                forecast_subset_df.iloc[[0]],
                forecast_subset_df.iloc[[0]],
            ],
            ignore_index=True,
        )
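        # Alter the original row and the first duplicate so that only the
        # values carried by the last duplicate can match the expected cube.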
        forecast_df_with_duplicates.at[0, "forecast"] = 6.0
        forecast_df_with_duplicates.at[9, "forecast"] = 8.0

        result = forecast_and_truth_dataframes_to_cubes(
            forecast_df_with_duplicates,
            self.truth_subset_df,
            self.cycletime,
            self.forecast_period,
            self.training_length,
        )

        self.assertEqual(len(result), 2)
        self.assertCubeEqual(result[0], expected_period_forecast)
        self.assertCubeEqual(result[1], self.expected_period_truth)
def process(
    forecast: cli.inputpath,
    truth: cli.inputpath,
    additional_predictors: cli.inputcubelist = None,
    *,
    diagnostic,
    cycletime,
    forecast_period,
    training_length,
    distribution,
    point_by_point=False,
    use_default_initial_guess=False,
    units=None,
    predictor="mean",
    tolerance: float = 0.02,
    max_iterations: int = 1000,
    percentiles: cli.comma_separated_list = None,
    experiment: str = None,
):
    """Estimate coefficients for Ensemble Model Output Statistics.

    Loads in arguments for estimating coefficients for Ensemble Model
    Output Statistics (EMOS), otherwise known as Non-homogeneous Gaussian
    Regression (NGR). Two sources of input data must be provided: historical
    forecasts and historical truth data (to use in calibration).
    The estimated coefficients are output as a cube.

    Args:
        forecast (pathlib.Path):
            The path to a Parquet file containing the historical forecasts
            to be used for calibration. The expected columns within the
            Parquet file are: forecast, blend_time, forecast_period,
            forecast_reference_time, time, wmo_id, percentile, diagnostic,
            latitude, longitude, period, height, cf_name, units.
        truth (pathlib.Path):
            The path to a Parquet file containing the truths to be used
            for calibration. The expected columns within the
            Parquet file are: ob_value, time, wmo_id, diagnostic, latitude,
            longitude and altitude.
        additional_predictors (iris.cube.CubeList):
            Cubes containing static additional predictors to be used, in
            addition to the forecast, when estimating the EMOS coefficients.
        diagnostic (str):
            The name of the diagnostic to be calibrated within the forecast
            and truth tables. This name is used to filter the Parquet file
            when reading from disk.
        cycletime (str):
            Cycletime of a format similar to 20170109T0000Z.
        forecast_period (int):
            Forecast period to be calibrated in seconds.
        training_length (int):
            Number of days within the training period.
        distribution (str):
            The distribution that will be used for minimising the
            Continuous Ranked Probability Score when estimating the EMOS
            coefficients. This will be dependent upon the input phenomenon.
        point_by_point (bool):
            If True, coefficients are calculated independently for each point
            within the input cube by creating an initial guess and minimising
            each grid point independently. If False, a single set of
            coefficients is calculated using all points.
            Warning: This option is memory intensive and is unsuitable for
            gridded input. Using the default initial guess may reduce the
            memory overhead.
        use_default_initial_guess (bool):
            If True, use the default initial guess. The default initial guess
            assumes no adjustments are required to the initial choice of
            predictor to generate the calibrated distribution. This means
            coefficients of 1 for the multiplicative coefficients and 0 for
            the additive coefficients. If False, the initial guess is computed.
        units (str):
            The units that calibration should be undertaken in. The historical
            forecast and truth will be converted as required.
        predictor (str):
            String to specify the form of the predictor used to calculate the
            location parameter when estimating the EMOS coefficients.
            Currently the ensemble mean ("mean") and the ensemble realizations
            ("realizations") are supported as options.
        tolerance (float):
            The tolerance for the Continuous Ranked Probability Score (CRPS)
            calculated by the minimisation. Once multiple iterations result in
            a CRPS equal to the same value within the specified tolerance, the
            minimisation will terminate.
        max_iterations (int):
            The maximum number of iterations allowed until the minimisation has
            converged to a stable solution. If the maximum number of iterations
            is reached but the minimisation has not yet converged to a stable
            solution, then the available solution is used anyway, and a warning
            is raised. If the predictor is "realizations", then the number of
            iterations may require increasing, as there will be more
            coefficients to solve.
        percentiles (List[float]):
            The set of percentiles to be used for estimating EMOS coefficients.
            These should be a set of equally spaced quantiles.
        experiment (str):
            A value within the experiment column to select from the forecast
            table.

    Returns:
        iris.cube.CubeList:
            CubeList containing the coefficients estimated using EMOS. Each
            coefficient is stored in a separate cube.
    """

    import iris
    import pandas as pd
    from iris.cube import CubeList

    from improver.calibration.dataframe_utilities import (
        forecast_and_truth_dataframes_to_cubes, )
    from improver.calibration.ensemble_calibration import (
        EstimateCoefficientsForEnsembleCalibration, )

    # Load forecasts from parquet file filtering by diagnostic and blend_time.
    forecast_period_td = pd.Timedelta(int(forecast_period), unit="seconds")
    # tz_localize(None) is used to facilitate filtering, although the dataframe
    # is expected to be timezone aware upon load.
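    # The final training cycletime is one day before the given cycletime,
    # pushed back by the whole days within the forecast period, so that all
    # validity times in the training window precede the cycletime.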
    cycletimes = pd.date_range(
        end=pd.Timestamp(cycletime) - pd.Timedelta(1, unit="days") -
        forecast_period_td.floor("D"),
        periods=int(training_length),
        freq="D",
    ).tz_localize(None)
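    # read_parquet filters follow pyarrow's disjunctive normal form: the
    # outer list ORs groups together and the inner list ANDs predicates.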
    filters = [[("diagnostic", "==", diagnostic),
                ("blend_time", "in", cycletimes)]]
    forecast_df = pd.read_parquet(forecast, filters=filters)

    # Load truths from parquet file filtering by diagnostic.
    filters = [[("diagnostic", "==", diagnostic)]]
    truth_df = pd.read_parquet(truth, filters=filters)
    if truth_df.empty:
        msg = (f"The requested filepath {truth} does not contain the "
               f"requested contents: {filters}")
        raise IOError(msg)

    forecast_cube, truth_cube = forecast_and_truth_dataframes_to_cubes(
        forecast_df,
        truth_df,
        cycletime,
        forecast_period,
        training_length,
        percentiles=percentiles,
        experiment=experiment,
    )

    if not forecast_cube or not truth_cube:
        return

    # Subset the additional predictors to the WMO IDs present in the
    # truth cube.
    if additional_predictors:
        constr = iris.Constraint(wmo_id=truth_cube.coord("wmo_id").points)
        additional_predictors = CubeList(
            [ap.extract(constr) for ap in additional_predictors])

    plugin = EstimateCoefficientsForEnsembleCalibration(
        distribution,
        point_by_point=point_by_point,
        use_default_initial_guess=use_default_initial_guess,
        desired_units=units,
        predictor=predictor,
        tolerance=tolerance,
        max_iterations=max_iterations,
    )
    return plugin(forecast_cube,
                  truth_cube,
                  additional_fields=additional_predictors)
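
# A hypothetical invocation of the CLI function above (paths and argument
# values are illustrative only, not taken from the improver test suite):
from pathlib import Path

coefficients = process(
    Path("forecast_table.parquet"),
    Path("truth_table.parquet"),
    diagnostic="air_temperature",
    cycletime="20170109T0000Z",
    forecast_period=6 * 3600,
    training_length=5,
    distribution="norm",
)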