def MASE(p_real, p_pred, p_real_in, m=None, freq='1H'):
    """Compute the mean absolute scaled error (MASE) of a forecast.

    The absolute forecast errors are divided by the in-sample :class:`MAE` of a
    naive forecast and then averaged:

    .. math::
        \\mathrm{MASE}_\\mathrm{m} = \\frac{1}{N}\\sum_{i=1}^N
        \\frac{\\bigl|p_\\mathrm{real}[i]-p_\\mathrm{pred}[i]\\bigr|}
        {\\mathrm{MAE}(p_\\mathrm{real\\_in}, p_\\mathrm{naive\\_in}, m)}.

    The denominator is the :class:`MAE` of a naive forecast ``p_naive_in`` that
    is built from the insample dataset ``p_real_in`` using the
    :class:`naive_forecast` function with a seasonality index ``m``.

    If the datasets provided are numpy.ndarray objects, the function requires a
    ``freq`` argument specifying the data frequency. The ``freq`` argument must
    take one of the following four values: ``'1H'`` for 1 hour, ``'30T'`` for
    30 minutes, ``'15T'`` for 15 minutes, or ``'5T'`` for 5 minutes (these are
    the four standard values in day-ahead electricity markets).

    Also, if the datasets provided are numpy.ndarray objects, ``m`` has to
    select an explicit daily or weekly seasonality, i.e. the
    :class:`naive_forecast` cannot be the standard one in electricity price
    forecasting, because the input data has no associated day of the week.

    ``p_real``, ``p_pred``, and ``p_real_in`` can either be of shape
    :math:`(n_\\mathrm{days}, n_\\mathrm{prices/day})`,
    :math:`(n_\\mathrm{prices}, 1)`, or :math:`(n_\\mathrm{prices}, )` where
    :math:`n_\\mathrm{prices} = n_\\mathrm{days} \\cdot n_\\mathrm{prices/day}`.

    Parameters
    ----------
    p_real : numpy.ndarray, pandas.DataFrame
        Array/dataframe containing the real prices.
    p_pred : numpy.ndarray, pandas.DataFrame
        Array/dataframe containing the predicted prices.
    p_real_in : numpy.ndarray, pandas.DataFrame
        Insample dataset that is used to build a :class:`naive_forecast` and
        compute its :class:`MAE`.
    m : str, optional
        Index that specifies the seasonality in the :class:`naive_forecast`
        used to compute the normalizing insample MAE. It can be ``'D'`` for
        daily seasonality, ``'W'`` for weekly seasonality, or None for the
        standard naive forecast in electricity price forecasting, i.e. daily
        seasonality for Tuesday to Friday and weekly seasonality for Saturday
        to Monday.
    freq : str, optional
        Frequency of the data if ``p_real``, ``p_pred``, and ``p_real_in`` are
        numpy.ndarray objects. It must take one of the following four values:
        ``'1H'`` for 1 hour, ``'30T'`` for 30 minutes, ``'15T'`` for
        15 minutes, or ``'5T'`` for 5 minutes (these are the four standard
        values in day-ahead electricity markets).

    Returns
    -------
    float
        The mean absolute scaled error (MASE).

    Example
    -------
    >>> from epftoolbox.evaluation import MASE
    >>> from epftoolbox.data import read_data
    >>> import pandas as pd
    >>>
    >>> # Download available forecast of the NP market available in the library repository
    >>> # These forecasts accompany the original paper
    >>> forecast = pd.read_csv('https://raw.githubusercontent.com/jeslago/epftoolbox/master/' +
    ...                        'forecasts/Forecasts_NP_DNN_LEAR_ensembles.csv', index_col=0)
    >>>
    >>> # Transforming indices to datetime format
    >>> forecast.index = pd.to_datetime(forecast.index)
    >>>
    >>> # Reading data from the NP market
    >>> df_train, df_test = read_data(path='.', dataset='NP', begin_test_date=forecast.index[0],
    ...                               end_test_date=forecast.index[-1])
    Test datasets: 2016-12-27 00:00:00 - 2018-12-24 23:00:00
    >>>
    >>> # Extracting forecast of DNN ensemble and display
    >>> fc_DNN_ensemble = forecast.loc[:, ['DNN Ensemble']]
    >>>
    >>> # Extracting real price and display
    >>> real_price = df_test.loc[:, ['Price']]
    >>> real_price_insample = df_train.loc[:, ['Price']]
    >>>
    >>> # Building the same datasets with shape (ndays, n_prices/day) instead
    >>> # of shape (nprices, 1) and display
    >>> fc_DNN_ensemble_2D = pd.DataFrame(fc_DNN_ensemble.values.reshape(-1, 24),
    ...                                   index=fc_DNN_ensemble.index[::24],
    ...                                   columns=['h' + str(hour) for hour in range(24)])
    >>> real_price_2D = pd.DataFrame(real_price.values.reshape(-1, 24),
    ...                              index=real_price.index[::24],
    ...                              columns=['h' + str(hour) for hour in range(24)])
    >>> real_price_insample_2D = pd.DataFrame(real_price_insample.values.reshape(-1, 24),
    ...                                       index=real_price_insample.index[::24],
    ...                                       columns=['h' + str(hour) for hour in range(24)])
    >>>
    >>> fc_DNN_ensemble_2D.head()
                       h0         h1         h2  ...        h21        h22        h23
    2016-12-27  24.349676  23.127774  22.208617  ...  27.686771  27.045763  25.724071
    2016-12-28  25.453866  24.707317  24.452384  ...  29.424558  28.627130  27.321902
    2016-12-29  28.209516  27.715400  27.182692  ...  28.473288  27.926241  27.153401
    2016-12-30  28.002935  27.467572  27.028558  ...  29.086532  28.518688  27.738548
    2016-12-31  25.732282  24.668331  23.951569  ...  26.965008  26.450995  25.637346
    >>>

    Let's test the metric for different conditions.

    >>> # Evaluating MASE when real price and forecasts are both dataframes
    >>> MASE(p_pred=fc_DNN_ensemble, p_real=real_price,
    ...      p_real_in=real_price_insample, m='W')
    0.5217886515713188
    >>>
    >>> # Evaluating MASE when real price and forecasts are both numpy arrays
    >>> MASE(p_pred=fc_DNN_ensemble.values, p_real=real_price.values,
    ...      p_real_in=real_price_insample.values, m='W', freq='1H')
    0.5217886515713188
    >>>
    >>> # Evaluating MASE when input values are of shape (ndays, n_prices/day) instead
    >>> # of shape (nprices, 1)
    >>> # Dataframes
    >>> MASE(p_pred=fc_DNN_ensemble_2D, p_real=real_price_2D,
    ...      p_real_in=real_price_insample_2D, m='W')
    0.5217886515713188
    >>> # Numpy arrays
    >>> MASE(p_pred=fc_DNN_ensemble_2D.values, p_real=real_price_2D.values,
    ...      p_real_in=real_price_insample_2D.values, m='W', freq='1H')
    0.5217886515713188
    >>>
    >>> # Evaluating MASE when input values are of shape (nprices,)
    >>> # instead of shape (nprices, 1)
    >>> # Pandas Series
    >>> MASE(p_pred=fc_DNN_ensemble.loc[:, 'DNN Ensemble'],
    ...      p_real=real_price.loc[:, 'Price'],
    ...      p_real_in=real_price_insample.loc[:, 'Price'], m='W')
    0.5217886515713188
    >>> # Numpy arrays
    >>> MASE(p_pred=fc_DNN_ensemble.values.squeeze(),
    ...      p_real=real_price.values.squeeze(),
    ...      p_real_in=real_price_insample.values.squeeze(), m='W', freq='1H')
    0.5217886515713188
    """

    # --- Scaling factor: insample MAE of the naive forecast ---
    # Bring the insample prices into the format the naive forecast expects
    insample_prices = _transform_input_prices_for_naive_forecast(p_real_in, m, freq)

    # Naive forecast built on the insample prices
    naive_prices = naive_forecast(insample_prices, m=m)

    # Keep only the insample prices whose time index the naive forecast covers
    insample_aligned = insample_prices.loc[naive_prices.index]
    naive_mae = MAE(insample_aligned, naive_prices)

    # --- Scaled error of the forecast under evaluation ---
    # Validate the standard inputs and bring them to a compatible format
    real, pred = _process_inputs_for_metrics(p_real, p_pred)
    absolute_errors = np.abs(real - pred)

    return np.mean(absolute_errors / naive_mae)
def evaluate_lear_in_test_dataset(path_datasets_folder=os.path.join('.', 'datasets'),
                                  path_recalibration_folder=os.path.join('.', 'experimental_files'),
                                  dataset='PJM', years_test=2, calibration_window=364 * 3,
                                  begin_test_date=None, end_test_date=None):
    """Function for easy evaluation of the LEAR model in a test dataset using daily recalibration.

    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a LEAR model considering
    daily recalibration.

    An example on how to use this function is provided :ref:`here<learex1>`.

    Parameters
    ----------
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet, the path
        where the datasets are to be stored.
    path_recalibration_folder : str, optional
        Path to save the files of the experiment dataset. The folder is created
        if it does not exist.
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard
        markets, i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the
        dataset is automatically downloaded. If the name is different, a dataset
        with a csv format should be placed in the ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used
        to build the test dataset if the arguments ``begin_test_date`` and
        ``end_test_date`` are not provided.
    calibration_window : int, optional
        Number of days used in the training dataset for recalibration.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with
        the argument ``end_test_date``. If either of them is not provided, the
        test dataset is built using the ``years_test`` argument.
        ``begin_test_date`` should either be a string with the following format
        ``"%d/%m/%Y %H:%M"``, or a datetime object.
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with
        the argument ``begin_test_date``. If either of them is not provided, the
        test dataset is built using the ``years_test`` argument.
        ``end_test_date`` should either be a string with the following format
        ``"%d/%m/%Y %H:%M"``, or a datetime object.

    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset, with one row
        per day and one column ``h0`` ... ``h23`` per hour. The dataframe is
        also written to ``path_recalibration_folder``.

    Notes
    -----
    The name of the csv file is built only from ``dataset``, ``years_test`` and
    ``calibration_window``; it does not encode ``begin_test_date`` or
    ``end_test_date``, so runs that differ only in those dates write to the
    same file.
    """

    # Make sure the output folder exists. ``exist_ok=True`` avoids the
    # check-then-create race of a separate ``os.path.exists`` test.
    os.makedirs(path_recalibration_folder, exist_ok=True)

    # Defining train and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test,
                                  path=path_datasets_folder,
                                  begin_test_date=begin_test_date,
                                  end_test_date=end_test_date)

    # Defining the name of the file where the forecast is saved
    forecast_file_name = 'LEAR_forecast_dat{}_YT{}_CW{}.csv'.format(
        dataset, years_test, calibration_window)
    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

    # Defining the empty forecast frame and the real values to be predicted in a
    # (n_days, 24) layout: one row per day, one column per hour
    forecast = pd.DataFrame(index=df_test.index[::24],
                            columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    forecast_dates = forecast.index

    model = LEAR(calibration_window=calibration_window)

    # Offset from the first to the last hour of a day (loop invariant)
    last_hour_offset = pd.Timedelta(hours=23)

    # For loop over the recalibration dates
    for date in forecast_dates:
        day_end = date + last_hour_offset

        # For simulation purposes, we assume that the available data is the data
        # up to the current date, where the prices of the current date are not known
        data_available = pd.concat([df_train, df_test.loc[:day_end, :]], axis=0)

        # We set the real prices for the current date to NaN in the dataframe of
        # available data. ``np.nan`` is used because the ``np.NaN`` alias was
        # removed in NumPy 2.0.
        data_available.loc[date:day_end, 'Price'] = np.nan

        # Recalibrating the model with the most up-to-date available data and
        # making a prediction for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date,
                                                     calibration_window=calibration_window)

        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        forecast_so_far = forecast.loc[:date].values.squeeze()
        real_so_far = real_values.loc[:date].values
        mae = np.mean(MAE(forecast_so_far, real_so_far))
        smape = np.mean(sMAPE(forecast_so_far, real_so_far)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}% | MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving the forecast after every day so partial results are on disk
        forecast.to_csv(forecast_file_path)

    return forecast
# Building the same datasets with shape (ndays, n_prices/day) # instead of shape (nprices, 1) and display fc_DNN_ensemble_2D = pd.DataFrame( fc_DNN_ensemble.values.reshape(-1, 24), index=fc_DNN_ensemble.index[::24], columns=['h' + str(hour) for hour in range(24)]) real_price_2D = pd.DataFrame(real_price.values.reshape(-1, 24), index=real_price.index[::24], columns=['h' + str(hour) for hour in range(24)]) fc_DNN_ensemble_2D.head() # According to the paper, the MAE of the DNN ensemble for the NP market is 1.667 # Let's test the metric for different conditions # Evaluating MAE when real price and forecasts are both dataframes MAE(p_pred=fc_DNN_ensemble, p_real=real_price) # Evaluating MAE when real price and forecasts are both numpy arrays MAE(p_pred=fc_DNN_ensemble.values, p_real=real_price.values) # Evaluating MAE when input values are of shape (ndays, n_prices/day) # instead of shape (nprices, 1) # Dataframes MAE(p_pred=fc_DNN_ensemble_2D, p_real=real_price_2D) # Numpy arrays MAE(p_pred=fc_DNN_ensemble_2D.values, p_real=real_price_2D.values) # Evaluating MAE when input values are of shape (nprices,) # instead of shape (nprices, 1) # Pandas Series MAE(p_pred=fc_DNN_ensemble.loc[:, 'DNN Ensemble'],