Esempio n. 1
0
def MASE(p_real, p_pred, p_real_in, m=None, freq='1H'):
    """Compute the mean absolute scaled error (MASE) of a forecast.

    The absolute errors between ``p_real`` and ``p_pred`` are scaled by the in-sample
    mean absolute error of a naive forecast and then averaged:

    .. math::
        \\mathrm{MASE}_\\mathrm{m} = \\frac{1}{N}\\sum_{i=1}^N
                         \\frac{\\bigl|p_\\mathrm{real}[i]-p_\\mathrm{pred}[i]\\bigr|}
                         {\\mathrm{MAE}(p_\\mathrm{real\\_in}, p_\\mathrm{naive\\_in}, m)}.

    The denominator is the :class:`MAE` of a naive forecast ``p_naive_in`` that is built from the
    insample dataset ``p_real_in`` with the :class:`naive_forecast` function and a seasonality
    index ``m``.

    If the datasets provided are numpy.ndarray objects, the function requires a ``freq`` argument
    specifying the data frequency. The ``freq`` argument must take one of the following four values:
    ``'1H'`` for 1 hour, ``'30T'`` for 30 minutes, ``'15T'`` for 15 minutes, or ``'5T'`` for
    5 minutes (these are the four standard values in day-ahead electricity markets).

    Also, if the datasets provided are numpy.ndarray objects, the :class:`naive_forecast` cannot be
    the standard one in electricity price forecasting (``m=None``) because the input data has no
    associated day of the week; an explicit seasonality such as ``'D'`` or ``'W'`` has to be given,
    as done in the examples below.

    ``p_real``, ``p_pred``, and ``p_real_in`` can either be of shape
    :math:`(n_\\mathrm{days}, n_\\mathrm{prices/day})`,
    :math:`(n_\\mathrm{prices}, 1)`, or :math:`(n_\\mathrm{prices}, )` where
    :math:`n_\\mathrm{prices} = n_\\mathrm{days} \\cdot n_\\mathrm{prices/day}`


    Parameters
    ----------
    p_real : numpy.ndarray, pandas.DataFrame
        Array/dataframe containing the real prices.
    p_pred : numpy.ndarray, pandas.DataFrame
        Array/dataframe containing the predicted prices.
    p_real_in : numpy.ndarray, pandas.DataFrame
        Insample dataset that is used to build a :class:`naive_forecast` and compute its :class:`MAE`.
    m : str or None, optional
        Index that specifies the seasonality in the :class:`naive_forecast` used to compute the
        normalizing insample MAE. It can be ``'D'`` for daily seasonality, ``'W'`` for weekly
        seasonality, or None for the standard naive forecast in electricity price forecasting,
        i.e. daily seasonality for Tuesday to Friday and weekly seasonality
        for Saturday to Monday.
    freq : str, optional
        Frequency of the data if ``p_real``, ``p_pred``, and ``p_real_in`` are numpy.ndarray objects.
        It must take one of the following four values: ``'1H'`` for 1 hour, ``'30T'`` for 30 minutes,
        ``'15T'`` for 15 minutes, or ``'5T'`` for 5 minutes (these are the four standard values in
        day-ahead electricity markets).

    Returns
    -------
    float
        The mean absolute scaled error (MASE).

    Example
    -------
    >>> from epftoolbox.evaluation import MASE
    >>> from epftoolbox.data import read_data
    >>> import pandas as pd
    >>> 
    >>> # Download available forecast of the NP market available in the library repository
    >>> # These forecasts accompany the original paper
    >>> forecast = pd.read_csv('https://raw.githubusercontent.com/jeslago/epftoolbox/master/' + 
    ...                       'forecasts/Forecasts_NP_DNN_LEAR_ensembles.csv', index_col=0)
    >>> 
    >>> # Transforming indices to datetime format
    >>> forecast.index = pd.to_datetime(forecast.index)
    >>> 
    >>> # Reading data from the NP market
    >>> df_train, df_test = read_data(path='.', dataset='NP', begin_test_date=forecast.index[0], 
    ...                        end_test_date=forecast.index[-1])
    Test datasets: 2016-12-27 00:00:00 - 2018-12-24 23:00:00
    >>> 
    >>> # Extracting forecast of DNN ensemble and display
    >>> fc_DNN_ensemble = forecast.loc[:, ['DNN Ensemble']]
    >>> 
    >>> # Extracting real price and display
    >>> real_price = df_test.loc[:, ['Price']]
    >>> real_price_insample = df_train.loc[:, ['Price']]
    >>> 
    >>> # Building the same datasets with shape (ndays, n_prices/day) instead 
    >>> # of shape (nprices, 1) and display
    >>> fc_DNN_ensemble_2D = pd.DataFrame(fc_DNN_ensemble.values.reshape(-1, 24), 
    ...                                   index=fc_DNN_ensemble.index[::24], 
    ...                                   columns=['h' + str(hour) for hour in range(24)])
    >>> real_price_2D = pd.DataFrame(real_price.values.reshape(-1, 24), 
    ...                              index=real_price.index[::24], 
    ...                              columns=['h' + str(hour) for hour in range(24)])
    >>> real_price_insample_2D = pd.DataFrame(real_price_insample.values.reshape(-1, 24), 
    ...                              index=real_price_insample.index[::24], 
    ...                              columns=['h' + str(hour) for hour in range(24)])
    >>> 
    >>> fc_DNN_ensemble_2D.head()
                       h0         h1         h2  ...        h21        h22        h23
    2016-12-27  24.349676  23.127774  22.208617  ...  27.686771  27.045763  25.724071
    2016-12-28  25.453866  24.707317  24.452384  ...  29.424558  28.627130  27.321902
    2016-12-29  28.209516  27.715400  27.182692  ...  28.473288  27.926241  27.153401
    2016-12-30  28.002935  27.467572  27.028558  ...  29.086532  28.518688  27.738548
    2016-12-31  25.732282  24.668331  23.951569  ...  26.965008  26.450995  25.637346
    >>> 

    Let's test the metric for different conditions.

    >>> # Evaluating MASE when real price and forecasts are both dataframes
    >>> MASE(p_pred=fc_DNN_ensemble, p_real=real_price, 
    ...      p_real_in=real_price_insample, m='W')
    0.5217886515713188
    >>> 
    >>> # Evaluating MASE when real price and forecasts are both numpy arrays
    >>> MASE(p_pred=fc_DNN_ensemble.values, p_real=real_price.values, 
    ...      p_real_in=real_price_insample.values, m='W', freq='1H')
    0.5217886515713188
    >>> 
    >>> # Evaluating MASE when input values are of shape (ndays, n_prices/day) instead 
    >>> # of shape (nprices, 1)
    >>> # Dataframes
    >>> MASE(p_pred=fc_DNN_ensemble_2D, p_real=real_price_2D, 
    ...      p_real_in=real_price_insample_2D, m='W')
    0.5217886515713188
    >>> # Numpy arrays
    >>> MASE(p_pred=fc_DNN_ensemble_2D.values, p_real=real_price_2D.values, 
    ...      p_real_in=real_price_insample_2D.values, m='W', freq='1H')
    0.5217886515713188
    >>> 
    >>> # Evaluating MASE when input values are of shape (nprices,) 
    >>> # instead of shape (nprices, 1)
    >>> # Pandas Series
    >>> MASE(p_pred=fc_DNN_ensemble.loc[:, 'DNN Ensemble'], 
    ...      p_real=real_price.loc[:, 'Price'],
    ...      p_real_in=real_price_insample.loc[:, 'Price'], m='W')
    0.5217886515713188
    >>> # Numpy arrays
    >>> MASE(p_pred=fc_DNN_ensemble.values.squeeze(), 
    ...      p_real=real_price.values.squeeze(), 
    ...      p_real_in=real_price_insample.values.squeeze(), m='W', freq='1H')
    0.5217886515713188

    """

    # --- Scaling factor: in-sample MAE of the naive forecast ---
    # Bring the in-sample prices into the format expected by the naive forecast
    insample_prices = _transform_input_prices_for_naive_forecast(p_real_in, m, freq)

    # Naive benchmark built on the in-sample prices with the requested seasonality
    naive_prediction = naive_forecast(insample_prices, m=m)

    # Restrict the in-sample prices to the time indices covered by the naive forecast
    insample_prices = insample_prices.loc[naive_prediction.index]
    naive_mae = MAE(insample_prices, naive_prediction)

    # --- Scaled error of the forecast under evaluation ---
    # Check that the real and predicted prices are compatible with each other
    p_real, p_pred = _process_inputs_for_metrics(p_real, p_pred)

    absolute_errors = np.abs(p_real - p_pred)
    return np.mean(absolute_errors / naive_mae)
Esempio n. 2
0
def evaluate_lear_in_test_dataset(path_datasets_folder=os.path.join('.', 'datasets'),
                                  path_recalibration_folder=os.path.join('.', 'experimental_files'),
                                  dataset='PJM', years_test=2, calibration_window=364 * 3,
                                  begin_test_date=None, end_test_date=None):
    """Function for easy evaluation of the LEAR model in a test dataset using daily recalibration.

    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a LEAR model considering daily
    recalibration. After every forecasted day the running sMAPE and MAE are printed and the
    forecasts obtained so far are written to a csv file in ``path_recalibration_folder``.

    The implementation slices the test data in blocks of 24 prices, i.e. it assumes hourly data.

    An example on how to use this function is provided :ref:`here<learex1>`.

    Parameters
    ----------
    path_datasets_folder : str, optional
        path where the datasets are stored or, if they do not exist yet,
        the path where the datasets are to be stored.

    path_recalibration_folder : str, optional
        path to save the files of the experiment dataset. The folder is created if it
        does not exist.

    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets,
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically
        downloaded. If the name is different, a dataset with a csv format should be placed in
        the ``path_datasets_folder``.

    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.

    calibration_window : int, optional
        Number of days used in the training dataset for recalibration.

    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``end_test_date``. If either of them is not provided, the test dataset is built using the
        ``years_test`` argument. ``begin_test_date`` should either be a string with the following
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.

    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``begin_test_date``. If either of them is not provided, the test dataset is built using the
        ``years_test`` argument. ``end_test_date`` should either be a string with the following
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.

    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset (one row per day, one column
        ``h0`` ... ``h23`` per price). The dataframe is also written to
        ``path_recalibration_folder``.
    """

    # Create the recalibration folder if needed; exist_ok avoids a separate
    # existence check (and the race between the check and the creation)
    os.makedirs(path_recalibration_folder, exist_ok=True)

    # Defining train and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test, path=path_datasets_folder,
                                  begin_test_date=begin_test_date, end_test_date=end_test_date)

    # Defining unique name to save the forecast
    forecast_file_name = 'LEAR_forecast' + '_dat' + str(dataset) + '_YT' + str(years_test) + \
                         '_CW' + str(calibration_window) + '.csv'
    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

    # Defining empty forecast array and the real values to be predicted in a more friendly format:
    # one row per day (every 24th timestamp) and one column per price of the day
    forecast = pd.DataFrame(index=df_test.index[::24], columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    forecast_dates = forecast.index

    model = LEAR(calibration_window=calibration_window)

    # For loop over the recalibration dates
    for date in forecast_dates:

        # Last timestamp of the day being forecasted
        end_of_day = date + pd.Timedelta(hours=23)

        # For simulation purposes, we assume that the available data is
        # the data up to current date where the prices of current date are not known.
        # pd.concat builds a new dataframe, so masking below does not alter df_test.
        data_available = pd.concat([df_train, df_test.loc[:end_of_day, :]], axis=0)

        # We set the real prices for current date to NaN in the dataframe of available data
        # (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
        data_available.loc[date:end_of_day, 'Price'] = np.nan

        # Recalibrating the model with the most up-to-date available data and making a prediction
        # for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date,
                                                     calibration_window=calibration_window)
        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        mae = np.mean(MAE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values))
        smape = np.mean(sMAPE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving forecast after every day so partial results survive an interrupted run
        forecast.to_csv(forecast_file_path)

    return forecast
Esempio n. 3
0
# Reshape the forecast and the real prices from shape (nprices, 1) to shape
# (ndays, n_prices/day): one row per day and columns h0 ... h23, then display
fc_DNN_ensemble_2D = pd.DataFrame(
    fc_DNN_ensemble.values.reshape(-1, 24),
    columns=['h{}'.format(hour) for hour in range(24)],
    index=fc_DNN_ensemble.index[::24],
)
real_price_2D = pd.DataFrame(
    real_price.values.reshape(-1, 24),
    columns=['h{}'.format(hour) for hour in range(24)],
    index=real_price.index[::24],
)
fc_DNN_ensemble_2D.head()

# According to the paper, the MAE of the DNN ensemble for the NP market is 1.667
# The calls below evaluate the metric for several input types and shapes

# Case 1: real prices and forecasts given as dataframes of shape (nprices, 1)
MAE(p_real=real_price, p_pred=fc_DNN_ensemble)

# Case 2: real prices and forecasts given as numpy arrays of shape (nprices, 1)
MAE(p_real=real_price.values, p_pred=fc_DNN_ensemble.values)

# Case 3: inputs of shape (ndays, n_prices/day) instead of shape (nprices, 1)
# 3a) dataframes
MAE(p_real=real_price_2D, p_pred=fc_DNN_ensemble_2D)
# 3b) numpy arrays
MAE(p_real=real_price_2D.values, p_pred=fc_DNN_ensemble_2D.values)

# Evaluating MAE when input values are of shape (nprices,)
# instead of shape (nprices, 1)
# Pandas Series
MAE(p_pred=fc_DNN_ensemble.loc[:, 'DNN Ensemble'],