from epftoolbox.evaluation import rMAE
from epftoolbox.data import read_data
import pandas as pd


def _to_daily_frame(hourly_frame):
    """Reshape a single-column hourly frame into rows of 24 values.

    The result is labelled with every 24th entry of the input index and
    its columns are named ``h0`` ... ``h23``.
    """
    return pd.DataFrame(hourly_frame.values.reshape(-1, 24),
                        index=hourly_frame.index[::24],
                        columns=['h' + str(hour) for hour in range(24)])


# Fetch the NP-market forecasts published in the library repository
# (per the original comment, these forecasts accompany the original paper)
forecast = pd.read_csv(
    'https://raw.githubusercontent.com/jeslago/epftoolbox/master/' +
    'forecasts/Forecasts_NP_DNN_LEAR_ensembles.csv',
    index_col=0,
)

# The CSV index is read as text; convert it to datetimes
forecast.index = pd.to_datetime(forecast.index)

# Load the NP dataset restricted to the period covered by the forecasts;
# only the test split is needed here
_, df_test = read_data(
    path='.',
    dataset='NP',
    begin_test_date=forecast.index[0],
    end_test_date=forecast.index[-1],
)

# Single-column frames: the DNN ensemble forecast and the real prices
fc_DNN_ensemble = forecast.loc[:, ['DNN Ensemble']]
real_price = df_test.loc[:, ['Price']]

# Same data arranged as (ndays, n_prices/day) instead of (nprices, 1)
fc_DNN_ensemble_2D = _to_daily_frame(fc_DNN_ensemble)
real_price_2D = _to_daily_frame(real_price)

fc_DNN_ensemble_2D.head()
# Remaining experiment options taken from ``args``
# (presumably an argparse namespace built earlier in the script -- confirm)
shuffle_train = args.shuffle_train
data_augmentation = args.data_augmentation
new_recalibration = args.new_recalibration
calibration_window = args.calibration_window
experiment_id = args.experiment_id
begin_test_date = args.begin_test_date
end_test_date = args.end_test_date

# Datasets live in ./datasets; recalibration output and hyperparameter files
# share the same ./experimental_files folder
path_datasets_folder = os.path.join('.', 'datasets')
path_recalibration_folder = os.path.join('.', 'experimental_files')
path_hyperparameter_folder = os.path.join('.', 'experimental_files')

# Split the dataset into training and test parts
df_train, df_test = read_data(
    dataset=dataset,
    years_test=years_test,
    path=path_datasets_folder,
    begin_test_date=begin_test_date,
    end_test_date=end_test_date,
)

# Unique file name for the forecast, assembled from the experiment settings.
# '_DA' * data_augmentation adds the tag once per unit of the flag, so it is
# absent when the flag is 0/False.
forecast_file_name = ''.join([
    'fc_nl', str(nlayers),
    '_dat', str(dataset),
    '_YT', str(years_test),
    '_SF', str(shuffle_train),
    '_DA' * data_augmentation,
    '_CW', str(calibration_window),
    '_', str(experiment_id),
    '.csv',
])
forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

# Empty forecast table: one row per 24th test timestamp, one column per hour
hour_labels = ['h' + str(k) for k in range(24)]
forecast = pd.DataFrame(index=df_test.index[::24], columns=hour_labels)
def hyperparameter_optimizer(path_datasets_folder=os.path.join('.', 'datasets'),
                             path_hyperparameters_folder=os.path.join('.', 'experimental_files'),
                             new_hyperopt=1, max_evals=1500, nlayers=2, dataset='PJM',
                             years_test=2, calibration_window=4, shuffle_train=1,
                             data_augmentation=0, experiment_id=None, begin_test_date=None,
                             end_test_date=None):
    """Function to optimize the hyperparameters and input features of the DNN.

    An example on how to use this function is provided :ref:`here<dnnex1>`.

    The function does not return anything: the result of ``fmin`` is discarded and
    the optimization state lives in the ``trials`` object, which is handed to the
    objective function together with ``trials_file_path``.

    Parameters
    ----------
    path_datasets_folder : str, optional
        Path to read and store datasets.
    path_hyperparameters_folder : str, optional
        Path to read and store trials files from hyperopt. The folder is created
        if it does not exist.
    new_hyperopt : bool, optional
        Boolean that decides whether to start a new hyperparameter optimization or
        re-start an existing one. When re-starting, the trials file is loaded from
        ``path_hyperparameters_folder``.
    max_evals : int, optional
        Maximum number of iterations for hyperopt.
    nlayers : int, optional
        Number of layers of the DNN model.
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets,
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is
        automatically downloaded. If the name is different, a dataset with a csv format
        should be placed in the ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    calibration_window : int, optional
        Calibration window used for training the models.
    shuffle_train : bool, optional
        Boolean that selects whether the validation and training datasets are shuffled.
        Based on empirical results, this configuration does not play a role when
        selecting the hyperparameters and features. However, it is important when
        recalibrating the DNN model.
    data_augmentation : bool, optional
        Boolean that selects whether a data augmentation technique for DNNs is used.
        Based on empirical results, for some markets data augmentation might improve
        forecasting accuracy at the expense of higher computational costs.
    experiment_id : str, optional
        Unique identifier to save/read the trials file. If not provided, the current
        date and time (``"%d-%m-%Y_%H:%M:%S"``) is used as identifier.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the
        argument ``end_test_date``. If either of them is not provided, the test dataset
        is built using the ``years_test`` argument. ``begin_test_date`` should either be
        a string with the following format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the
        argument ``begin_test_date``. If either of them is not provided, the test dataset
        is built using the ``years_test`` argument. ``end_test_date`` should either be
        a string with the following format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    """

    # Create the hyperparameter directory if needed; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test
    os.makedirs(path_hyperparameters_folder, exist_ok=True)

    # NOTE(review): the default identifier contains ':' characters, which are not
    # valid in file names on Windows -- pass an explicit experiment_id there.
    if experiment_id is None:
        experiment_id = datetime.now().strftime("%d-%m-%Y_%H:%M:%S")

    # Defining unique trials file name (this is an unique identifier).
    # '_SF' and '_DA' are repeated by their flag value, so each tag is only
    # present when the corresponding flag is 1/True.
    trials_file_name = 'DNN_hyperparameters_nl' + str(nlayers) + '_dat' + str(dataset) + \
                       '_YT' + str(years_test) + '_SF' * (shuffle_train) + \
                       '_DA' * (data_augmentation) + '_CW' + str(calibration_window) + \
                       '_' + str(experiment_id)

    trials_file_path = os.path.join(path_hyperparameters_folder, trials_file_name)

    # If hyperparameter optimization starts from scratch, new trials object is created.
    # If not, we read the existing trials object
    if new_hyperopt:
        trials = Trials()
    else:
        # NOTE(review): pc is presumably the pickle module; unpickling executes
        # arbitrary code, so only load trials files from a trusted location.
        with open(trials_file_path, "rb") as trials_file:
            trials = pc.load(trials_file)

    # Generate training and test datasets
    dfTrain, dfTest = read_data(dataset=dataset, years_test=years_test,
                                path=path_datasets_folder, begin_test_date=begin_test_date,
                                end_test_date=end_test_date)

    # Every column except one counts as an exogenous input
    # (presumably the excluded column is the price -- confirm against read_data)
    n_exogenous_inputs = len(dfTrain.columns) - 1

    # Build hyperparameter search space. This includes hyperparameters and features
    space = _build_space(nlayers, data_augmentation, n_exogenous_inputs)

    # Perform hyperparameter optimization: bind the fixed arguments so that
    # hyperopt only has to supply the sampled point of the search space
    fmin_objective = partial(_hyperopt_objective, trials=trials,
                             trials_file_path=trials_file_path,
                             max_evals=max_evals, nlayers=nlayers, dfTrain=dfTrain,
                             dfTest=dfTest, shuffle_train=shuffle_train, dataset=dataset,
                             data_augmentation=data_augmentation,
                             calibration_window=calibration_window,
                             n_exogenous_inputs=n_exogenous_inputs)

    fmin(fmin_objective, space=space, algo=tpe.suggest, max_evals=max_evals,
         trials=trials, verbose=False)
def evaluate_dnn_in_test_dataset(
        experiment_id, path_datasets_folder=os.path.join('.', 'datasets'),
        path_hyperparameter_folder=os.path.join('.', 'experimental_files'),
        path_recalibration_folder=os.path.join('.', 'experimental_files'),
        nlayers=2, dataset='PJM', years_test=2, shuffle_train=True, data_augmentation=0,
        calibration_window=4, new_recalibration=False, begin_test_date=None,
        end_test_date=None):
    """Function for easy evaluation of the DNN model in a test dataset using daily recalibration.

    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a DNN model considering daily
    recalibration and an optimal set of hyperparameters.

    Note that before using this function, a hyperparameter optimization run must be done using
    the :func:`hyperparameter_optimizer` function. Moreover, the hyperparameter optimization
    must be done using the same parameters: ``nlayers``, ``dataset``, ``shuffle_train``,
    ``data_augmentation``, ``calibration_window``, and either the ``years_test`` or the same
    ``begin_test_date``/``end_test_date``.

    An example on how to use this function is provided :ref:`here<dnnex2>`.

    Parameters
    ----------
    experiment_id : str
        Unique identifier to read the trials file. In particular, every hyperparameter
        optimization set has a unique identifier associated with it. See
        :func:`hyperparameter_optimizer` for further details.
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet, the path where
        the datasets are to be stored.
    path_hyperparameter_folder : str, optional
        Path of the folder containing the trials file with the optimal hyperparameters.
    path_recalibration_folder : str, optional
        Path to save the forecast of the test dataset. The folder is created if it does
        not exist.
    nlayers : int, optional
        Number of hidden layers in the neural network.
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets,
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is
        automatically downloaded. If the name is different, a dataset with a csv format
        should be placed in the ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the
        argument ``end_test_date``. If either of them is not provided, the test dataset
        is built using the ``years_test`` argument. ``begin_test_date`` should either be
        a string with the following format ``d/m/Y H:M``, or a datetime object.
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the
        argument ``begin_test_date``. If either of them is not provided, the test dataset
        is built using the ``years_test`` argument. ``end_test_date`` should either be
        a string with the following format ``d/m/Y H:M``, or a datetime object.
    shuffle_train : bool, optional
        Boolean that selects whether the validation and training datasets were shuffled
        when performing the hyperparameter optimization. Note that it does not select
        whether shuffling is used for recalibration as for recalibration the validation
        and the training datasets are always shuffled.
    data_augmentation : bool, optional
        Boolean that selects whether a data augmentation technique for electricity price
        forecasting is employed.
    calibration_window : int, optional
        Number of days used in the training/validation dataset for recalibration.
    new_recalibration : bool, optional
        Boolean that selects whether a new recalibration is performed or the function
        re-starts an old one. To restart an old one, the .csv file with the forecast must
        exist in the ``path_recalibration_folder`` folder.

    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to the folder ``path_recalibration_folder``.
    """

    # Create the recalibration directory if needed; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test
    os.makedirs(path_recalibration_folder, exist_ok=True)

    # Defining train and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test,
                                  path=path_datasets_folder, begin_test_date=begin_test_date,
                                  end_test_date=end_test_date)

    # Defining unique name to save the forecast.
    # '_DA' is repeated by the flag value, so it only appears when the flag is 1/True.
    forecast_file_name = 'DNN_forecast_nl' + str(nlayers) + '_dat' + str(dataset) + \
                         '_YT' + str(years_test) + '_SFH' + str(shuffle_train) + \
                         '_DA' * data_augmentation + '_CW' + str(calibration_window) + \
                         '_' + str(experiment_id) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

    # Defining empty forecast array and the real values to be predicted in a more
    # friendly format: one row per 24 test samples, one column per hour (h0..h23)
    forecast = pd.DataFrame(index=df_test.index[::24],
                            columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    # If we are not starting a new recalibration but re-starting an old one, we import the
    # existing files and print metrics
    if not new_recalibration:
        # Import existing forecasting file
        forecast = pd.read_csv(forecast_file_path, index_col=0)
        forecast.index = pd.to_datetime(forecast.index)

        # Reading dates to still be forecasted by checking NaN values
        forecast_dates = forecast[forecast.isna().any(axis=1)].index

        # If all the dates to be forecasted have already been forecast, we print the
        # final metrics and return the stored forecast right away: there is nothing
        # left to recalibrate, so the model does not need to be built.
        if len(forecast_dates) == 0:
            mae = np.mean(MAE(forecast.values.squeeze(), real_values.values))
            smape = np.mean(sMAPE(forecast.values.squeeze(), real_values.values)) * 100
            print('{} - sMAPE: {:.2f}% | MAE: {:.3f}'.format('Final metrics', smape, mae))
            return forecast
    else:
        forecast_dates = forecast.index

    model = DNN(experiment_id=experiment_id,
                path_hyperparameter_folder=path_hyperparameter_folder, nlayers=nlayers,
                dataset=dataset, years_test=years_test, shuffle_train=shuffle_train,
                data_augmentation=data_augmentation, calibration_window=calibration_window)

    # For loop over the recalibration dates
    for date in forecast_dates:

        # For simulation purposes, we assume that the available data is
        # the data up to current date where the prices of current date are not known
        data_available = pd.concat(
            [df_train, df_test.loc[:date + pd.Timedelta(hours=23), :]], axis=0)

        # We set the real prices for current date to NaN in the dataframe of available data.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        data_available.loc[date:date + pd.Timedelta(hours=23), 'Price'] = np.nan

        # Recalibrating the model with the most up-to-date available data and making a
        # prediction for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date)

        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        mae = np.mean(
            MAE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values))
        smape = np.mean(
            sMAPE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}% | MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving forecast after every day so an interrupted run can be re-started
        forecast.to_csv(forecast_file_path)

    return forecast
def evaluate_lear_in_test_dataset(path_datasets_folder=os.path.join('.', 'datasets'),
                                  path_recalibration_folder=os.path.join('.', 'experimental_files'),
                                  dataset='PJM', years_test=2, calibration_window=364 * 3,
                                  begin_test_date=None, end_test_date=None):
    """Function for easy evaluation of the LEAR model in a test dataset using daily recalibration.

    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a LEAR model considering daily
    recalibration.

    An example on how to use this function is provided :ref:`here<learex1>`.

    Parameters
    ----------
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet,
        the path where the datasets are to be stored.
    path_recalibration_folder : str, optional
        Path to save the files of the experiment dataset. The folder is created if it
        does not exist.
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets,
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is
        automatically downloaded. If the name is different, a dataset with a csv format
        should be placed in the ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    calibration_window : int, optional
        Number of days used in the training dataset for recalibration.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the
        argument ``end_test_date``. If either of them is not provided, the test dataset
        is built using the ``years_test`` argument. ``begin_test_date`` should either be
        a string with the following format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the
        argument ``begin_test_date``. If either of them is not provided, the test dataset
        is built using the ``years_test`` argument. ``end_test_date`` should either be
        a string with the following format ``"%d/%m/%Y %H:%M"``, or a datetime object.

    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to ``path_recalibration_folder``.
    """

    # Create the recalibration directory if needed; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test
    os.makedirs(path_recalibration_folder, exist_ok=True)

    # Defining train and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test,
                                  path=path_datasets_folder, begin_test_date=begin_test_date,
                                  end_test_date=end_test_date)

    # Defining unique name to save the forecast
    forecast_file_name = 'LEAR_forecast' + '_dat' + str(dataset) + '_YT' + str(years_test) + \
                         '_CW' + str(calibration_window) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

    # Defining empty forecast array and the real values to be predicted in a more
    # friendly format: one row per 24 test samples, one column per hour (h0..h23)
    forecast = pd.DataFrame(index=df_test.index[::24],
                            columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    forecast_dates = forecast.index

    model = LEAR(calibration_window=calibration_window)

    # For loop over the recalibration dates
    for date in forecast_dates:

        # For simulation purposes, we assume that the available data is
        # the data up to current date where the prices of current date are not known
        data_available = pd.concat(
            [df_train, df_test.loc[:date + pd.Timedelta(hours=23), :]], axis=0)

        # We set the real prices for current date to NaN in the dataframe of available data.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        data_available.loc[date:date + pd.Timedelta(hours=23), 'Price'] = np.nan

        # Recalibrating the model with the most up-to-date available data and making a
        # prediction for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date,
                                                     calibration_window=calibration_window)

        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        mae = np.mean(MAE(forecast.loc[:date].values.squeeze(),
                          real_values.loc[:date].values))
        smape = np.mean(sMAPE(forecast.loc[:date].values.squeeze(),
                              real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}% | MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving forecast after every day so partial results are kept on disk
        forecast.to_csv(forecast_file_path)

    return forecast