Python read_data Exemples, epftoolbox.data.read_data Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : using_rmae.py Projet : lsikora1/epftoolbox

from epftoolbox.evaluation import rMAE
from epftoolbox.data import read_data
import pandas as pd

# Fetch the NP-market forecasts published in the library repository
# (these forecasts accompany the original paper).
forecast = pd.read_csv(
    'https://raw.githubusercontent.com/jeslago/epftoolbox/master/'
    'forecasts/Forecasts_NP_DNN_LEAR_ensembles.csv',
    index_col=0,
)

# Convert the index to datetime format
forecast.index = pd.to_datetime(forecast.index)

# Load the NP market data, using the first and last forecast timestamps
# as the bounds of the test period (the training part is discarded).
test_start, test_end = forecast.index[0], forecast.index[-1]
_, df_test = read_data(path='.', dataset='NP',
                       begin_test_date=test_start, end_test_date=test_end)

# Single-column frames: the DNN ensemble forecast and the real price
fc_DNN_ensemble = forecast.loc[:, ['DNN Ensemble']]
real_price = df_test.loc[:, ['Price']]

# Rebuild both series as (ndays, n_prices/day) tables instead of
# (nprices, 1): one row per day, one column per hour.
hour_labels = ['h{}'.format(hour) for hour in range(24)]

fc_DNN_ensemble_2D = pd.DataFrame(
    fc_DNN_ensemble.values.reshape(-1, 24),
    index=fc_DNN_ensemble.index[::24],
    columns=hour_labels,
)
real_price_2D = pd.DataFrame(
    real_price.values.reshape(-1, 24),
    index=real_price.index[::24],
    columns=hour_labels,
)
fc_DNN_ensemble_2D.head()

Exemple #2

0

Afficher le fichier

# Copy the remaining run options from the parsed arguments
shuffle_train, data_augmentation = args.shuffle_train, args.data_augmentation
new_recalibration, calibration_window = args.new_recalibration, args.calibration_window
experiment_id = args.experiment_id
begin_test_date, end_test_date = args.begin_test_date, args.end_test_date

# Datasets live in ./datasets; recalibration output and hyperparameter
# files share the ./experimental_files folder.
path_datasets_folder = os.path.join('.', 'datasets')
path_recalibration_folder = os.path.join('.', 'experimental_files')
path_hyperparameter_folder = os.path.join('.', 'experimental_files')

# Split the selected dataset into training and test parts
df_train, df_test = read_data(
    dataset=dataset,
    years_test=years_test,
    path=path_datasets_folder,
    begin_test_date=begin_test_date,
    end_test_date=end_test_date,
)

# Unique forecast file name built from the run configuration. The '_DA'
# tag is repeated `data_augmentation` times, so it vanishes when that is 0.
forecast_file_name = ''.join([
    'fc_nl', str(nlayers),
    '_dat', str(dataset),
    '_YT', str(years_test),
    '_SF', str(shuffle_train),
    '_DA' * data_augmentation,
    '_CW', str(calibration_window),
    '_', str(experiment_id),
    '.csv',
])

forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

# Empty forecast table in a friendlier layout: one row per test day
# (every 24th timestamp) and one column per hour.
hourly_columns = ['h' + str(k) for k in range(24)]
forecast = pd.DataFrame(index=df_test.index[::24], columns=hourly_columns)

Exemple #3

0

Afficher le fichier

def hyperparameter_optimizer(path_datasets_folder=os.path.join('.', 'datasets'), 
                             path_hyperparameters_folder=os.path.join('.', 'experimental_files'), 
                             new_hyperopt=1, max_evals=1500, nlayers=2, dataset='PJM', years_test=2, 
                             calibration_window=4, shuffle_train=1, data_augmentation=0,
                             experiment_id=None, begin_test_date=None, end_test_date=None):
    
    """Function to optimize the hyperparameters and input features of the DNN. An example on how to 
    use this function is provided :ref:`here<dnnex1>`.

    The function builds a trials file path from the run configuration, creates or reloads a
    hyperopt ``Trials`` object, reads the dataset, and runs ``fmin`` with the TPE algorithm.
    Nothing is returned; the result of ``fmin`` is discarded.
    
    Parameters
    ----------
    path_datasets_folder : str, optional
        Path to read and store datasets.
    
    path_hyperparameters_folder : str, optional
        Path to read and store trials files from hyperopt. The folder is created if it
        does not exist.
    
    new_hyperopt : bool, optional
        Boolean that decides whether to start a new hyperparameter optimization or re-start an
        existing one. When re-starting, the trials file must already exist in
        ``path_hyperparameters_folder``.
    
    max_evals : int, optional
        Maximum number of iterations for hyperopt.
    
    nlayers : int, optional
        Number of layers of the DNN model.
    
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets, 
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically
        downloaded. If the name is different, a dataset with a csv format should be placed in
        the ``path_datasets_folder``.
    
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if 
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    
    calibration_window : int, optional
        Calibration window used for training the models.
    
    shuffle_train : bool, optional
        Boolean that selects whether the validation and training datasets
        are shuffled. Based on empirical results, this configuration does not play a role
        when selecting the hyperparameters and features. However, it is important when recalibrating
        the DNN model.
    
    data_augmentation : bool, optional
        Boolean that selects whether a data augmentation technique 
        for DNNs is used. Based on empirical results, for some markets data augmentation might
        improve forecasting accuracy at the expense of higher computational costs.
    
    experiment_id : str, optional
        Unique identifier to save/read the trials file. If not
        provided, the current date is used as identifier.
    
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``end_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``begin_test_date`` should either be a string with the following 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``begin_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``end_test_date`` should either be a string with the following 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.       
    
    """

    # Create the hyperparameter directory if needed. exist_ok avoids the race between a
    # separate existence check and the creation call.
    os.makedirs(path_hyperparameters_folder, exist_ok=True)

    # Default identifier: current date and time
    if experiment_id is None:
        experiment_id = datetime.now().strftime("%d-%m-%Y_%H:%M:%S")

    # Defining unique trials file name (this is a unique identifier). The '_SF' and '_DA'
    # tags are repeated shuffle_train / data_augmentation times, so they vanish when 0.
    trials_file_name = 'DNN_hyperparameters_nl' + str(nlayers) + '_dat' + str(dataset) + \
                       '_YT' + str(years_test) + '_SF' * (shuffle_train) + \
                       '_DA' * (data_augmentation) + '_CW' + str(calibration_window) + \
                       '_' + str(experiment_id)

    trials_file_path = os.path.join(path_hyperparameters_folder, trials_file_name)

    # If hyperparameter optimization starts from scratch, new trials object is created. If not,
    # we read existing trials object
    if new_hyperopt:
        trials = Trials()
    else:
        # NOTE(security): unpickling can execute arbitrary code; only load trials files
        # that come from a trusted source.
        # The context manager closes the file handle even if loading fails.
        with open(trials_file_path, "rb") as trials_file:
            trials = pc.load(trials_file)

    # Generate training and test datasets
    dfTrain, dfTest = read_data(dataset=dataset, years_test=years_test, path=path_datasets_folder,
                                begin_test_date=begin_test_date, end_test_date=end_test_date)

    # Every column except one is counted as an exogenous input
    # (presumably the excluded column is the price -- confirm against read_data)
    n_exogenous_inputs = len(dfTrain.columns) - 1

    # Build hyperparameter search space. This includes hyperparameters and features
    space = _build_space(nlayers, data_augmentation, n_exogenous_inputs)

    # Bind the fixed configuration so that hyperopt only has to supply the sampled point
    fmin_objective = partial(_hyperopt_objective, trials=trials, trials_file_path=trials_file_path, 
                             max_evals=max_evals, nlayers=nlayers, dfTrain=dfTrain, dfTest=dfTest, 
                             shuffle_train=shuffle_train, dataset=dataset, 
                             data_augmentation=data_augmentation, calibration_window=calibration_window,
                             n_exogenous_inputs=n_exogenous_inputs)

    # Perform hyperparameter optimization
    fmin(fmin_objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials, verbose=False)

Exemple #4

0

Afficher le fichier

Fichier : _dnn.py Projet : FedericoGarza/epftoolbox

def evaluate_dnn_in_test_dataset(
        experiment_id,
        path_datasets_folder=os.path.join('.', 'datasets'),
        path_hyperparameter_folder=os.path.join('.', 'experimental_files'),
        path_recalibration_folder=os.path.join('.', 'experimental_files'),
        nlayers=2,
        dataset='PJM',
        years_test=2,
        shuffle_train=True,
        data_augmentation=0,
        calibration_window=4,
        new_recalibration=False,
        begin_test_date=None,
        end_test_date=None):
    """Function for easy evaluation of the DNN model in a test dataset using daily recalibration. 
    
    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a DNN model considering daily recalibration
    and an optimal set of hyperparameters. 
    
    Note that before using this function, a hyperparameter optimization run must be done using the
    :class:`hyperparameter_optimizer` function. Moreover, the hyperparameter optimization must be done
    using the same parameters: ``nlayers``, ``dataset``, ``shuffle_train``, 
    ``data_augmentation``, ``calibration_window``, and either the ``years_test`` or the same
    ``begin_test_date``/``end_test_date``
    
    An example on how to use this function is provided :ref:`here<dnnex2>`.

    Parameters
    ----------
    experiment_id : str
        Unique identifier to read the trials file. In particular, every hyperparameter optimization 
        set has a unique identifier associated with it. See :class:`hyperparameter_optimizer` for
        further details
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet, the path where the datasets 
        are to be stored
    path_hyperparameter_folder : str, optional
        Path of the folder containing the trials file with the optimal hyperparameters
    path_recalibration_folder : str, optional
        Path to save the forecast of the test dataset. The folder is created if it does not exist
    nlayers : int, optional
        Number of hidden layers in the neural network
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets, 
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically
        downloaded. If the name is different, a dataset with a csv format should be placed in
        the ``path_datasets_folder``.
    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if 
        the arguments begin_test_date and end_test_date are not provided.
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        end_test_date. If either of them is not provided, the test dataset is built using the 
        years_test argument. begin_test_date should either be a string with the following 
        format d/m/Y H:M, or a datetime object
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        begin_test_date. If either of them is not provided, the test dataset is built using the 
        years_test argument. end_test_date should either be a string with the following 
        format d/m/Y H:M, or a datetime object       
    shuffle_train : bool, optional
        Boolean that selects whether the validation and training datasets were shuffled when
        performing the hyperparameter optimization. Note that it does not select whether
        shuffling is used for recalibration as for recalibration the validation and the
        training datasets are always shuffled.
    data_augmentation : bool, optional
        Boolean that selects whether a data augmentation technique for electricity price forecasting
        is employed
    calibration_window : int, optional
        Number of days used in the training/validation dataset for recalibration
    new_recalibration : bool, optional
        Boolean that selects whether a new recalibration is performed or the function re-starts an old one.
        To restart an old one, the .csv file with the forecast must exist in the 
        ``path_recalibration_folder`` folder 
    
    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to the folder ``path_recalibration_folder``
    """

    # Create the recalibration directory if needed. exist_ok avoids the race between a
    # separate existence check and the creation call.
    os.makedirs(path_recalibration_folder, exist_ok=True)

    # Defining train and testing data
    df_train, df_test = read_data(dataset=dataset,
                                  years_test=years_test,
                                  path=path_datasets_folder,
                                  begin_test_date=begin_test_date,
                                  end_test_date=end_test_date)

    # Defining unique name to save the forecast. The '_DA' tag is repeated
    # data_augmentation times, so it vanishes when data_augmentation is 0.
    forecast_file_name = 'DNN_forecast_nl' + str(nlayers) + '_dat' + str(dataset) + \
                         '_YT' + str(years_test) + '_SFH' + str(shuffle_train) + \
                         '_DA' * data_augmentation + '_CW' + str(calibration_window) + \
                         '_' + str(experiment_id) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder,
                                      forecast_file_name)

    # Defining empty forecast array and the real values to be predicted in a more friendly
    # format: one row per day (every 24th timestamp) and one column per hour
    forecast = pd.DataFrame(index=df_test.index[::24],
                            columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values,
                               index=forecast.index,
                               columns=forecast.columns)

    # If we are not starting a new recalibration but re-starting an old one, we import the
    # existing files and print metrics
    if not new_recalibration:
        # Import existing forecasting file
        forecast = pd.read_csv(forecast_file_path, index_col=0)
        forecast.index = pd.to_datetime(forecast.index)

        # Reading dates to still be forecasted by checking NaN values
        forecast_dates = forecast[forecast.isna().any(axis=1)].index

        # If all the dates to be forecasted have already been forecast, we print the final
        # metrics and return right away so that the DNN model is not built needlessly
        if len(forecast_dates) == 0:

            mae = np.mean(MAE(forecast.values.squeeze(), real_values.values))
            smape = np.mean(
                sMAPE(forecast.values.squeeze(), real_values.values)) * 100
            print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(
                'Final metrics', smape, mae))

            return forecast

    else:
        forecast_dates = forecast.index

    model = DNN(experiment_id=experiment_id,
                path_hyperparameter_folder=path_hyperparameter_folder,
                nlayers=nlayers,
                dataset=dataset,
                years_test=years_test,
                shuffle_train=shuffle_train,
                data_augmentation=data_augmentation,
                calibration_window=calibration_window)

    # Offset from the first to the last hourly timestamp of a day
    last_hour_offset = pd.Timedelta(hours=23)

    # For loop over the recalibration dates
    for date in forecast_dates:
        day_end = date + last_hour_offset

        # For simulation purposes, we assume that the available data is
        # the data up to current date where the prices of current date are not known
        data_available = pd.concat(
            [df_train, df_test.loc[:day_end, :]], axis=0)

        # We set the real prices for current date to NaN in the dataframe of available data.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        data_available.loc[date:day_end, 'Price'] = np.nan

        # Recalibrating the model with the most up-to-date available data and making a prediction
        # for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available,
                                                     next_day_date=date)

        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        mae = np.mean(
            MAE(forecast.loc[:date].values.squeeze(),
                real_values.loc[:date].values))
        smape = np.mean(
            sMAPE(forecast.loc[:date].values.squeeze(),
                  real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(
            str(date)[:10], smape, mae))

        # Saving forecast after every day so that an interrupted run can be re-started
        forecast.to_csv(forecast_file_path)

    return forecast

Exemple #5

0

Afficher le fichier

Fichier : _lear.py Projet : valeman/epftoolbox

def evaluate_lear_in_test_dataset(path_datasets_folder=os.path.join('.', 'datasets'), 
                                  path_recalibration_folder=os.path.join('.', 'experimental_files'),
                                  dataset='PJM', years_test=2, calibration_window=364 * 3, 
                                  begin_test_date=None, end_test_date=None):
    """Function for easy evaluation of the LEAR model in a test dataset using daily recalibration. 
    
    The test dataset is defined by a market name and the test dates. The function
    generates the test and training datasets, and evaluates a LEAR model considering daily recalibration. 
    
    An example on how to use this function is provided :ref:`here<learex1>`.   

    Parameters
    ----------
    path_datasets_folder : str, optional
        Path where the datasets are stored or, if they do not exist yet,
        the path where the datasets are to be stored.
    
    path_recalibration_folder : str, optional
        Path to save the files of the experiment dataset. The folder is created if it
        does not exist.
    
    dataset : str, optional
        Name of the dataset/market under study. If it is one of the standard markets, 
        i.e. ``"PJM"``, ``"NP"``, ``"BE"``, ``"FR"``, or ``"DE"``, the dataset is automatically
        downloaded. If the name is different, a dataset with a csv format should be placed in
        the ``path_datasets_folder``.

    years_test : int, optional
        Number of years (a year is 364 days) in the test dataset. It is only used if 
        the arguments ``begin_test_date`` and ``end_test_date`` are not provided.
    
    calibration_window : int, optional
        Number of days used in the training dataset for recalibration.
    
    begin_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``end_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``begin_test_date`` should either be a string with the following 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.
    
    end_test_date : datetime/str, optional
        Optional parameter to select the test dataset. Used in combination with the argument
        ``begin_test_date``. If either of them is not provided, the test dataset is built using the 
        ``years_test`` argument. ``end_test_date`` should either be a string with the following 
        format ``"%d/%m/%Y %H:%M"``, or a datetime object.       
    
    Returns
    -------
    pandas.DataFrame
        A dataframe with all the predictions in the test dataset. The dataframe is also
        written to ``path_recalibration_folder``.
    """

    # Create the recalibration directory if needed. exist_ok avoids the race between a
    # separate existence check and the creation call.
    os.makedirs(path_recalibration_folder, exist_ok=True)

    # Defining train and testing data
    df_train, df_test = read_data(dataset=dataset, years_test=years_test, path=path_datasets_folder,
                                  begin_test_date=begin_test_date, end_test_date=end_test_date)

    # Defining unique name to save the forecast
    forecast_file_name = 'LEAR_forecast' + '_dat' + str(dataset) + '_YT' + str(years_test) + \
                         '_CW' + str(calibration_window) + '.csv'

    forecast_file_path = os.path.join(path_recalibration_folder, forecast_file_name)

    # Defining empty forecast array and the real values to be predicted in a more friendly
    # format: one row per day (every 24th timestamp) and one column per hour
    forecast = pd.DataFrame(index=df_test.index[::24], columns=['h' + str(k) for k in range(24)])
    real_values = df_test.loc[:, ['Price']].values.reshape(-1, 24)
    real_values = pd.DataFrame(real_values, index=forecast.index, columns=forecast.columns)

    forecast_dates = forecast.index

    model = LEAR(calibration_window=calibration_window)

    # Offset from the first to the last hourly timestamp of a day
    last_hour_offset = pd.Timedelta(hours=23)

    # For loop over the recalibration dates
    for date in forecast_dates:
        day_end = date + last_hour_offset

        # For simulation purposes, we assume that the available data is
        # the data up to current date where the prices of current date are not known
        data_available = pd.concat([df_train, df_test.loc[:day_end, :]], axis=0)

        # We set the real prices for current date to NaN in the dataframe of available data.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        data_available.loc[date:day_end, 'Price'] = np.nan

        # Recalibrating the model with the most up-to-date available data and making a prediction
        # for the next day
        Yp = model.recalibrate_and_forecast_next_day(df=data_available, next_day_date=date, 
                                                     calibration_window=calibration_window)
        # Saving the current prediction
        forecast.loc[date, :] = Yp

        # Computing metrics up-to-current-date
        mae = np.mean(MAE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values)) 
        smape = np.mean(sMAPE(forecast.loc[:date].values.squeeze(), real_values.loc[:date].values)) * 100

        # Printing information
        print('{} - sMAPE: {:.2f}%  |  MAE: {:.3f}'.format(str(date)[:10], smape, mae))

        # Saving forecast after every day, so partial results are on disk if the run stops
        forecast.to_csv(forecast_file_path)

    return forecast