# Example #1
# 0
import os
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

import src.constants.files as files
import src.constants.models as md
import src.constants.columns as c

from src.evaluation.evaluation import prepare_data_for_deepar_plot
from src.evaluation.plots import plot_deepar_forecasts
from src.deepar.deepar_core import predictor_path, make_predictions

# Output folders for the DeepAR stability study.
# files.create_folder presumably creates the directory if missing and
# returns its path — confirm in src.constants.files.
STABILITY_STUDY_PATH = files.create_folder(
    os.path.join(files.OUTPUT_DATA, "deepar_stability_study"))

MODEL_STABILITY_STUDY_PLOTS = files.create_folder(os.path.join(STABILITY_STUDY_PATH, "model_stability_study_plots"))

def model_stability_study_results_path(fixed_seeds=False):
    """
    Return the path of the CSV holding model stability study results.

    :param fixed_seeds: if True, point to the fixed-seeds variant of the
        results file; otherwise to the default one.
    :return: full path inside STABILITY_STUDY_PATH.
    """
    prefix = "fixed_seeds_" if fixed_seeds else ""
    return os.path.join(
        STABILITY_STUDY_PATH, prefix + "model_stability_study_results.csv")


# Folder for the plots of the num_eval_samples stability study.
NUM_EVAL_SAMPLES_STABILITY_STUDY_PLOTS = files.create_folder(
    os.path.join(STABILITY_STUDY_PATH, "num_eval_samples_stability_study_plots"))
from gluonts.model.deepar import DeepAREstimator
from gluonts.trainer import Trainer
from gluonts.dataset.common import ListDataset
from gluonts.evaluation.backtest import make_evaluation_predictions

from datetime import timedelta
import pickle
import os

import src.constants.models as md
import src.constants.files as files

import logging

# Folder where trained DeepAR predictors are stored.
DEEPAR_MODELS_PATH = files.create_folder(os.path.join(files.MODELS, "deepar"))


def train_predictor(region_df_dict, end_train_date, regions_list, max_epochs, learning_rate, target_col,
                    feat_dynamic_cols=None):

    estimator = DeepAREstimator(freq=md.FREQ,
                                prediction_length=md.NB_HOURS_PRED,
                                trainer=Trainer(epochs=max_epochs, learning_rate=learning_rate,
                                                learning_rate_decay_factor=md.LR_DECAY_FACTOR),
                                use_feat_dynamic_real=feat_dynamic_cols is not None)
    if feat_dynamic_cols is not None:

        training_data = ListDataset(
            [{"item_id": region,
              "start": region_df_dict[region].index[0],
              "target": region_df_dict[region][target_col][:end_train_date],
import pickle
from time import time
import logging
import os

from src.prophet.prophet_core import format_training_data
import src.constants.files as files
import src.constants.models as md
import src.constants.columns as c

from fbprophet import Prophet

# Folder where trained Prophet models are pickled.
PROPHET_MODELS_PATH = files.create_folder(os.path.join(files.MODELS,
                                                       "prophet"))


def prophet_train():
    """
    Train a Prophet model on the Île-de-France consumption data and save it.

    Loads the regional dataframe dict from disk, formats the IDF series for
    Prophet over [START_TRAIN_DATE, END_TRAIN_DATE], fits a model with yearly
    seasonality and pickles it under PROPHET_MODELS_PATH.

    :return: None
    """
    logging.info("Preparing data for Prophet training.")
    # Use a context manager so the pickle file handle is always closed
    # (the original called pickle.load(open(...)) and leaked the handle).
    with open(files.REGION_DF_DICT, "rb") as dict_file:
        region_df_dict = pickle.load(dict_file)
    df_dict = region_df_dict[md.IDF]

    df_prophet_train = format_training_data(df_dict, md.START_TRAIN_DATE,
                                            md.END_TRAIN_DATE)

    logging.info("Training Prophet model on 2 years.")
    # NOTE(review): start_time is never read in this block — kept for
    # backward compatibility in case downstream code was trimmed.
    start_time = time()
    model_energy = Prophet(yearly_seasonality=True)
    model_energy.fit(df_prophet_train)
    with open(os.path.join(PROPHET_MODELS_PATH, files.PROPHET_2_YEARS_MODEL),
              "wb") as file:
        pickle.dump(model_energy, file)
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from joblib import dump
import logging

import src.constants.files as files
import src.constants.columns as c

from src.lin_reg.lin_reg_core import split_train_test

# Folder where trained linear regression models are saved (joblib dumps).
LIN_REG_MODELS_PATH = files.create_folder(os.path.join(files.MODELS,
                                                       "lin_reg"))


def lin_reg_train():
    """
    Train linear regression with World GDP as input and World energy consumption as label to predict.

    Reads the interim GDP/energy CSV, splits it into train and test sets and
    logs the training year range.
    NOTE(review): the actual model fit is not visible here — this block
    appears truncated; confirm against the full module.

    :return: None
    """
    world_gdp_energy = pd.read_csv(
        os.path.join(files.INTERIM_DATA, files.GDP_ENERGY_DATA_CSV))

    train_df, test_df = split_train_test(world_gdp_energy)
    # Year bounds are only used for the log message below.
    min_year = train_df[c.EnergyConsumptionGDP.YEAR].min()
    max_year = train_df[c.EnergyConsumptionGDP.YEAR].max()

    logging.info(
        f"Training linear regression on {min_year} to {max_year} world GDP and energy data."
    )
import matplotlib
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from joblib import load

import src.constants.columns as c
import src.constants.files as files
import src.constants.models as md

from src.lin_reg.lin_reg_core import split_train_test
from src.lin_reg.lin_reg_train import LIN_REG_MODELS_PATH
from src.utils import mean_absolute_percentage_error

# Folder for linear regression evaluation plots.
LIN_REG_PLOTS = files.create_folder(os.path.join(files.PLOTS, "lin_reg"))

# %matplotlib inline

# +
# Notebook-style evaluation script (jupytext cell markers above): load the
# GDP/energy data, reuse the same train/test split as in training, load the
# persisted model and predict on the held-out rows.
world_gdp_energy = pd.read_csv(
    os.path.join(files.INTERIM_DATA, files.GDP_ENERGY_DATA_CSV))

train_df, test_df = split_train_test(world_gdp_energy)

# joblib file name indicates the model was trained on 1965-2000 data.
lin_reg = load(
    os.path.join(LIN_REG_MODELS_PATH,
                 "world_gdp_fuel_lin_reg_1965_to_2000.joblib"))
# sklearn expects a 2D feature matrix, hence reshape(-1, 1).
predictions = lin_reg.predict(
    test_df[c.EnergyConsumptionGDP.WORLD_GDP_BILLION_USD].values.reshape(
        -1, 1))
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAXResults

from src.core import mean_absolute_percentage_error
import src.constants.columns as c
import src.constants.files as files
import src.constants.models as md
from src.sarima.sarima_train import SARIMA_MODELS_PATH as SARIMA_MODELS_PATH


# One evaluation-plot folder per model family.
DEEPAR_PLOTS = files.create_folder(os.path.join(files.PLOTS, "deepar"))
PROPHET_PLOTS = files.create_folder(os.path.join(files.PLOTS, "prophet"))
SARIMA_PLOTS = files.create_folder(os.path.join(files.PLOTS, "sarima"))

# Shared font sizes for axis labels and titles across all plots.
LABEL_FONTSIZE = 16
TITLE_FONTSIZE = 20


def plot_consumptions(region_df_dict, year, month):
    matplotlib.rcParams.update({'font.size': 22})

    plt.figure(1, figsize=(25, 12))
    for region in region_df_dict.keys():
        df_region = region_df_dict[region]
        df_region[c.EnergyConso.DATE_HEURE] = df_region.index
        df_region = df_region[(df_region[c.EnergyConso.DATE_HEURE].apply(lambda x: x.year)==year)
# Example #7
# 0
from datetime import timedelta
from itertools import product
import pickle
import statsmodels.api as sm
import os
import matplotlib.pyplot as plt

import src.constants.files as files
import src.constants.models as md
import src.constants.columns as c

from src.sarima.sarima_core import tsplot, optimize_arima

# SARIMA models and their diagnostic plots share the same folder.
SARIMA_MODELS_PATH = files.create_folder(os.path.join(files.MODELS, "sarima"))
PLOTS_PATH = SARIMA_MODELS_PATH


def sarima_train(max_arima_param_range):
    """
    Prepare IDF consumption data and set up the SARIMA hyper-parameter grid.

    NOTE(review): the grid-search/fit steps are not visible here — this
    block appears truncated; confirm against the full module.

    :param max_arima_param_range: exclusive upper bound for the p/q/P orders
        explored during the grid search.
    """
    region_df_dict = pickle.load(open(files.REGION_DF_DICT, "rb"))
    idf_df = region_df_dict[md.IDF]
    # Mean-impute missing consumption values before fitting.
    idf_df[c.EnergyConso.CONSUMPTION] = idf_df[c.EnergyConso.CONSUMPTION].fillna(idf_df[c.EnergyConso.CONSUMPTION].mean())

    # Restrict training to the last 365 days before the train/test cutoff.
    idf_train = idf_df[md.END_TRAIN_DATE - timedelta(days=365):md.END_TRAIN_DATE]

    plot_stat_tests(idf_train)

    # Candidate orders for the seasonal ARIMA grid search; non-seasonal and
    # seasonal differencing are both fixed to 1.
    ps = range(1, max_arima_param_range)
    d = 1
    qs = range(1, max_arima_param_range)
    Ps = range(1, max_arima_param_range)
    D = 1
import pandas as pd
import os

import src.constants.files as files
import src.constants.columns as c
import src.constants.models as md

import logging

# Real project data lives under <project root>/data/raw; the integration
# test keeps its own raw CSV copies under tests/free_integration_test/data/raw.
REAL_DATA_PATH = files.create_folder(
    os.path.join(files.PROJECT_ROOT_PATH, "data"))
REAL_RAW_DATA_PATH = files.create_folder(os.path.join(REAL_DATA_PATH, "raw"))
TEST_RAW_DATA_PATH = os.path.join(files.PROJECT_ROOT_PATH, "tests",
                                  "free_integration_test", "data", "raw")


def prepare_raw_test_data(force_recompute=False):
    real_raw_csv_files = [
        file for file in os.listdir(REAL_RAW_DATA_PATH)
        if file.endswith(".csv")
    ]
    test_raw_csv_files = [
        file for file in os.listdir(TEST_RAW_DATA_PATH)
        if file.endswith(".csv")
    ]

    files_to_copy_in_test = [
        file for file in real_raw_csv_files if file not in test_raw_csv_files
    ]

    if force_recompute: