import os

import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pathlib import Path

import src.constants.files as files
import src.constants.models as md
import src.constants.columns as c
from src.evaluation.evaluation import prepare_data_for_deepar_plot
from src.evaluation.plots import plot_deepar_forecasts
from src.deepar.deepar_core import predictor_path, make_predictions

# Output folders for the DeepAR stability study.
# files.create_folder presumably creates the directory if missing and
# returns its path — TODO confirm against src.constants.files.
STABILITY_STUDY_PATH = files.create_folder(
    os.path.join(files.OUTPUT_DATA, "deepar_stability_study"))
MODEL_STABILITY_STUDY_PLOTS = files.create_folder(
    os.path.join(STABILITY_STUDY_PATH, "model_stability_study_plots"))


def model_stability_study_results_path(fixed_seeds=False):
    """Return the path of the model stability study results CSV.

    :param fixed_seeds: when True, point to the results file of the
        fixed-seeds variant of the study.
    :return: path (str) of the results file inside STABILITY_STUDY_PATH.
    """
    if fixed_seeds:
        stability_results_file = "fixed_seeds_model_stability_study_results.csv"
    else:
        stability_results_file = "model_stability_study_results.csv"
    return os.path.join(STABILITY_STUDY_PATH, stability_results_file)


NUM_EVAL_SAMPLES_STABILITY_STUDY_PLOTS = files.create_folder(
    os.path.join(STABILITY_STUDY_PATH, "num_eval_samples_stability_study_plots"))
from gluonts.model.deepar import DeepAREstimator
from gluonts.trainer import Trainer
from gluonts.dataset.common import ListDataset
from gluonts.evaluation.backtest import make_evaluation_predictions
from datetime import timedelta
import pickle
import os
import src.constants.models as md
import src.constants.files as files
import logging

# Folder where trained DeepAR predictors are stored (created on import).
DEEPAR_MODELS_PATH = files.create_folder(os.path.join(files.MODELS, "deepar"))


def train_predictor(region_df_dict, end_train_date, regions_list, max_epochs,
                    learning_rate, target_col, feat_dynamic_cols=None):
    """Train a GluonTS DeepAR predictor on per-region time series.

    :param region_df_dict: dict mapping region name to a DataFrame indexed by
        timestamp — presumably one series per region; verify against callers.
    :param end_train_date: last date (inclusive bound of the label-based slice)
        used for training targets.
    :param regions_list: regions to train on.
    :param max_epochs: number of training epochs passed to the Trainer.
    :param learning_rate: initial learning rate passed to the Trainer.
    :param target_col: name of the column holding the target series.
    :param feat_dynamic_cols: optional list of dynamic feature columns; when
        given, the estimator is built with use_feat_dynamic_real=True.

    NOTE(review): this chunk is truncated — the ListDataset construction below
    is cut off mid-expression and the rest of the function is not visible here.
    """
    # Frequency, prediction horizon and LR decay come from project constants.
    estimator = DeepAREstimator(
        freq=md.FREQ,
        prediction_length=md.NB_HOURS_PRED,
        trainer=Trainer(epochs=max_epochs,
                        learning_rate=learning_rate,
                        learning_rate_decay_factor=md.LR_DECAY_FACTOR),
        # Dynamic real features are only enabled when columns are provided.
        use_feat_dynamic_real=feat_dynamic_cols is not None)
    if feat_dynamic_cols is not None:
        # Build one ListDataset entry per region, slicing targets up to
        # end_train_date (label-based slice, end inclusive in pandas).
        training_data = ListDataset(
            [{"item_id": region,
              "start": region_df_dict[region].index[0],
              "target": region_df_dict[region][target_col][:end_train_date],
import pickle
from time import time
import logging
import os
from src.prophet.prophet_core import format_training_data
import src.constants.files as files
import src.constants.models as md
import src.constants.columns as c
from fbprophet import Prophet

# Folder where trained Prophet models are stored (created on import).
PROPHET_MODELS_PATH = files.create_folder(os.path.join(files.MODELS, "prophet"))


def prophet_train():
    """Train a Prophet model on the Ile-de-France consumption series and
    pickle it to PROPHET_MODELS_PATH.

    Loads the per-region DataFrame dict from files.REGION_DF_DICT, formats
    the md.IDF region data for Prophet over [md.START_TRAIN_DATE,
    md.END_TRAIN_DATE], fits a yearly-seasonality Prophet model and dumps it.

    NOTE(review): start_time is set but unused in the visible code — this
    chunk appears truncated and the function presumably continues (e.g. to
    log elapsed time); confirm against the full file.
    """
    logging.info("Preparing data for Prophet training.")
    region_df_dict = pickle.load(open(files.REGION_DF_DICT, "rb"))
    df_dict = region_df_dict[md.IDF]
    df_prophet_train = format_training_data(df_dict, md.START_TRAIN_DATE, md.END_TRAIN_DATE)

    logging.info("Training Prophet model on 2 years.")
    start_time = time()
    model_energy = Prophet(yearly_seasonality=True)
    model_energy.fit(df_prophet_train)
    # Persist the fitted model for later evaluation runs.
    with open(os.path.join(PROPHET_MODELS_PATH, files.PROPHET_2_YEARS_MODEL), "wb") as file:
        pickle.dump(model_energy, file)
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from joblib import dump
import logging
import src.constants.files as files
import src.constants.columns as c
from src.lin_reg.lin_reg_core import split_train_test

# Folder where trained linear regression models are stored (created on import).
LIN_REG_MODELS_PATH = files.create_folder(os.path.join(files.MODELS, "lin_reg"))


def lin_reg_train():
    """
    Train linear regression with World GDP as input and World energy consumption as label to predict.
    :return: None

    NOTE(review): this chunk is truncated — only data loading, the
    train/test split and a log line are visible; the actual fitting with
    LinearRegression and dump() presumably follows. Confirm against the
    full file.
    """
    world_gdp_energy = pd.read_csv(
        os.path.join(files.INTERIM_DATA, files.GDP_ENERGY_DATA_CSV))

    train_df, test_df = split_train_test(world_gdp_energy)

    # Year bounds are only used for the log message below.
    min_year = train_df[c.EnergyConsumptionGDP.YEAR].min()
    max_year = train_df[c.EnergyConsumptionGDP.YEAR].max()
    logging.info(
        f"Training linear regression on {min_year} to {max_year} world GDP and energy data."
    )
import matplotlib
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd
from joblib import load
import src.constants.columns as c
import src.constants.files as files
import src.constants.models as md
from src.lin_reg.lin_reg_core import split_train_test
from src.lin_reg.lin_reg_train import LIN_REG_MODELS_PATH
from src.utils import mean_absolute_percentage_error

# Folder where linear regression plots are saved (created on import).
LIN_REG_PLOTS = files.create_folder(os.path.join(files.PLOTS, "lin_reg"))

# Jupytext/notebook cell markers below — keep them as-is.
# %matplotlib inline

# +
# Evaluation script: reload the interim GDP/energy data, re-create the same
# train/test split as training, load the persisted model and predict on the
# test set. NOTE(review): this chunk appears truncated — metric computation
# and plotting presumably follow; confirm against the full file.
world_gdp_energy = pd.read_csv(
    os.path.join(files.INTERIM_DATA, files.GDP_ENERGY_DATA_CSV))

train_df, test_df = split_train_test(world_gdp_energy)

lin_reg = load(
    os.path.join(LIN_REG_MODELS_PATH, "world_gdp_fuel_lin_reg_1965_to_2000.joblib"))

# sklearn expects a 2-D feature matrix, hence the reshape(-1, 1).
predictions = lin_reg.predict(
    test_df[c.EnergyConsumptionGDP.WORLD_GDP_BILLION_USD].values.reshape(
        -1, 1))
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import logging
from statsmodels.tsa.statespace.sarimax import SARIMAXResults
from src.core import mean_absolute_percentage_error
import src.constants.columns as c
import src.constants.files as files
import src.constants.models as md
from src.sarima.sarima_train import SARIMA_MODELS_PATH as SARIMA_MODELS_PATH

# Per-model plot output folders (created on import).
DEEPAR_PLOTS = files.create_folder(os.path.join(files.PLOTS, "deepar"))
PROPHET_PLOTS = files.create_folder(os.path.join(files.PLOTS, "prophet"))
SARIMA_PLOTS = files.create_folder(os.path.join(files.PLOTS, "sarima"))

# Shared matplotlib font sizes for axis labels and titles.
LABEL_FONTSIZE = 16
TITLE_FONTSIZE = 20


def plot_consumptions(region_df_dict, year, month):
    """Plot the consumption series of every region for a given year/month.

    :param region_df_dict: dict mapping region name to a DataFrame indexed
        by timestamp.
    :param year: year to filter on.
    :param month: month to filter on — presumably used in the truncated part
        of the filter below; confirm against the full file.

    NOTE(review): this chunk is cut off mid-expression (the boolean filter
    below is unterminated); the rest of the function is not visible here.
    """
    matplotlib.rcParams.update({'font.size': 22})
    plt.figure(1, figsize=(25, 12))
    for region in region_df_dict.keys():
        df_region = region_df_dict[region]
        # Materialize the index as a column so it can be filtered on below.
        df_region[c.EnergyConso.DATE_HEURE] = df_region.index
        df_region = df_region[(df_region[c.EnergyConso.DATE_HEURE].apply(lambda x: x.year)==year)
from datetime import timedelta
from itertools import product
import pickle
import statsmodels.api as sm
import os
import matplotlib.pyplot as plt
import src.constants.files as files
import src.constants.models as md
import src.constants.columns as c
from src.sarima.sarima_core import tsplot, optimize_arima

# Folder where trained SARIMA models are stored (created on import);
# diagnostic plots are written to the same folder.
SARIMA_MODELS_PATH = files.create_folder(os.path.join(files.MODELS, "sarima"))
PLOTS_PATH = SARIMA_MODELS_PATH


def sarima_train(max_arima_param_range):
    """Train a SARIMA model on the last year of Ile-de-France consumption.

    :param max_arima_param_range: exclusive upper bound of the (S)ARIMA
        order grid search ranges built below.

    NOTE(review): this chunk is truncated — only part of the hyper-parameter
    grid (ps, d, qs, Ps, D) is visible and plot_stat_tests is defined outside
    this view; confirm the remainder against the full file.
    """
    region_df_dict = pickle.load(open(files.REGION_DF_DICT, "rb"))
    idf_df = region_df_dict[md.IDF]
    # Fill missing consumption values with the series mean before training.
    idf_df[c.EnergyConso.CONSUMPTION] = idf_df[c.EnergyConso.CONSUMPTION].fillna(idf_df[c.EnergyConso.CONSUMPTION].mean())
    # Train on the last 365 days before the end-of-training date
    # (label-based slice, bounds inclusive in pandas).
    idf_train = idf_df[md.END_TRAIN_DATE - timedelta(days=365):md.END_TRAIN_DATE]
    plot_stat_tests(idf_train)

    # Grid-search ranges for the (p, d, q)(P, D, Q) orders; differencing
    # orders are fixed to 1.
    ps = range(1, max_arima_param_range)
    d = 1
    qs = range(1, max_arima_param_range)
    Ps = range(1, max_arima_param_range)
    D = 1
import pandas as pd
import os
import src.constants.files as files
import src.constants.columns as c
import src.constants.models as md
import logging

# Real project data folders (created on import if missing).
REAL_DATA_PATH = files.create_folder(
    os.path.join(files.PROJECT_ROOT_PATH, "data"))
REAL_RAW_DATA_PATH = files.create_folder(os.path.join(REAL_DATA_PATH, "raw"))
# Raw-data folder of the integration test fixture (not auto-created here).
TEST_RAW_DATA_PATH = os.path.join(files.PROJECT_ROOT_PATH, "tests",
                                  "free_integration_test", "data", "raw")


def prepare_raw_test_data(force_recompute=False):
    """Sync raw CSV files from the real data folder into the test fixture.

    Lists the .csv files present in REAL_RAW_DATA_PATH and TEST_RAW_DATA_PATH
    and computes which files exist only on the real side.

    :param force_recompute: presumably forces re-copying even when files
        already exist in the test folder — TODO confirm; this chunk is
        truncated right after the `if force_recompute:` line.
    """
    real_raw_csv_files = [
        file for file in os.listdir(REAL_RAW_DATA_PATH) if file.endswith(".csv")
    ]
    test_raw_csv_files = [
        file for file in os.listdir(TEST_RAW_DATA_PATH) if file.endswith(".csv")
    ]
    # Files present in the real raw folder but missing from the test fixture.
    files_to_copy_in_test = [
        file for file in real_raw_csv_files if file not in test_raw_csv_files
    ]
    if force_recompute: