def mod(df, p, d, q, P, D, Q, lag):
    """Fit a SARIMAX model on ``df['Y']`` and show residual diagnostics.

    :param df: DataFrame with a ``Y`` column; its index is used as the x-axis
        of the residual plot.
    :param p, d, q: non-seasonal order passed as ``order=(p, d, q)``.
    :param P, D, Q, lag: seasonal order passed as
        ``seasonal_order=(P, D, Q, lag)``.
    :return: None. Prints the model summary, plots the residuals and calls
        ``plot_acf_pacf`` on the residuals.
    """
    model = SARIMAX(endog=df['Y'].values,
                    order=(p, d, q),
                    seasonal_order=(P, D, Q, lag)).fit()
    # print() call form works on Python 3 (the statement form is a SyntaxError).
    print(model.summary())
    plt.plot(df.index, model.resid)
    plt.show()
    # The first 7 residuals are dropped before the ACF/PACF plot (21 is passed
    # through to plot_acf_pacf as its second argument).
    plot_acf_pacf(model.resid[7:], 21)
class Sarimax:
    """Fixed-order SARIMAX wrapper around a single target column.

    The series is read from ``df[cfg['target_feature']]`` and modelled with
    ``order=(3, 1, 0)`` and ``seasonal_order=(0, 0, 0, 12)``.
    """

    def __init__(self, df, cfg):
        target_column = cfg['target_feature']
        self.series = df[target_column]
        self.model = SARIMAX(
            self.series,
            order=(3, 1, 0),
            seasonal_order=(0, 0, 0, 12),
        )

    def fit_model(self):
        """Fit the model silently and print its summary.

        ``self.model`` is replaced by the fitted results object.
        """
        fitted = self.model.fit(disp=0)
        self.model = fitted
        print(fitted.summary())

    def plot_autocorrelation(self):
        """Show the autocorrelation plot of the stored series."""
        autocorrelation_plot(self.series)
        plt.show()

    def predict_arima(self, series):
        """Forward ``series`` as the only argument to ``self.model.predict``."""
        prediction = self.model.predict(series)
        return prediction
def Auto_Arima(df, dirloc, filename):
    """Grid-search SARIMA orders by AIC, refit the best one and forecast.

    Every (p, d, q) in ``range(0, 3)`` cubed is combined with every seasonal
    (P, D, Q, 12) from the same grid; each combination is fitted and its AIC
    recorded. The lowest-AIC combination is refitted and used to predict
    2020-06-01 through 2020-12-01.

    :param df: monthly series ("MS" frequency) passed to SARIMAX as endog.
    :param dirloc: directory string; its last five characters are dropped
        before being joined with ``filename``.
    :param filename: name of the text file the results are appended to.
    :return: DataFrame of all tried orders sorted by ascending AIC.
    """
    import itertools
    from statsmodels.tsa.statespace.sarimax import SARIMAX

    p = d = q = range(0, 3)
    pdq = list(itertools.product(p, d, q))
    # Seasonal candidates reuse the same grid with a 12-period season.
    seas_decomp = [(x[0], x[1], x[2], 12) for x in pdq]

    print("Computating AIC of Different Sesonal ARIMA.....\n")
    arima_order = []
    seas_order = []
    aic_val = []
    for params in pdq:
        for seas_par in seas_decomp:
            mod = SARIMAX(df, order=params, seasonal_order=seas_par,
                          enforce_stationarity=False,
                          enforce_invertibility=False, freq="MS").fit()
            aic = round(mod.aic, 2)
            arima_order.append(params)
            seas_order.append(seas_par)
            aic_val.append(aic)
            print("SARIMA: {} X {} | AIC = {}".format(params, seas_par, aic))

    results = pd.DataFrame({"ARIMA Order": arima_order,
                            "Seasonal Order": seas_order,
                            "AIC Value": aic_val})
    results_sorted = results.sort_values(by="AIC Value", ascending=True)
    results_sorted = results_sorted.reset_index(drop=True)
    print("Selected SARIMA Order:", results_sorted.head(2))

    best_order = results_sorted["ARIMA Order"][0]
    best_seasonal_order = results_sorted["Seasonal Order"][0]
    # The keyword must be spelled `seasonal_order`; a misspelling leaves the
    # final model without the selected seasonal component.
    final_model = SARIMAX(df, order=best_order,
                          seasonal_order=best_seasonal_order,
                          enforce_stationarity=False,
                          enforce_invertibility=False, freq="MS").fit()
    print("Final Model Result Summary {}".format(final_model.summary()))
    print(best_order)
    print(best_seasonal_order)

    predictions = final_model.predict(
        start=dt.datetime.strptime("2020-06-01", "%Y-%m-%d"),
        end=dt.datetime.strptime("2020-12-01", "%Y-%m-%d"))
    print("Average Monthly WTI Crude Oil Spot Price from June to Dec 2020:")
    print(predictions)

    # Append to the file named by the `filename` argument; the context
    # manager closes the file, so no explicit close() is needed.
    with open(os.path.join(dirloc[:-5], filename), "a") as f:
        f.write("Simulation Result of SARIMA....\n")
        f.write(str(results_sorted))
        f.write("\n")
        f.write(str(predictions))
    return results_sorted
''' # Prepare the data X = timeseriesgenerator y = from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # ========================= SARIMAX ========================= from statsmodels.tsa.statespace.sarimax import SARIMAX mod = SARIMAX(data['ln_wpi'], trend='c', order=(1,1,(1,0,0,1))) mod = mod.fit(X_train) print(mod.summary()) pred = mod.predict(X_test) plt.plot(X, y) plt.plot(X_test, pred) # ========================= XGBoost ========================= from xgboost import XGBRegressor xgb = XGBRegressor() xgb.fit(X_train_scaled, y_train) pred = xgb.predict(X_test)
# Forward-fill missing observations (DataFrame.ffill replaces the deprecated
# fillna(method='ffill') spelling and does the same thing).
df_comp = df_comp.ffill()

# -- expose the FTSE series under the generic name `market_value`
df_comp['market_value'] = df_comp.ftse

# split dataset (on straight data = prices): first 80% train, last 20% test
# ----------
size = int(len(df_comp) * 0.8)
df = df_comp.iloc[:size]
df_test = df_comp.iloc[size:]

# review ACF and PACF (in reality it is more practical to run auto_arima than
# to check ACF/PACF manually, but this is for the sake of example)
# ----------
# not done here

# run SARIMAX model using S&P500 values as exogenous factor to explain FTSE values
# ----------
model_sarimax = SARIMAX(df.market_value,
                        order=(1, 0, 1),
                        seasonal_order=(2, 0, 1, 5),
                        exog=df.spx).fit()
print(model_sarimax.summary())
print('----------')
# %%
fig, (ax7, ax8) = plt.subplots(1, 2, figsize=(16, 4))
plot_acf(airpassengers_train, ax7)
# Titles go on the axes created by this cell.
ax7.set_title('ACF of seasonal series')
plot_pacf(airpassengers_train, ax8)
ax8.set_title('PACF of seasonal series')
plt.show()

# %%
sarimax = SARIMAX(airpassengers_train,
                  order=(3, 1, 1),
                  seasonal_order=(0, 1, 0, 12)).fit()
sarimax.summary()

# %%
sarimax.plot_diagnostics(figsize=(16, 8))
plt.show()

# %%
# 24-step out-of-sample forecast and its confidence interval.
sarimax_forecast = sarimax.get_forecast(24)
sarimax_forecast_conf_int = sarimax_forecast.conf_int()

# %%
plt.plot(airpassengers_train, label='train')
plt.plot(airpassengers_test, label='test')
# ---------------------------
# Training the model
# ---------------------------
# ARIMA(1, 1, 1) with seasonal component (1, 1, 1, 52); stationarity and
# invertibility constraints are both switched off.
_sarima_spec = SARIMAX(
    series,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 52),
    trend='n',
    enforce_stationarity=False,
    enforce_invertibility=False,
)
model = _sarima_spec.fit()

print("________________________")
print("MODEL SUMMARY")
_coefficient_table = model.summary().tables[1]
print(_coefficient_table)

# Diagnostic plots: a quick way to check whether residuals look Gaussian.
model.plot_diagnostics(figsize=(15, 12))
plt.show()

# In-sample predictions, plus an independent copy.
train_pred = model.predict()
train_pred_cpy = train_pred.copy()
print(train_pred_cpy)
print(type(train_pred_cpy))
print(type(series))

# Index of the first `train_size` rows of a_organic.
cdf_index = a_organic[0:train_size].index
def mod_sarima(train, test, dependent_var_col, trend, p, d, q, P, D, Q, S,
               is_log, outpath, name, xreg, plot_regressors,
               mle_regression=True, time_varying_regression=False,
               periodicity='daily'):
    """
    Train a SARIMA(X) model on ``train``, forecast over ``test`` and report.

    trend, (p, d, q) and (P, D, Q, S) are the statsmodels SARIMAX arguments.
    Diagnostics, a matplotlib forecast figure and a plotly HTML figure are
    written under ``outpath + name + '/'``. Note that a ``Forecast`` column is
    added in place to both ``train`` and ``test``.

    :param train (Pandas Dataframe): train data
    :param test (Pandas Dataframe): test data
    :param dependent_var_col (str): name of the column holding the objective variable
    :param trend (str): Parameter controlling the deterministic trend polynomial A(t)
    :param p (int): Autoregressive parameter
    :param d (int): Differencing parameter
    :param q (int): Moving Average parameter
    :param P (int): Seasonal Autoregressive parameter
    :param D (int): Seasonal Differencing parameter
    :param Q (int): Seasonal Moving Average parameter
    :param S (int): Lags for the seasonal
    :param is_log (bool): True if the series is in logarithm; results are then
        transformed back with exp before computing the errors.
    :param outpath (str): path where the results will be stored
    :param name (str): name to use when saving the files returned by the model
    :param xreg (list): list of strings with names of columns in the test/train
        datasets to be used as regressors (empty list for none)
    :param plot_regressors (bool): whether the regressors should be plotted
    :param mle_regression (bool): passed through to SARIMAX
    :param time_varying_regression (bool): passed through to SARIMAX
    :param periodicity (str): when equal to 'daily' (case-insensitive) the
        plotted frame is reindexed onto a gap-free daily date range
    :return:
        mae_error (float): Mean Absolute Error
        rmse_error (float): Root Mean Squared Error
        mape (float): Mean Absolute Percentage Error
        name (str): the ``name`` argument, unchanged
        preds (Pandas Series): predictions over the test period (back-transformed if is_log)
        conf_intervals (Pandas Dataframe): prediction intervals (alpha=0.5), indexed by date string
    """
    order_label = ('(' + str(p) + ',' + str(d) + ',' + str(q) + ')' + 'S' +
                   '(' + str(P) + ',' + str(D) + ',' + str(Q) + ')' + str(S))
    print('Modelling \n', name, ' Forecast - SARIMAX ' + order_label)

    # path definition
    if name not in os.listdir(outpath):
        os.mkdir(outpath + name)
        print('creating output folder in: \n', outpath + name)
    report_output_path = str(outpath) + str(name) + '/'

    # fit the model
    if len(xreg) == 0:
        mod = SARIMAX(train[dependent_var_col],
                      trend=trend,
                      order=(p, d, q),
                      seasonal_order=(P, D, Q, S),
                      time_varying_regression=time_varying_regression,
                      mle_regression=mle_regression).fit()
    else:
        mod = SARIMAX(train[dependent_var_col],
                      trend=trend,
                      order=(p, d, q),
                      seasonal_order=(P, D, Q, S),
                      exog=train[xreg],
                      enforce_stationarity=False,
                      time_varying_regression=time_varying_regression,
                      mle_regression=mle_regression).fit()

    # plot diagnostics
    plt.figure()
    plt.title('Plot diagnostics for' + dependent_var_col + ' Forecast - SARIMA ' + order_label)
    mod.plot_diagnostics(figsize=(15, 9), lags=40)
    plt.savefig(report_output_path + 'diagnostics_' + name + '.png')

    # predict with the model
    # Indexes are reset to plain integers so that a time series with missing
    # dates can still be forecast by position.
    test_aux = test.copy(deep=True)
    # TODO: remove this parameter (regressors are scaled by 0.9 in level space)
    test_aux[xreg] = np.exp(test_aux[xreg])
    test_aux[xreg] = test_aux[xreg] * 0.9
    test_aux[xreg] = np.log(test_aux[xreg])
    test_aux.reset_index(drop=True, inplace=True)
    train_aux = train.copy(deep=True)
    train_aux.reset_index(drop=True, inplace=True)

    # get the predictions with the model
    first_step = train_aux.index.max() + 1
    last_step = train_aux.index.max() + 1 + test_aux.index.max()
    if len(xreg) == 0:
        predictions = mod.predict(first_step, end=last_step)
        conf_intervals = mod.get_prediction(first_step, end=last_step).conf_int(alpha=0.5)
    else:
        predictions = mod.predict(first_step, end=last_step, exog=test_aux[xreg])
        conf_intervals = mod.get_prediction(
            first_step, end=last_step, exog=test_aux[xreg]).conf_int(alpha=0.5)
    predictions.index = test.index
    conf_intervals.index = test.index

    conf_intervals = pd.DataFrame(conf_intervals)
    conf_intervals.index = conf_intervals.index.date
    conf_intervals.index = conf_intervals.index.map(str)

    # assign the predictions to the input dataframes to be used later in the plotting
    test['Forecast'] = predictions
    train['Forecast'] = mod.fittedvalues

    # keep the target, the forecast and the regressor columns for plotting
    columns = [dependent_var_col, 'Forecast']
    columns.append(xreg)
    columns = list(flatten(columns))
    train_aux = train[columns]
    test_aux = test[columns]
    test_aux = pd.merge(test_aux, conf_intervals, left_index=True, right_index=True)

    # transform the data back from logarithm if the series is in that scale
    res_df = pd.concat([train_aux, test_aux])
    if is_log is True:
        res_df['Forecast'] = np.exp(res_df['Forecast'])
        res_df[dependent_var_col] = np.exp(res_df[dependent_var_col])
        actual = np.exp(test[dependent_var_col])
        preds = np.exp(predictions)
    else:
        actual = test[dependent_var_col]
        preds = predictions
    mae_error = mean_absolute_error(actual, preds)
    rmse_error = np.sqrt(mean_squared_error(actual, preds))
    mape = mean_absolute_percentage_error(actual, preds)

    # Create a text box for the iteration results
    textstr = 'MAE:' + str(round(mae_error, 0)) + '\n' + 'MAPE:' + str(round(mape, 2))

    aux_res_df = res_df.tail(365)  # only plot the last 365 rows
    aux_res_df.index = pd.to_datetime(aux_res_df.index)
    # Compare by value and case-insensitively; an identity check against an
    # upper-cased string can never match 'daily'.
    if str(periodicity).lower() == 'daily':
        aux_res_df = aux_res_df.reindex(
            pd.date_range(aux_res_df.index.min(), aux_res_df.index.max()),
            fill_value=np.nan)

    # Upper and lower confidence intervals
    lower = aux_res_df[str('lower ' + str(dependent_var_col))]
    upper = aux_res_df[str('upper ' + str(dependent_var_col))]
    if is_log is True:
        lower = np.exp(lower)
        upper = np.exp(upper)

    # plot the figure with the prediction
    fig, ax = plt.subplots(figsize=(15, 10))
    plt.subplots_adjust(right=0.85, left=0.05, bottom=0.1)
    ax2 = ax.twinx()
    ax.plot(aux_res_df["Forecast"], color='darkred', label='Forecast')
    ax.plot(aux_res_df[dependent_var_col], color='darkblue', label='Real')
    if plot_regressors is True:
        for i in xreg:
            ax2.plot(aux_res_df[i], color='grey', alpha=0.4, label=str(i))
    ax.plot(lower, color='darkgreen', label='Lower', alpha=0.5)
    ax.plot(upper, color='darkgreen', label='Upper', alpha=0.5)
    ax.fill_between(upper.dropna().index, upper.dropna(), lower.dropna(),
                    facecolor='darkgreen', alpha=0.2, interpolate=False)
    ax.axvline(x=pd.to_datetime(test.index.min(), format='%Y-%m-%d'),
               color='grey', linestyle='--')
    ax.xaxis.set_major_locator(mticker.MultipleLocator(30))
    plt.gcf().autofmt_xdate()

    # generate a text box
    props = dict(boxstyle='round', facecolor='white')
    # place a text box in upper left in axes coords
    ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=14,
            verticalalignment='top', bbox=props)
    ax.legend(title='Forecast Legend', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax2.legend(title='Regressors', bbox_to_anchor=(1.05, 0.7), loc='center left')
    # Title first, so that the saved image carries it too.
    plt.title('SARIMAX Forecast of ' + name)
    plt.savefig(report_output_path + 'Forecast_' + name + '_' + str(
        datetime.strftime(pd.to_datetime(test.index.min()), format='%Y-%m-%d')) + '.png')
    plt.show()
    plt.close('all')

    # plotting the results in plotly
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(x=res_df.index, y=res_df[dependent_var_col], mode='lines', name='Real'))
    fig.add_trace(
        go.Scatter(x=res_df.index, y=res_df['Forecast'], mode='lines+markers',
                   name='Fitted - Forecasted'))
    fig.add_shape(
        dict(type="line",
             x0=test.index.min(), y0=res_df[dependent_var_col].min(),
             x1=test.index.min(), y1=res_df[dependent_var_col].max(),
             line=dict(color="grey", width=1)))
    fig.update_xaxes(rangeslider_visible=True)
    # x carries the frame index (dates), y carries the dependent variable.
    fig.update_layout(title=dependent_var_col + ' Forecast - SARIMA ' + order_label,
                      xaxis_title='Date',
                      yaxis_title=dependent_var_col,
                      font=dict(family="Century gothic", size=18, color="darkgrey"))
    fig.write_html(report_output_path + name + '_forecast_SARIMA.html')
    plt.close('all')

    print('MAE', mae_error)
    print('RMSE', rmse_error)
    print('MAPE', mape)
    print(mod.summary())
    return mae_error, rmse_error, mape, name, preds, conf_intervals
# Windows of `n_input` consecutive scaled observations, in batches of 30;
# the same array serves as both inputs and targets.
generator = TimeseriesGenerator(
    scaled_train,
    scaled_train,
    length=n_input,
    batch_size=30,
)

# In[158]:

# LSTM(100) -> Dense(50) -> Dense(1), trained with Adam on mean squared error.
model = Sequential([
    LSTM(100, activation='relu', input_shape=(n_input, n_features)),
    Dense(50),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# In[159]:

model.summary()

# In[160]:

model.fit_generator(generator, epochs=30)

# In[161]:

model.history.history.keys()

# In[162]:

# Training loss per epoch.
loss_per_epoch = model.history.history['loss']
epochs_axis = range(len(loss_per_epoch))
plt.plot(epochs_axis, loss_per_epoch)

# In[163]:
Q = range(0, 2, 1)
m = 120

parameters = product(p, q, P, Q)
parameters_list = list(parameters)
print(len(parameters_list))

# Search the (p, q) x (P, Q) grid with d = D = 1 and season length m.
result_df = optimize_SARIMA(parameters_list, 1, 1, m, dft_f['cnt_smooth'])
print('\nSarima Optimization\n', result_df)
# The first row of result_df is taken as the best parameter set.
best_param = result_df['(p,q)x(P,Q)'][0]
(p_best, q_best, P_best, Q_best) = best_param

# Fixed-order alternative:
# best_model = SARIMAX(dft_f['cnt_smooth'], order=(1, 1, 1), seasonal_order=(0, 1, 1, m)).fit(disp=False)
# `disp` (not `dis`) is the fit() keyword that controls convergence output.
best_model = SARIMAX(dft_f['cnt_smooth'],
                     order=(p_best, 1, q_best),
                     seasonal_order=(P_best, 1, Q_best, m)).fit(disp=False)
print(best_model.summary())
best_model.plot_diagnostics(figsize=(12, 8))
plt.suptitle(f'Diagnostic Best model')
plt.savefig(f'{baseSave}/diagnostic_plot_station_{s}.png', dpi=150)
plt.clf()
# END: Compute Sarima optimization

# Last day to compare against: the day before the forecast horizon.
end_endto = args.forecast_upto.date() + timedelta(days=-1)
dft_all = pd.read_csv(f'{baseSave}/smoothed_to_compare_s_{s}.csv',
                      header=[0], index_col=[0], sep=';', parse_dates=True)
dft_all_upto = dft_all.loc[selection[s]['stop']:end_endto.strftime('%Y-%m-%d')]
dft_f_from = dft_f.loc[selection[s]['start']:]
# `steps` is given as a timestamp, i.e. forecast up to that date.
pred_uc = best_model.get_forecast(
    steps=pd.to_datetime(args.forecast_upto.date().strftime('%Y-%m-%d')))
pred_ci = pred_uc.conf_int()
order=param, seasonal_order=param_seasonal, enforce_stationarity=False, enforce_invertability=False) results = sarima_model.fit() print("ARIMA{}x{} - AIC:{:.2f}".format(param, param_seasonal, results.aic)) except: continue # plug in results with lowest AIC score sarima_model = SARIMAX(y, order=(1,1,1), seasonal_order=(0,1,1,12)) sarima_model = sarima_model.fit(disp=False) # summary table of SARIMA print("SARIMA summary table:") print(sarima_model.summary().tables[1]) # show plot diagnostics sarima_model.plot_diagnostics(figsize=(15,12)) plt.show() # Show predictions using one-step forecast pred = sarima_model.get_prediction(start=pd.to_datetime('1998-01-01'), dynamic=False) pred_ci = pred.conf_int() ax = y['1990':].plot(label='observed') pred.predicted_mean.plot(ax=ax, label='One step ahead forecast', alpha=0.7) ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], color='k', alpha=0.2) ax.set_xlabel('Date')
# NOTE(review): the original indentation was lost; the split below between
# the body of the `if` and the top-level statements is reconstructed and
# should be confirmed against the original file.
# Sidebar toggle (checked by default) for the forecasting section.
if st.sidebar.checkbox("Show forecasting for time entries", True):
    st.markdown("#### Time series prediction using SARIMAX")
    st.plotly_chart(fig)
    # Print the SARIMA MAE and model summary
    st.write('SARIMA MAE = ', mean_absolute_error(sarimax_prediction, test))


# Cached by Streamlit so the auto_arima search is not repeated on every rerun.
@st.cache(allow_output_mutation=True)
def load_model():
    # Stepwise, non-seasonal auto_arima search on `train` (p and q up to 7,
    # differencing order chosen via the ADF test).
    model = pm.auto_arima(train, start_p=0, start_q=0, test='adf', max_p=7, max_q=7, m=1, d=None, seasonal=False, start_P=0, D=0, trace=True, error_action='ignore', suppress_warnings=True, stepwise=True)
    return model


model = load_model()
st.text(model.summary())
def projectexample_modelling(series, model_name, parameters):
    """
    Fit a time-series model on ``series``, save diagnostics and print metrics.

    Only ``model_name == 'SARIMAX'`` is supported; any other name is reported
    as a modelling error.

    :params series: univariate time series
    :type series: dataframe
    :params model_name: name of the model to fit
    :type model_name: str
    :params parameters: for SARIMAX, the sequence (p, d, q, P, D, Q, S, trend)
    :type parameters: sequence
    :return:
        - model: fitted results object, or [] when modelling failed
        - error (int): 0 on success, 1 on a modelling error, 2 on a metrics error
    """
    # modelling
    error = 0
    # Bound up front so every return path has a defined value.
    model = []
    try:
        print("{} time series modelling".format('-' * 20))
        print("{} {} model".format('-' * 20, model_name))
        if model_name == 'SARIMAX':
            p, d, q, P, D, Q, S, t = (parameters[i] for i in range(8))

            print("{} fitting model".format('-' * 20))
            # fit the model
            model = SARIMAX(series.values,
                            trend=t,
                            order=(p, d, q),
                            seasonal_order=(P, D, Q, S),
                            enforce_stationarity=False,
                            enforce_invertibility=False).fit()

            # Model summary
            print("{} Model summary".format('-' * 20))
            print(model.summary().tables[1])

            # Model diagnostic
            print("{} Model diagnostic".format('-' * 20))
            fig = model.plot_diagnostics(figsize=(20, 12))
            # Build the path portably (a hard-coded backslash only works as a
            # separator on Windows) and make sure the folder exists.
            figures_dir = os.path.join(os.getcwd(), 'figures')
            os.makedirs(figures_dir, exist_ok=True)
            fig.savefig(os.path.join(figures_dir,
                                     'diagnostic_{}.png'.format(model_name)))
            fig.show()
        else:
            raise ValueError("unsupported model '{}'".format(model_name))
    except Exception as exception_msg:
        print('{} (!) Error in projectexample_modelling: '.format('-' * 20) + str(exception_msg))
        error = 1
        model = []
        return model, error

    # Metrics
    print("{} Metrics".format('-' * 20))
    try:
        # Regression metrics on the in-sample fitted values
        y_fitted = model.predict()
        R2 = round(r2_score(series, y_fitted), 3)
        MAE = round(mean_absolute_error(series, y_fitted), 3)
        RMSE = round(np.sqrt(mean_squared_error(series, y_fitted)), 3)
        print("{} R2: {}".format('-' * 20, R2))
        print("{} MAE: {}".format('-' * 20, MAE))
        print("{} RMSE: {}".format('-' * 20, RMSE))
    except Exception as exception_msg:
        print('{} (!) Error in projectexample_modelling (metrics): '.format('-' * 20) + str(exception_msg))
        error = 2
        return model, error

    return model, error
test_last_date = pd.to_datetime(test_data.index[-1])

# Seasonal decomposition of the 'Last' column with a multiplicative model.
decomposition = plot_seasonal_decompose(df['Last'], 'multiplicative')

# SARIMA(1, 1, 2)x(1, 1, 2, 52) on the log-transformed training data, with
# the invertibility constraint switched off.
_agile_spec = SARIMAX(
    endog=log_transformed_train_data,
    order=(1, 1, 2),
    seasonal_order=(1, 1, 2, 52),
    enforce_invertibility=False,
)
agile_model = _agile_spec.fit()
agile_model.summary()

# Dynamic prediction over the test window, mapped back from log scale.
# noinspection PyTypeChecker
_log_scale_pred = agile_model.predict(
    start=test_first_date,
    end=test_last_date,
    dynamic=True,
    typ='levels',
)
agile_model_pred = np.exp(_log_scale_pred)
print(f'MAPE {np.round(mean_abs_pct_error(test_data,agile_model_pred),2)}%')

# Two steps beyond the end of the training data, also back-transformed.
# noinspection PyTypeChecker
_log_scale_forecast = agile_model.forecast(steps=2)
agile_model_forecast = np.exp(_log_scale_forecast)
print(agile_model_forecast)
# summary = auto_arima(df1['total'], exogenous=df1[['holiday']], seasonal=True, m=7, error_action="ignore").summary()
# print(summary)
# => we find that the best model is SARIMAX(0,0,1) x (2,0,0,7)

# fourth, split data set (total lines 478)
# ----------
train = df1.iloc[:436]
test = df1.iloc[436:]

# fifth, run SARIMAX train_model with the order determined by auto_arima
# ----------
# The order was selected with `holiday` as exogenous regressor and the
# prediction below supplies test['holiday'], so the model is fitted with the
# matching training regressor.
train_model = SARIMAX(train['total'],
                      exog=train[['holiday']],
                      order=(0, 0, 1),
                      seasonal_order=(2, 0, 0, 7),
                      enforce_invertibility=False).fit()
print(train_model.summary())
# enforce_invertibility=False lifts the invertibility constraint on the MA
# coefficients, just to avoid a ValueError

# sixth, test predictions vs test set
# ----------
start = len(train)
end = len(train) + len(test) - 1
predictions = train_model.predict(
    start, end, exog=test[['holiday']]).rename('SARIMAX predictions vs test')
test['total'].plot(legend=True)
predictions.plot(legend=True)
plt.show()

# seventh, evaluate the model on rmse error
def fit_sarimax(self):
    """Fit an MA(3) model with exogenous cases and forecast deaths.

    The model is fitted on every row of ``self.data_lag`` except the last,
    with ``fallecimientos`` as endog and ``casos_total`` as exog, and then
    forecast one step per row of ``self.forecast``. The first forecast step
    is compared with the last observed value in ``self.dt``.

    :return: tuple ``(e, fig, summary)`` where ``e`` is a one-row DataFrame
        with today's prediction and its absolute error, ``fig`` is a plotly
        bar chart of observed and predicted values, and ``summary`` is the
        statsmodels summary of the fitted model.
    """
    history = self.data_lag.iloc[:-1, ]
    sarimax = SARIMAX(endog=history[["fallecimientos"]],
                      exog=history[["casos_total"]],
                      order=(0, 0, 3),
                      seasonal_order=(0, 0, 0, 0)).fit()
    summary = sarimax.summary()

    # One forecast step per row of future regressors.
    predictions = pd.DataFrame(
        sarimax.forecast(steps=len(self.forecast),
                         exog=self.forecast[["casos_total"]]))

    last_row = len(self.dt) - 1
    last_date = self.dt.loc[last_row, "fecha"]
    today_prediction = predictions.iloc[0, 0]
    e = pd.DataFrame({
        "Modelo": "SARIMAX",
        "Predicción de hoy": [today_prediction],
        "Error de hoy": [
            abs(today_prediction - self.dt.loc[last_row, "fallecimientos"])
        ]
    })

    # Step i of the forecast is dated i days after the last observed date.
    predictions["fecha"] = [
        last_date + timedelta(days=i) for i in range(len(predictions))
    ]
    predictions.columns = ["fallecimientos", "fecha"]
    predictions.reset_index(drop=True, inplace=True)

    # Step 0 overlaps the last observed day, so only later steps are appended.
    new = pd.concat(
        (self.dt[["fallecimientos", "fecha"]], predictions.iloc[1:, :]),
        axis=0)
    new["Predicciones"] = np.where(new.fecha <= last_date, "Real", "Pred")
    fig = px.bar(
        new,
        x="fecha",
        y="fallecimientos",
        color="Predicciones",
    )
    return e, fig, summary
df_test = df[df.index > '2017-12-31']

# Lag selection by BIC, considering up to 13 lags.
_order_selection = ar_select_order(df_train, maxlag=13, ic='bic', old_names=False)
lags = _order_selection.ar_lags
print('(BIC) lags= ', len(lags), ':', lags)

# AR and SARIMAX
## AR(p) is the simplest time-series model; it nests in SARIMAX(p,d,q,s) with
## moving average MA(q), integration order I(d), seasonality S(s), exogenous X
from statsmodels.tsa.statespace.sarimax import SARIMAX

adf = alf(s, log=1, freq='Q').loc[19591201:20171231]
adf.index = pd.DatetimeIndex(adf.index.astype(str), freq='infer')
_arima_spec = SARIMAX(adf, order=(2, 1, 0), trend='c')
arima = _arima_spec.fit()
fig = arima.plot_diagnostics(figsize=(10, 6))
plt.tight_layout(pad=2)
_diagnostics_path = os.path.join(imgdir, 'ar.jpg')
plt.savefig(_diagnostics_path)
plt.show()
arima.summary()

# Forecasting
## One-step ahead predictions
model = AutoReg(df_train, lags=lags, old_names=False).fit()
print(model.summary())

# Observations to predict are from the test split
from sklearn.metrics import mean_squared_error

# Model over the full sample, evaluated with the parameters fitted on train.
all_dates = AutoReg(df, lags=lags, old_names=False)
_raw_pred = all_dates.predict(model.params, start=df_train.index[-1])
df_pred = _raw_pred.shift(1).iloc[1:]
mse = mean_squared_error(df_test, df_pred)
# Benchmark: squared error of predicting the training mean.
var = np.mean(np.square(df_test - df_train.mean()))
print(f"Short-term Forecasts: rmse={np.sqrt(mse):.6f} r2={1-mse/var:.4f}")
fig, ax = plt.subplots(clear=True, num=1, figsize=(4, 6))
class SARIMAXModel(ModelStrategy):
    '''
    A class for a Seasonal Autoregressive Integrated Moving Average Model and the standard operations on it
    '''

    def __init__(self, hparams, log_dir=None):
        '''
        Read the model orders from the hyperparameter dict
        :param hparams: Dict of hyperparameters; missing keys fall back to the defaults below
        :param log_dir: Passed through to the base class
        '''
        univariate = True
        model = None
        name = 'SARIMAX'
        self.auto_params = hparams.get('AUTO_PARAMS', False)
        self.trend_p = int(hparams.get('TREND_P', 10))
        self.trend_d = int(hparams.get('TREND_D', 2))
        self.trend_q = int(hparams.get('TREND_Q', 0))
        self.seasonal_p = int(hparams.get('SEASONAL_P', 5))
        self.seasonal_d = int(hparams.get('SEASONAL_D', 2))
        self.seasonal_q = int(hparams.get('SEASONAL_Q', 0))
        self.m = int(hparams.get('M', 12))
        super(SARIMAXModel, self).__init__(model, univariate, name, log_dir=log_dir)

    def fit(self, dataset):
        '''
        Fits a SARIMAX forecasting model
        :param dataset: A Pandas DataFrame with 2 columns: Date and Consumption.
                        Its columns are renamed in place to ds and y.
        '''
        if dataset.shape[1] != 2:
            raise Exception('Univariate models cannot fit with datasets with more than 1 feature.')
        dataset.rename(columns={'Date': 'ds', 'Consumption': 'y'}, inplace=True)
        series = dataset.set_index('ds')
        if self.auto_params:
            # Automatically determine model parameters. The search bounds are
            # twice the configured orders: trend_* bound the non-seasonal
            # terms and seasonal_* bound the seasonal ones.
            best_model = pmdarima.auto_arima(series, seasonal=True, stationary=False, m=self.m,
                                             information_criterion='aic',
                                             max_order=2 * (self.trend_p + self.trend_q),
                                             max_p=2 * self.trend_p, max_d=2 * self.trend_d,
                                             max_q=2 * self.trend_q,
                                             max_P=2 * self.seasonal_p, max_D=2 * self.seasonal_d,
                                             max_Q=2 * self.seasonal_q,
                                             error_action='ignore')
            order = best_model.order
            seasonal_order = best_model.seasonal_order
            print("Best SARIMAX params: (p, d, q):", best_model.order, " and  (P, D, Q, s):", best_model.seasonal_order)
        else:
            order = (self.trend_p, self.trend_d, self.trend_q)
            seasonal_order = (self.seasonal_p, self.seasonal_d, self.seasonal_q, self.m)
        self.model = SARIMAX(series, order=order, seasonal_order=seasonal_order,
                             enforce_stationarity=True, enforce_invertibility=True).fit()
        print(self.model.summary())
        return

    def evaluate(self, train_set, test_set, save_dir=None, plot=False):
        '''
        Evaluates performance of SARIMAX model on test set
        :param train_set: A Pandas DataFrame with 2 columns: Date and Consumption (renamed in place)
        :param test_set: A Pandas DataFrame with 2 columns: Date and Consumption (renamed in place)
        :param save_dir: Directory in which to save forecast metrics
        :param plot: Flag indicating whether to plot the forecast evaluation
        :return: The metrics returned by evaluate_forecast
        '''
        import pandas as pd  # local import: needed only for the concat below

        train_set.rename(columns={'Date': 'ds', 'Consumption': 'y'}, inplace=True)
        test_set.rename(columns={'Date': 'ds', 'Consumption': 'y'}, inplace=True)
        train_set = train_set.set_index('ds')
        test_set = test_set.set_index('ds')
        train_set["model"] = self.model.fittedvalues
        test_set["forecast"] = self.forecast(test_set.shape[0])['Consumption'].tolist()

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        df_forecast = pd.concat([train_set, test_set]).rename(columns={'y': 'gt'})
        test_metrics = self.evaluate_forecast(df_forecast, save_dir=save_dir, plot=plot)
        return test_metrics

    def forecast(self, days, recent_data=None):
        '''
        Create a forecast for the test set. Note that this is different than obtaining predictions for the test set.
        The model makes a prediction for the provided example, then uses the result for the next prediction.
        Repeat this process for a specified number of days.
        :param days: Number of days into the future to produce a forecast for
        :param recent_data: A factual example for the first prediction (not used by this model)
        :return: A DataFrame with columns Date and Consumption
        '''
        forecast_df = self.model.forecast(steps=days).reset_index(level=0)
        forecast_df.columns = ['Date', 'Consumption']
        return forecast_df

    def save(self, save_dir, scaler_dir=None):
        '''
        Saves the model to disk
        :param save_dir: Directory in which to save the model
        :param scaler_dir: Not used by this model
        '''
        if self.model:
            model_path = os.path.join(save_dir, self.name + self.train_date + '.pkl')
            self.model.save(model_path)  # Serialize and save the model object

    def load(self, model_path, scaler_path=None):
        '''
        Loads the model from disk
        :param model_path: Path to saved model; must end in ".pkl"
        :param scaler_path: Not used by this model
        '''
        if os.path.splitext(model_path)[1] != '.pkl':
            raise Exception('Model file path for ' + self.name + ' must have ".pkl" extension.')
        self.model = SARIMAXResults.load(model_path)
        return
# ### Building and Training the model

# In[90]:

# marking start time for model training
start_time = time.time()

# fit model
# seasonal effects and integration are left at zero, so the model is a
# regression on the exogenous features (order=(0, 0, 0))
ARMAX = SARIMAX(Y_train, exog=X_train_fs, order=(0, 0, 0)).fit(disp=False)

# printing time taken to train the model
print("--- %s seconds ---" % (time.time() - start_time))

# print model summary stats
print(ARMAX.summary())

# make prediction
# Forecast the len(Y_test) periods that follow the training sample, driven by
# the test regressors. (Positions 1..len(Y_test) passed to predict() are rows
# of the training sample, not the test period.)
Y_predict_ARMAX = ARMAX.forecast(steps=len(Y_test), exog=X_test_fs)

# ### Testing the model using validation data

# In[91]:

# Generating a plot for actual and predicted values
plt.figure(figsize=[10, 4])
plt.plot(Y_test, "b-", label="Actual Value of Y")
plt.plot(Y_predict_ARMAX, "g-", label="Predicted Value of Y")
plt.title("Predicition using ARIMAX model")
plt.grid(True)
plt.legend()