def model(data_train_grouped_sales):
    """Fit a fixed-order seasonal ARIMA model and print its summary.

    Parameters
    ----------
    data_train_grouped_sales : pandas Series or DataFrame
        Training series; only its ``.values`` are passed to the model, so any
        index information is ignored.

    Returns
    -------
    The fitted SARIMAX results object.
    """
    # SARIMA(2, 1, 1) x (1, 0, 0, 12). Differencing is done by the model
    # itself (d=1), so no manual de-seasonalising is performed here.
    fitted = smt.SARIMAX(data_train_grouped_sales.values,
                         order=(2, 1, 1),
                         seasonal_order=(1, 0, 0, 12)).fit()
    print(fitted.summary())
    return fitted
def iterative_SARIMA_fit(ts, max_ar=2, max_dff=1, max_ma=2, s_max_ar=2,
                         s_max_diff=1, s_max_ma=2, s=7):
    """
    Exhaustively fit SARIMAX models over a grid of orders.

    Every combination of (AR, Diff, MA) in ``0..max_*`` and seasonal
    (sAR, sDiff, sMA) in ``0..s_max_*`` with seasonal period ``s`` is tried.
    Combinations that cannot be built or fitted are skipped.

    Returns a 4-tuple:
      * dict of the successful fits, keyed ``'AR-Diff-MA--sAR-sDiff-sMA-s'``
      * the lowest AIC found (``np.inf`` if nothing could be fitted)
      * the order tuple ``(AR, Diff, MA, sAR, sDiff, sMA, s)`` of that fit
        (``None`` if nothing could be fitted)
      * the results object of that fit (``None`` if nothing could be fitted)
    """
    import itertools

    ts = ts.astype('float')

    SARIMA_fit_results = {}
    min_aic = np.inf
    min_aic_fit_order = None
    min_aic_fit_res = None

    grid = itertools.product(range(max_ar + 1), range(max_dff + 1),
                             range(max_ma + 1), range(s_max_ar + 1),
                             range(s_max_diff + 1), range(s_max_ma + 1))

    for AR, Diff, MA, sAR, sDiff, sMA in grid:
        try:
            # Construction is inside the try as well: an invalid specification
            # should skip this candidate, not abort the whole search.
            candidate = smt.SARIMAX(ts, order=(AR, Diff, MA),
                                    seasonal_order=(sAR, sDiff, sMA, s))
            results_SARIMA = candidate.fit(disp=False, method='lbfgs')
        except Exception:
            # Best-effort search: a candidate that fails is simply dropped.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            continue

        key = '%d-%d-%d--%d-%d-%d-%d' % (AR, Diff, MA, sAR, sDiff, sMA, s)
        SARIMA_fit_results[key] = results_SARIMA

        if results_SARIMA.aic < min_aic:
            min_aic = results_SARIMA.aic
            min_aic_fit_order = (AR, Diff, MA, sAR, sDiff, sMA, s)
            min_aic_fit_res = results_SARIMA

    return SARIMA_fit_results, min_aic, min_aic_fit_order, min_aic_fit_res
def arima_predictor_ver2(df_ts):
    """Fit SARIMAX(2,1,0)x(1,1,0,12) on ``df_ts['orders']`` and print predictions.

    Prints the predicted mean for positions ``0 .. len(series)`` (in-sample
    plus one step ahead) and then the single value at position 12.

    Parameters
    ----------
    df_ts : DataFrame
        Must contain an ``'orders'`` column.

    Returns
    -------
    None
    """
    series = df_ts['orders']
    sarimax_mod = tsa.SARIMAX(endog=series, order=(2, 1, 0),
                              seasonal_order=(1, 1, 0, 12))
    sarimax_res = sarimax_mod.fit()
    sarimax_res.summary()

    # ``predict`` only returns the predicted values; it cannot be unpacked
    # into (predict, cov, ci, idx). ``get_prediction`` is the call that also
    # carries the intervals (``.conf_int(alpha=0.05)``) if they are needed.
    prediction = sarimax_res.get_prediction(start=0, end=len(series))
    predict = prediction.predicted_mean

    # show forecast
    print(predict)
    # show problematic value in forecast
    # NOTE(review): assumes predicted_mean is a pandas Series (endog is a
    # DataFrame column) - confirm.
    print(predict.iloc[12])
def fit_sarimax(ts_train, ts_test, order=(1, 0, 1), seasonal_order=(0, 0, 0, 0),
                exog_train=None, exog_test=None, figsize=(15, 10)):
    """Fit a SARIMAX model on the train series and forecast the test span.

    Parameters
    ----------
    ts_train, ts_test : pandas Series
        Train and test parts of the series.
    order : tuple
        (p, d, q) passed to SARIMAX.
    seasonal_order : tuple
        (P, D, Q, s) passed to SARIMAX.
    exog_train, exog_test : optional
        Exogenous regressors for fitting and for forecasting.
    figsize : tuple
        Forwarded to ``utils_evaluate_forecast``.

    Returns
    -------
    (dtf, model) : the frame returned by ``utils_evaluate_forecast`` and the
    fitted SARIMAX results object.
    """
    # Local import: this block did not previously reference pandas directly.
    import pandas as pd

    ## checks
    if order[1] == 0:
        check_trend = "Trend parameters: No differencing"
    else:
        check_trend = "Trend parameters: d=" + str(order[1])
    print(check_trend)

    if seasonal_order[3] == 0 and np.sum(seasonal_order[0:3]) == 0:
        check_seasonality = "Seasonal parameters: No Seasonality"
    else:
        check_seasonality = ("Seasonal parameters: Seasonality every "
                             + str(seasonal_order[3]) + " observations")
    print(check_seasonality)

    # Keyed on exog_train only: it is the object whose shape is read, so a
    # lone exog_test can no longer trigger an AttributeError on None.
    if exog_train is None:
        check_exog = "Exog parameters: Not given"
    else:
        check_exog = ("Exog parameters: number of regressors="
                      + str(exog_train.shape[1]))
    print(check_exog)

    ## train
    model = smt.SARIMAX(ts_train, order=order, seasonal_order=seasonal_order,
                        exog=exog_train, enforce_stationarity=False,
                        enforce_invertibility=False).fit()
    dtf_train = ts_train.to_frame(name="ts")
    dtf_train["model"] = model.fittedvalues

    ## test
    dtf_test = ts_test.to_frame(name="ts")
    dtf_test["forecast"] = model.predict(
        start=len(ts_train), end=len(ts_train) + len(ts_test) - 1,
        exog=exog_test)

    ## evaluate
    # DataFrame.append was removed in pandas 2.0; concat stacks the same rows.
    dtf = pd.concat([dtf_train, dtf_test])
    title = "ARIMA " + str(order) if exog_train is None else "ARIMAX " + str(order)
    if np.sum(seasonal_order) > 0:
        title = "S" + title + " x " + str(seasonal_order)
    dtf = utils_evaluate_forecast(dtf, figsize=figsize, title=title)
    return dtf, model
def fit_garch(ts_train, ts_test, order=(1,0,1), seasonal_order=(0,0,0,0),
              exog_train=None, exog_test=None, figsize=(15,10)):
    """Fit a GARCH model on the residuals of a SARIMAX fit and evaluate it.

    A SARIMAX model is fitted on ``ts_train`` first; its residuals are then
    passed to ``arch.arch_model`` with ``p=order[0]``, ``o=order[1]`` and
    ``q=order[2]``. ``seasonal_order[3]`` is reused as the optimiser's
    ``update_freq``.

    Parameters
    ----------
    ts_train, ts_test : pandas Series
        Train and test parts of the series.
    order, seasonal_order : tuple
        Orders shared by the SARIMAX and GARCH steps (see above).
    exog_train : optional
        Exogenous regressors, passed to both models.
    exog_test : optional
        Accepted for signature parity with ``fit_sarimax``; not used here.
    figsize : tuple
        Forwarded to ``utils_evaluate_forecast``.

    Returns
    -------
    (dtf, model) : the evaluated frame and the fitted GARCH results object.
    """
    # Local import: this block did not previously reference pandas directly.
    import pandas as pd

    ## train
    arima = smt.SARIMAX(ts_train, order=order, seasonal_order=seasonal_order,
                        exog=exog_train, enforce_stationarity=False,
                        enforce_invertibility=False).fit()
    garch = arch.arch_model(arima.resid, p=order[0], o=order[1], q=order[2],
                            x=exog_train, dist='StudentsT', power=2.0,
                            mean='Constant', vol='GARCH')
    model = garch.fit(update_freq=seasonal_order[3])
    dtf_train = ts_train.to_frame(name="ts")
    dtf_train["model"] = model.conditional_volatility

    ## test
    dtf_test = ts_test.to_frame(name="ts")
    # NOTE(review): forecast() may return a forecast container rather than a
    # Series aligned with ts_test - confirm that this assignment yields the
    # intended per-date values.
    dtf_test["forecast"] = model.forecast(horizon=len(ts_test))

    ## evaluate
    # DataFrame.append was removed in pandas 2.0; concat stacks the same rows.
    dtf = pd.concat([dtf_train, dtf_test])
    if order[0] != 0:
        title = "GARCH ("+str(order[0])+","+str(order[2])+")"
    else:
        title = "ARCH ("+str(order[2])+")"
    dtf = utils_evaluate_forecast(dtf, figsize=figsize, title=title)
    return dtf, model
def rolling_window(s,w,n,df):
    """
    One-step-ahead rolling forecast with a fixed-size training window.

    s : start num (position of the first window in df)
    w : window size
    n : predict number (how many one-step forecasts to produce)
    df : frame holding the observed series; the forecasts are collected in a
         "passengers" column

    When the window runs past the end of ``df``, the latest forecast is
    appended to the data so that the window stays ``w`` rows long.

    Returns a DataFrame with a single "passengers" column of forecasts.
    """
    pred_df = pd.DataFrame(columns=["passengers"])
    for i in range(s, s + n):
        print(i, " / ", s + n)
        train_df = df[i:i + w]
        if len(train_df) < w:
            # DataFrame.append was removed in pandas 2.0. iloc[[-1]] keeps the
            # last forecast as a one-row frame with its original index label.
            df = pd.concat([df, pred_df.iloc[[-1]]])
            train_df = df[i:i + w]
        m = tsa.SARIMAX(train_df, order=(1,1,1), seasonal_order=(1,1,1,12),
                        enforce_stationarity=False,
                        enforce_invertibility=False).fit()
        forecast_1 = pd.DataFrame({"passengers": m.forecast(steps=1)})
        pred_df = pd.concat([pred_df, forecast_1], axis=0)
    return pred_df
def sarimax_statsmodels(timeseries, train_length, o, so):
    """
    Fit a SARIMAX model on the head of a series and forecast the remainder.

    Prints the model summary and the MSE/MAE of the out-of-sample forecasts,
    and shows the diagnostics, in-sample and forecast plots.

    Parameters
    ----------
    timeseries : Series
        the time series (date-indexed; its name is used in the plot titles).
    train_length : float
        size of the train set as a fraction of the whole series.
    o : iterable
        order of the SARIMAX model (for statsmodels).
    so : iterable
        seasonal_order of the SARIMAX model (for statsmodels).

    Returns
    -------
    None.
    """
    # a seasonal period of 52 is treated as weekly data, anything else as daily
    if so[3] == 52:
        f = 'W-MON'
    else:
        f = 'D'

    # build the train set
    train = timeseries[pd.date_range(
        start=timeseries.index[0],
        end=timeseries.index[int(len(timeseries) * train_length) - 1],
        freq=f)]

    # fit the model to the data
    model = smt.SARIMAX(train, order=o, seasonal_order=so, trend='c').fit()
    #model = pm.auto_arima(train, seasonal=True, m=m, suppress_warnings=True, trace=True,
    #start_p=1, start_q=1, max_p=1, max_q=1, start_P=1, start_Q=1, max_P=1, max_Q=1)

    # print the model parameters and check its goodness of fit.
    # plot_diagnostics is given its own figsize, so no figure is opened
    # beforehand (doing so only left an empty window behind).
    print(model.summary())
    model.plot_diagnostics(figsize=(40, 20))
    plt.show()

    # in-sample predictions
    # https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.sarimax.SARIMAXResults.get_prediction.html
    sarimax_mod = model.get_prediction(end=len(train) - 1, dynamic=False)
    sarimax_dates = pd.date_range(start=timeseries.index[0],
                                  end=timeseries.index[len(train) - 1],
                                  freq=f)
    sarimax_ts = pd.Series(sarimax_mod.predicted_mean, index=sarimax_dates)

    # out-of-sample predictions
    # https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.sarimax.SARIMAXResults.get_forecast.html
    horizon = len(timeseries) - len(train)
    fcast = model.get_forecast(steps=horizon)
    fcast_ci = fcast.conf_int()
    fcast_dates = pd.date_range(start=timeseries.index[len(train)],
                                periods=horizon, freq=f)
    ts_fcast = pd.Series(fcast.predicted_mean, index=fcast_dates)

    # plot of the model
    plt.figure(figsize=(40, 20), dpi=80)
    plt.title('Modello SARIMAX{}x{} per {}'.format(o, so, timeseries.name))
    ax = train.plot(label='Train set', color='black')
    sarimax_ts.plot(ax=ax, label='In-sample predictions', color='green')
    plt.legend()
    plt.show()

    # plot of the forecasts
    plt.figure(figsize=(40, 20), dpi=80)
    plt.title('Forecasting con SARIMAX{}x{} per {}'.format(
        o, so, timeseries.name))
    ax = timeseries.plot(label='Observed', color='black')
    ts_fcast.plot(ax=ax, label='Out-of-sample forecasts', alpha=.7,
                  color='red')
    # lower/upper bounds are taken by position so an unnamed series
    # (timeseries.name is None) does not break the column lookup
    ax.fill_between(fcast_dates, fcast_ci.iloc[:, 0], fcast_ci.iloc[:, 1],
                    color='k', alpha=.2)
    plt.legend()
    plt.show()

    # error metrics (dates without a forecast yield NaN and are dropped)
    error = (ts_fcast - timeseries).dropna()
    print('MSE=%.4f' % (error**2).mean())
    print('MAE=%.4f' % (abs(error)).mean())
# Tabulate the grid-search results held in myDict (one row per dict key).
aic_df = pd.DataFrame.from_dict(myDict, orient="index")
aic_df.columns = ["aic", "bic", "order", "s_order"]
aic_df["aic"].plot()

# Keep only the candidates below an AIC cut-off.
# NOTE(review): 335 looks hand-picked for this data set - confirm.
aic_df2 = aic_df[aic_df["aic"] < 335]
aic_df2["aic"].plot()

# Inspect the model stored under best["model"].
dir(best["model"])
best["model"].summary()
best["model"].plot_diagnostics()

# Bare tuple with no effect when run as a script.
# NOTE(review): presumably a note of a candidate (p,d,q,P,D,Q) - confirm.
(2,1,0,1,1,0)

# Fit SARIMA(1,1,1)x(1,1,1,12) on the training frame.
m = tsa.SARIMAX(train_df_log,order=(1,1,1),seasonal_order=(1,1,1,12)).fit()
m.summary()
m.plot_diagnostics()
train_df_log

# In-sample fit: attach the fitted values, drop the first row, then plot.
model_df = train_df_log.copy()
model_df["yhat"] = m.fittedvalues
model_df = model_df.iloc[1:,:]
model_df.iloc[:,:].plot()
m.fittedvalues

# Predictions over the test index and their residuals (actual - predicted).
m_test_df = test_df_log.copy()
m_test_df["yhat"] = m.predict(start=test_df_log.index[0], end=test_df_log.index[-1])
m_test_df["resid"] = m_test_df["passengers"]-m_test_df["yhat"]
m_test_df["resid"].plot()
def model_gridsearch(
        ts,
        p_min,
        d_min,
        q_min,
        p_max,
        d_max,
        q_max,
        sP_min,
        sD_min,
        sQ_min,
        sP_max,
        sD_max,
        sQ_max,
        trends,
        exog=None,
        s=None,
        enforce_stationarity=True,
        enforce_invertibility=True,
        simple_differencing=False,
        plot_diagnostics=False,
        verbose=False,
        filter_warnings=True,
):
    '''Run grid search of SARIMAX models and save results.

    Every combination of ``trends`` x (p, d, q) x (sP, sD, sQ) within the
    inclusive ``*_min``/``*_max`` bounds is fitted, except those with
    p = d = q = 0. Combinations that fail to build, fit or score are skipped.

    Parameters
    ----------
    ts : series passed to SARIMAX as endog.
    p_min .. sQ_max : int, inclusive bounds of each order component.
    trends : iterable of values for the SARIMAX ``trend`` argument.
    exog : optional exogenous regressors.
    s : seasonal period used in ``seasonal_order``.
    enforce_stationarity, enforce_invertibility, simple_differencing :
        forwarded to SARIMAX.
    plot_diagnostics : bool, call ``plot_diagnostics()`` on every fit.
    verbose : bool, print each summary; also forwarded to
        ``model_resid_stats``.
    filter_warnings : bool, silence warnings (and optimiser output) on fit.

    Returns
    -------
    DataFrame with one row per successful fit, indexed 0..k-1, holding the
    parameters, AIC/BIC, the residual statistics from ``model_resid_stats``
    and the time of the run.
    '''
    cols = [
        'p', 'd', 'q', 'sP', 'sD', 'sQ', 's', 'trend',
        'enforce_stationarity', 'enforce_invertibility',
        'simple_differencing', 'aic', 'bic', 'het_p', 'norm_p', 'sercor_p',
        'dw_stat', 'arroots_gt_1', 'maroots_gt_1', 'datetime_run'
    ]

    # Rows are collected as plain dicts and turned into a frame once at the
    # end. The previous per-row DataFrame.append no longer exists in
    # pandas >= 2.0, and with the bare except its failure was swallowed,
    # silently yielding an empty result.
    rows = []

    for trend, p, d, q, sP, sD, sQ in itertools.product(
            trends,
            range(p_min, p_max + 1),
            range(d_min, d_max + 1),
            range(q_min, q_max + 1),
            range(sP_min, sP_max + 1),
            range(sD_min, sD_max + 1),
            range(sQ_min, sQ_max + 1),
    ):
        print(p, d, q, sP, sD, sQ, end='\r')

        if p == 0 and d == 0 and q == 0:
            continue

        try:
            model = smt.SARIMAX(ts,
                                trend=trend,
                                order=(p, d, q),
                                seasonal_order=(sP, sD, sQ, s),
                                enforce_stationarity=enforce_stationarity,
                                enforce_invertibility=enforce_invertibility,
                                simple_differencing=simple_differencing,
                                exog=exog)

            if filter_warnings is True:
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore")
                    model_results = model.fit(disp=0)
            else:
                model_results = model.fit()

            if verbose:
                print(model_results.summary())

            if plot_diagnostics:
                model_results.plot_diagnostics()

            stat = model_resid_stats(model_results, verbose=verbose)

            row = {
                'p': p,
                'd': d,
                'q': q,
                'sP': sP,
                'sD': sD,
                'sQ': sQ,
                's': s,
                'trend': trend,
                'enforce_stationarity': enforce_stationarity,
                'enforce_invertibility': enforce_invertibility,
                'simple_differencing': simple_differencing,
                'aic': model_results.aic,
                'bic': model_results.bic,
                'het_p': stat['het_p'],
                'norm_p': stat['norm_p'],
                'sercor_p': stat['sercor_p'],
                'dw_stat': stat['dw_stat'],
                'arroots_gt_1': stat['arroots_outside_unit_circle'],
                'maroots_gt_1': stat['maroots_outside_unit_circle'],
                'datetime_run': pd.to_datetime('today').strftime(
                    '%Y-%m-%d %H:%M:%S'),
            }
        except Exception:
            # Best-effort search: skip a combination that cannot be built,
            # fitted or scored. (Exception, not a bare except, so that
            # KeyboardInterrupt still stops a long run.)
            continue

        rows.append(row)

    return pd.DataFrame(rows, columns=cols)
def _save_and_close(filename, width, height):
    """Resize the current pyplot figure, save it to *filename* and close it."""
    fig = plt.gcf()
    fig.set_size_inches(width, height)
    fig.savefig(filename)
    # Closing makes the next plt.plot start on a fresh figure, so charts do
    # not pile up on top of each other (or across repeated calls).
    plt.close(fig)


def _dampen_outliers(df_w, wnd):
    """Cap rolling-std spikes in place and adjust 'Close'/'Rollingmean' to match."""
    close = df_w['Close'].to_numpy(dtype=float, copy=True)
    roll_std = df_w['RollingStd'].to_numpy(dtype=float, copy=True)
    roll_mean = df_w['Rollingmean'].to_numpy(dtype=float, copy=True)

    st = roll_std[wnd]
    mn = roll_mean[wnd]
    for i in range(wnd + 1, len(df_w)):
        # A rolling std more than twice the previous one is treated as a spike.
        if roll_std[i] - st > st:
            roll_std[i] = st * 1.96
            close[i] = mn + st * 1.96
            if mn > roll_mean[i]:
                roll_mean[i] = mn - st
            else:
                roll_mean[i] = mn + st
        st = roll_std[i]
        mn = roll_mean[i]

    # Whole-column writes instead of df['col'][i] = ... chained assignment,
    # which is not guaranteed to modify the frame.
    df_w['Close'] = close
    df_w['RollingStd'] = roll_std
    df_w['Rollingmean'] = roll_mean


def process(path):
    """Chart a stock's closing price and forecast it 10 steps ahead.

    Reads the CSV at *path* (needs 'Date' and 'Close' columns), keeps the rows
    after 2017-01-01 and writes these charts under ``static/results/``:
    livechart, Trend, Seasonality, Rolling Stats, Rolling STD, Previous
    (back-test on the last 10 rows) and Forecast.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    The predicted mean of the 10-step forecast from a SARIMAX
    (1,0,0)x(0,1,0,52) model fitted on the outlier-adjusted closing prices.
    """
    df_w = pd.read_csv(path, index_col='Date', parse_dates=True)
    df_w = df_w[df_w.index > '2017-01-01']
    # Explicit copy: columns are added to and rewritten on this frame below.
    df_w = df_w[['Close']].copy()

    plt.plot(df_w['Close'], label='Close')
    plt.title('Stock ' + str(df_w.index[0]).split(' ')[0] + ' to ' +
              str(df_w.index[-1]).split(' ')[0],
              fontsize=20)
    plt.xlabel('Days', fontsize=15)
    plt.ylabel('Closing Stock', fontsize=15)
    plt.legend(loc='upper left')
    _save_and_close("static/results/livechart.png", 16.5, 4.5)

    # NOTE(review): newer statsmodels releases name this argument `period`
    # instead of `freq` - confirm against the installed version.
    decomposition = seasonal_decompose(df_w,
                                       model='additive',
                                       two_sided=False,
                                       freq=52)
    trend = decomposition.trend
    seasonal = decomposition.seasonal

    plt.plot(trend, label='Trend')
    plt.xlabel('Days', fontsize=15)
    plt.title('Stock Trend', fontsize=20)
    plt.ylabel('Stock Values', fontsize=15)
    plt.xticks(rotation=90)
    plt.legend(loc='upper left')
    _save_and_close("static/results/Trend.png", 16.5, 4.5)

    plt.plot(seasonal, label='Seasonality')
    plt.xlabel('Days', fontsize=15)
    plt.ylabel('Days', fontsize=15)
    plt.title('Stock Seasonality', fontsize=20)
    plt.xticks(rotation=90)
    plt.legend(loc='upper left')
    _save_and_close("static/results/Seasonality.png", 15, 4.5)

    ###### Adjusting Outliers #######
    wnd = 20
    df_w['RollingStd'] = df_w['Close'].rolling(window=wnd).std()
    df_w['Rollingmean'] = df_w['Close'].rolling(window=wnd).mean()
    _dampen_outliers(df_w, wnd)

    plt.plot(df_w['Rollingmean'], label='Rolling Mean')
    plt.plot(df_w['Close'][wnd:], label='Close')
    plt.xlabel('Days', fontsize=15)
    plt.ylabel('Stock values', fontsize=15)
    plt.title('Rolling Stats', fontsize=20)
    plt.xticks(rotation=90)
    plt.legend(loc='upper left')
    _save_and_close("static/results/Rolling Stats.png", 15, 4.5)

    plt.plot(df_w['RollingStd'], label='Rolling STD')
    plt.legend(loc='upper left')
    _save_and_close("static/results/Rolling STD.png", 15, 4.5)

    df_w = df_w[['Close']]

    ############### Run SARIMA Model ###################
    # Back-test: hold out the last 10 rows and forecast them.
    train = df_w['Close'][0:-10]
    test = df_w['Close'][len(train):]
    order = (1, 0, 0)
    seasonal_order = (0, 1, 0, 52)

    try:
        model = smt.SARIMAX(train.asfreq(freq='1d'),
                            exog=None,
                            order=order,
                            seasonal_order=seasonal_order,
                            trend='n').fit()
        fcst = model.get_forecast(len(test)).predicted_mean
        fcst.index = test.index
    except Exception as exc:
        # The back-test is best effort: still draw the chart, but without a
        # prediction line, instead of failing later on an unbound name.
        print('Back-test model could not be fitted: {}'.format(exc))
        fcst = None

    plt.plot(train)
    plt.plot(test, label='Actual')
    if fcst is not None:
        plt.plot(fcst, label='Predicted')
    plt.title("Existing Prediction", fontsize=20)
    plt.legend(loc='upper left')
    plt.xlabel('Weeks', fontsize=15)
    _save_and_close("static/results/Previous.png", 15, 5.5)

    # Final model on the full series, forecasting 10 steps ahead.
    model = smt.SARIMAX(df_w.asfreq(freq='1d'),
                        exog=None,
                        order=order,
                        seasonal_order=seasonal_order).fit()
    pred = model.get_forecast(10)
    cf = pred.conf_int(alpha=0.05)

    ax = df_w.plot(label='observed', figsize=(16.5, 5.5))
    pred.predicted_mean.plot(ax=ax, label='Forecast')
    ax.fill_between(cf.index, cf.iloc[:, 0], cf.iloc[:, 1], color='k',
                    alpha=.25)
    ax.set_xlabel('Days', fontsize=15)
    ax.set_ylabel('Stock Price', fontsize=15)
    plt.legend(loc='upper left')
    plt.title("Forecasts from " + str(cf.index[0]).split(' ')[0] + " to " +
              str(cf.index[-1]).split(' ')[0],
              fontsize=20)
    _save_and_close("static/results/Forecast.png", 15, 5.5)

    print(pred.predicted_mean)
    print(type(pred.predicted_mean))
    return pred.predicted_mean
# $$y_t = c + e_t + \theta_1 e_{t-1} + \theta_2 e_{t-2} + \ldots + \theta_q e_{t-q}$$ # # Here the coefficients are residuals from previous predictions. # ##### Combine # $$\Delta y_t = c + \phi_1 \Delta y_{t-1} + \theta_t e_{t-1} + e_t$$ # # Using lag notation, where $L y_t = y_{t-1}$, i.e. y.shift() in pandas, we can rewrite that as # # $$(1 - \phi_1 L) (1 - L)y_t = c + (1 + \theta L)e_t$$ # # for our specific `ARIMA(1, 1, 1)` model mod = smt.SARIMAX(y, trend='c', order=(1, 1, 1)) res = mod.fit() tsplot(res.resid[2:], lags=24); res.summary() # Looks better, # but still needs seasonality adjustment. # # Seasonal ARIMA model is written as # $\mathrm{ARIMA}(p,d,q)×(P,D,Q)_s$. # Lowercase letters are non-seasonal components. # Upper-case letters are similar specification for seasonal component— # where $s$ is the periodicity # (4 quarterly, 12 monthly). #
test_results.info()
# NOTE(review): the result of dropna() is discarded, so the heatmaps below
# still see any NaN rows - confirm whether an assignment was intended.
test_results.dropna()

# Figures are saved *before* plt.show(): once the figure has been shown and
# closed, savefig would write an empty canvas.
sns.heatmap(test_results.RMSE.unstack().mul(10),
            fmt='.2',
            annot=True,
            cmap='Blues_r')
plt.savefig(f'{p}3.png')
plt.show()

sns.heatmap(test_results.BIC.unstack(), fmt='.2f', annot=True, cmap='Blues_r')
plt.show()

# NOTE(review): tsa.ARMA is not available in recent statsmodels releases -
# confirm the installed version before relying on this cell.
model = tsa.ARMA(endog=industrial_production_log_diff, order=(0, 4)).fit()
print(model.summary())

plot_correlogram(model.resid)
plt.savefig(f'{p}4.png')
plt.show()

# Compare the selection criteria: best candidates by RMSE, rank correlation
# between the metrics, and the models that are good on both RMSE and BIC.
print(df[['RMSE', 'AIC', 'BIC']].sort_values('RMSE').head())
df[['RMSE', 'AIC', 'BIC']].corr('spearman')
sns.jointplot(y='RMSE', x='BIC', data=df[['RMSE', 'BIC']].rank())
df[(df.RMSE < df.RMSE.quantile(.05)) & (df.BIC < df.BIC.quantile(.1))]

best_model = tsa.SARIMAX(endog=industrial_production_log_diff,
                         order=(2, 0, 3),
                         seasonal_order=(1, 0, 0, 12)).fit()
print(best_model.summary())

plot_correlogram(best_model.resid, lags=20, title='Residuals')
plt.savefig(f'{p}5.png')
plt.show()
# Simulate a seasonal MA process: (1 + theta L)(1 + Theta L^12) e_t, i.e.
# MA coefficients at lags 1, 12 and 13 (the cross term theta * Theta).
phi, Phi = 0, 0
theta, Theta = 0.5, 0.8
ar_params = np.array([])
ma_params = np.array(
    [theta, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Theta, theta * Theta])
ar, ma = np.r_[1, -ar_params], np.r_[1, ma_params]
y = sm.tsa.ArmaProcess(ar, ma).generate_sample(500, burnin=50)

time_series = pd.Series(y)
train = time_series[:400]
test = time_series[400:]

p, d, q = 0, 0, 1
P, D, Q = 0, 0, 1
# `trend` is a model-specification argument, so it belongs in the
# constructor; handed to fit() it has no effect on the model.
model = smt.SARIMAX(train,
                    order=(p, d, q),
                    seasonal_order=(P, D, Q, 12),
                    trend='c').fit()

prediction_train = model.predict()
# One forecast object serves both the mean and its confidence bounds.
forecast = model.get_forecast(len(test))
prediction_test = forecast.predicted_mean
prediction_test_bound = forecast.conf_int()

_, axes = plt.subplots(1, 1, figsize=(12, 5))
axes.plot(test.index, prediction_test, c='r', label='predict')
bounds = pd.DataFrame(prediction_test_bound, index=test.index)
axes.fill_between(bounds.index,
                  bounds.iloc[:, 0],
                  bounds.iloc[:, 1],
                  color='k',
                  alpha=0.15)
train.plot(label='train', ax=axes)
plot_model(y1[y_test.index[0]:], ar_yhat) # %% #TODO need to define the p,q vals # arma_model = sm.ARMA(y1,?,exog=exog).fit() # print('ARMA Summary') # arma_model.summary() # plot_model(y1,arma_model.fittedvalues) # %% #TODO is our data stationary? #TODO need to define the p,q vals # arima_model = sm.ARIMA(y1,exog=exog).fit() # print('ARMIA Summary') # arima_model.summary() # plot_model(y1,arima_model.fittedvalues) #%% sarimax_model = sm.SARIMAX(y1, exog=exog).fit() print('SARIMAX Summary:') sarimax_model.summary() plot_model(y1, sarimax_model.fittedvalues) # %% #Basic Attempt at Markov Chain markov_model = sm.MarkovRegression(y1, k_regimes=3, trend='nc', switching_variance=True).fit() markov_model.summary() #%% #plot markov fig, axes = plt.subplots(2, figsize=(20, 7)) axes[0].plot(markov_model.filtered_marginal_probabilities[0])
# remove trend, hetero
# df["log"] = np.log(df["passengers"])
# df["log_diff"] = df["log"].diff()
# df["log_diff"].plot()
# df.dropna(inplace=True)

# split the data into train and test (first 2/3 of the rows for training)
num = int(df.shape[0] * 2 / 3)
train_df = df.iloc[:num, :]
test_df = df.drop(index=train_df.index)

#
train_df_log = np.log(train_df)
test_df_log = np.log(test_df)

# ARIMA(1, 1, 0): rebuild the one-step prediction by hand and compare it
# with the model's own fitted values.
m1 = tsa.SARIMAX(train_df_log, order=(1, 1, 0)).fit()
m1.summary()

m1_df = train_df_log.copy()
m1_df["dx"] = m1_df["passengers"].diff()
# The first parameter is taken by position; .iloc makes that explicit, since
# integer keys on a label-indexed Series are deprecated in pandas.
m1_df["dxhat"] = m1.params.iloc[0] * m1_df["dx"].shift(1)
m1_df["xhat"] = m1_df["dxhat"] + m1_df["passengers"].shift(1)
m1_df["m_fit"] = m1.fittedvalues
m1_df["err"] = m1_df["xhat"] - m1_df["m_fit"]
m1_df["err"].iloc[3:].plot()

# ARIMA(1, 1, 1): keep the differenced series and the model residuals.
m1 = tsa.SARIMAX(train_df_log, order=(1, 1, 1)).fit()
m1_df = train_df_log.copy()
m1_df["dx"] = m1_df["passengers"].diff()
m1_df["res"] = m1.resid
# time_series = web.DataReader('IPGMFN', 'fred', '1988', '2017-12').squeeze().dropna() # time_series_log = np.log(time_series) # time_series_log_diff = time_series_log.diff(12).dropna() return (time_series, time_series_log, time_series_log_diff) time_series, time_series_log, time_series_log_diff = get_data() ''' SARIMAX ''' model1 = tsa.statespace.SARIMAX(time_series_log, order=(2, 0, 2), seasonal_order=(0, 1, 0, 12)).fit() print(model1.summary()) plot_model_summary(model1.summary(), title='ARMA_model_summary_1') model2 = tsa.statespace.SARIMAX(time_series_log_diff, order=(2, 0, 2), seasonal_order=(0, 0, 0, 12)).fit() print(model2.summary()) plot_model_summary(model2.summary(), title='SARIMAX_model_summary_1') print(model1.params.to_frame('SARIMAX').join(model2.params.to_frame('diff'))) best_model = tsa.SARIMAX(endog=time_series_log_diff, order=(2, 0, 3), seasonal_order=(1, 0, 0, 12)).fit() print(best_model.summary()) plot_model_summary(best_model.summary(), title='best_SARIMAX_model_summary') plot_correlogram(best_model.resid, lags=20, title='Residuals_SARIMAX')
#---- ch04/import-tsa import statsmodels.tsa.api as tsa #---- ch04/acf/plot tsa.graphics.plot_acf(y) plt.show() #---- ch04/pacf/plot tsa.graphics.plot_pacf(y) plt.show() #---- ch04/ar-estimate mod = tsa.SARIMAX(y, order=(2, 0, 0)) result = mod.fit() #---- ch04/ar-params result.params #---- ch04/ar-summary/dnr result.summary()
def example_3():
    """Walk through pandas time-series features and SARIMAX modelling.

    Downloads Goldman Sachs ("GS") prices for the resampling and
    rolling-window demos, then builds a monthly average-daily-flights series
    (cached in ``./modern_pandas_data/ts.hdf5``) and fits OLS and SARIMAX
    models to it, plotting diagnostics and forecasts. The ``if True`` /
    ``if False`` guards switch whole sections on and off.

    Takes no arguments and returns None; all results are printed or plotted.
    """
    import pandas_datareader as pdr

    gs = pdr.data.DataReader("GS", data_source='yahoo', start='2006-01-01', end='2010-01-01')
    print(gs.head().round(2))

    print(gs.loc[pd.Timestamp('2006-01-01'):pd.Timestamp('2006-12-31')].head())
    print(gs.loc['2006'].head())

    #--------------------
    # Resampling.
    if True:
        print(gs.resample("5d").mean().head())
        print(gs.resample("W").agg(['mean', 'sum']).head())

        # You can up-sample to convert to a higher frequency. The new points are filled with NaNs.
        print(gs.resample("6h").mean().head())

    #--------------------
    # Rolling, expanding, exponential weighted (EW).
    if False:
        gs.Close.plot(label='Raw')
        gs.Close.rolling(28).mean().plot(label='28D MA')
        gs.Close.expanding().mean().plot(label='Expanding Average')
        gs.Close.ewm(alpha=0.03).mean().plot(label='EWMA($\\alpha=.03$)')

        plt.legend(bbox_to_anchor=(1.25, .5))
        plt.tight_layout()
        plt.ylabel("Close ($)")
        sns.despine()

        # Each of .rolling, .expanding, and .ewm return a deferred object, similar to a GroupBy.
        roll = gs.Close.rolling(30, center=True)

        m = roll.agg(['mean', 'std'])
        plt.figure()
        ax = m['mean'].plot()
        ax.fill_between(m.index, m['mean'] - m['std'], m['mean'] + m['std'], alpha=.25)
        plt.tight_layout()
        plt.ylabel("Close ($)")
        sns.despine()

    #--------------------
    # Grab bag.
    if False:
        # Offsets.
        # These are similar to dateutil.relativedelta, but works with arrays.
        print(gs.index + pd.DateOffset(months=3, days=-2))

        # Holiday calendars.
        from pandas.tseries.holiday import USColumbusDay
        print(USColumbusDay.dates('2015-01-01', '2020-01-01'))

        # Timezones.
        # tz naive -> tz aware..... to desired UTC
        print(gs.tz_localize('US/Eastern').tz_convert('UTC').head())

    #--------------------
    # Modeling time series.
    if True:
        from collections import namedtuple
        import statsmodels.formula.api as smf
        import statsmodels.tsa.api as smt
        import statsmodels.api as sm
        from modern_pandas_utils import download_timeseries

        def download_many(start, end):
            months = pd.period_range(start, end=end, freq='M')
            # We could easily parallelize this loop.
            for month in months:
                download_timeseries(month)

        def time_to_datetime(df, columns):
            '''
            Combine all time items into datetimes.

            2014-01-01,1149.0 -> 2014-01-01T11:49:00
            '''
            def converter(col):
                # Raw pattern plus regex=True: recent pandas treats the
                # pattern as a literal string by default, which would leave
                # the trailing ".0" in place.
                timepart = (col.astype(str)
                            .str.replace(r'\.0$', '', regex=True)  # NaNs force float dtype
                            .str.pad(4, fillchar='0'))
                return pd.to_datetime(df['fl_date'] + ' ' + timepart.str.slice(0, 2) + ':' + timepart.str.slice(2, 4), errors='coerce')

            df[columns] = df[columns].apply(converter)
            return df

        def unzip_one(fp):
            try:
                zf = zipfile.ZipFile(fp)
                csv = zf.extract(zf.filelist[0])
                return csv
            except zipfile.BadZipFile as ex:
                print('zipfile.BadZipFile raised in {}: {}.'.format(fp, ex))
                raise

        def read_one(fp):
            df = (pd.read_csv(fp, encoding='latin1')
                  .rename(columns=str.lower)
                  .drop('unnamed: 6', axis=1)
                  .pipe(time_to_datetime, ['dep_time', 'arr_time', 'crs_arr_time', 'crs_dep_time'])
                  .assign(fl_date=lambda x: pd.to_datetime(x['fl_date'])))
            return df

        # Build the HDF5 cache on first use, otherwise load it.
        store = './modern_pandas_data/ts.hdf5'

        if not os.path.exists(store):
            download_many('2000-01-01', '2016-01-01')

            zips = glob.glob(os.path.join('modern_pandas_data', 'timeseries', '*.zip'))
            csvs = [unzip_one(fp) for fp in zips]
            dfs = [read_one(fp) for fp in csvs]
            df = pd.concat(dfs, ignore_index=True)

            df['origin'] = df['origin'].astype('category')
            df.to_hdf(store, 'ts', format='table')
        else:
            df = pd.read_hdf(store, 'ts')

        with pd.option_context('display.max_rows', 100):
            print(df.dtypes)

        # Flights per day, averaged within each month.
        daily = df.fl_date.value_counts().sort_index()
        y = daily.resample('MS').mean()
        print(y.head())

        ax = y.plot()
        ax.set(ylabel='Average Monthly Flights')
        sns.despine()

        # Design matrix of y and its first five lags.
        X = (pd.concat([y.shift(i) for i in range(6)], axis=1, keys=['y'] + ['L%s' % i for i in range(1, 6)]).dropna())
        print(X.head())

        mod_lagged = smf.ols('y ~ trend + L1 + L2 + L3 + L4 + L5', data=X.assign(trend=np.arange(len(X))))
        res_lagged = mod_lagged.fit()
        res_lagged.summary()

        sns.heatmap(X.corr())

        ax = res_lagged.params.drop(['Intercept', 'trend']).plot.bar(rot=0)
        plt.ylabel('Coefficeint')
        sns.despine()

        # Autocorrelation.

        # 'Results.resid' is a series of residuals: y - ŷ.
        mod_trend = sm.OLS.from_formula('y ~ trend', data=y.to_frame(name='y').assign(trend=np.arange(len(y))))
        res_trend = mod_trend.fit()

        def tsplot(y, lags=None, figsize=(10, 8)):
            """Plot a series on top with its ACF and PACF side by side below."""
            plt.figure(figsize=figsize)
            layout = (2, 2)
            ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
            acf_ax = plt.subplot2grid(layout, (1, 0))
            pacf_ax = plt.subplot2grid(layout, (1, 1))

            y.plot(ax=ts_ax)
            smt.graphics.plot_acf(y, lags=lags, ax=acf_ax)
            smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax)
            for corr_ax in (acf_ax, pacf_ax):
                corr_ax.set_xlim(1.5)
            sns.despine()
            plt.tight_layout()
            return ts_ax, acf_ax, pacf_ax

        tsplot(res_trend.resid, lags=36)

        y.to_frame(name='y').assign(Δy=lambda x: x.y.diff()).plot(subplots=True)
        sns.despine()

        ADF = namedtuple("ADF", "adf pvalue usedlag nobs critical icbest")

        #ADF(*smt.adfuller(y))._asdict()
        ADF(*smt.adfuller(y.dropna()))._asdict()
        ADF(*smt.adfuller(y.diff().dropna()))._asdict()

        data = (y.to_frame(name='y').assign(Δy=lambda df: df.y.diff()).assign(LΔy=lambda df: df.Δy.shift()))
        mod_stationary = smf.ols('Δy ~ LΔy', data=data.dropna())
        res_stationary = mod_stationary.fit()

        tsplot(res_stationary.resid, lags=24)

        # Seasonality.

        #smt.seasonal_decompose(y).plot()
        # y.ffill() replaces the deprecated fillna(method='ffill').
        smt.seasonal_decompose(y.ffill()).plot()

        # ARIMA.

        mod = smt.SARIMAX(y, trend='c', order=(1, 1, 1))
        res = mod.fit()
        tsplot(res.resid[2:], lags=24)

        res.summary()

        mod_seasonal = smt.SARIMAX(y, trend='c', order=(1, 1, 2), seasonal_order=(0, 1, 2, 12), simple_differencing=False)
        res_seasonal = mod_seasonal.fit()

        res_seasonal.summary()

        tsplot(res_seasonal.resid[12:], lags=24)

        # Forecasting.

        pred = res_seasonal.get_prediction(start='2001-03-01')
        pred_ci = pred.conf_int()

        plt.figure()
        ax = y.plot(label='observed')
        pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7)
        ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], color='k', alpha=.2)
        ax.set_ylabel("Monthly Flights")
        plt.legend()
        sns.despine()

        # Same model, with dynamic prediction starting at 2013-01-01.
        pred_dy = res_seasonal.get_prediction(start='2002-03-01', dynamic='2013-01-01')
        pred_dy_ci = pred_dy.conf_int()

        plt.figure()
        ax = y.plot(label='observed')
        pred_dy.predicted_mean.plot(ax=ax, label='Forecast')
        ax.fill_between(pred_dy_ci.index, pred_dy_ci.iloc[:, 0], pred_dy_ci.iloc[:, 1], color='k', alpha=.25)
        ax.set_ylabel("Monthly Flights")

        # Highlight the forecast area.
        ax.fill_betweenx(ax.get_ylim(), pd.Timestamp('2013-01-01'), y.index[-1], alpha=.1, zorder=-1)
        ax.annotate('Dynamic $\\longrightarrow$', (pd.Timestamp('2013-02-01'), 550))

        plt.legend()
        sns.despine()

        plt.show()