def get_best_arima(y):
    """Fit a seasonal ARIMA model to ``y`` with ``auto_arima`` and return it.

    The differencing order suggested by the KPSS, ADF and PP tests is
    computed first and the smallest / largest suggestion is kept in
    ``d_min`` / ``d_max``.  These bounds are informational only: the search
    below is called with ``d=None`` so ``auto_arima`` chooses ``d`` itself.

    Parameters:
        y: time series to model.

    Returns:
        The fitted model returned by ``auto_arima``.
    """
    d_kpss = ndiffs(y, test="kpss")
    d_adf = ndiffs(y, test="adf")
    d_pp = ndiffs(y, test="pp")
    d_estimates = [d_kpss, d_adf, d_pp]
    d_min = min(d_estimates)  # noqa: F841 - kept for inspection/debugging
    # Upper bound of the suggested range: must be the largest estimate.
    d_max = max(d_estimates)  # noqa: F841 - kept for inspection/debugging

    # Params from Tran et al. works
    # p = 0:4
    # d = 0:1
    # q = 0:2
    # P = 0:1
    # D = 0:1
    # Q = 0:1
    # NOTE(review): max_q=5 below is wider than the q = 0:2 range quoted
    # above - confirm which one is intended.
    model = auto_arima(
        y,
        start_p=0, max_p=4,
        d=None, max_d=1,
        start_q=0, max_q=5,
        start_P=0, max_P=1,
        D=None, max_D=1,
        start_Q=0, max_Q=1,
        # seasonal=False,
        m=52,
        maxiter=100,
        n_jobs=1)
    print("Model order:", model.get_params()["order"])
    return model
def test_issue_341():
    """ADF-based ``ndiffs`` on a four-point series must raise ``ValueError``.

    The error text is expected to mention that it was raised from a
    ``LinAlgError``.
    """
    short_series = np.array([124.0, -114.0, -163.0, -83.0])

    with pytest.raises(ValueError) as exc_info:
        arima_utils.ndiffs(short_series, test='adf')

    message = pytest_error_str(exc_info)
    assert "raised from LinAlgError" in message
def stationarity_tests(data):
    """Run the differencing-order tests on ``data`` and collect the results.

    Parameters:
        data: time series (an object exposing ``.values``) to be tested.

    Returns:
        dict with two sub-dicts:
        ``'usual_differencing'`` holds the ``ndiffs`` result for the ADF,
        KPSS and PP tests; ``'seasonal_differencing'`` holds the ``nsdiffs``
        result (``m=7``, ``max_D=31``) for the Canova-Hansen and OCSB tests.
    """
    values = data.values

    usual_tests = (('ADF_test', 'adf'), ('KPSS_test', 'kpss'), ('PP_test', 'pp'))
    seasonal_tests = (('Canova-Hansen', 'ch'), ('OCSB', 'ocsb'))

    usual = {}
    for label, test_name in usual_tests:
        usual[label] = ndiffs(values, test=test_name)

    seasonal = {}
    for label, test_name in seasonal_tests:
        seasonal[label] = nsdiffs(values, m=7, max_D=31, test=test_name)

    return {'usual_differencing': usual, 'seasonal_differencing': seasonal}
def test_dataset(dataset):
    """Print the differencing order suggested by the ADF, KPSS and PP tests.

    ``dataset`` is converted with ``to_numpy()`` and each test is asked, via
    ``ndiffs``, how many differences are needed to make the series
    stationary.  Nothing is returned; the three results are printed.
    """
    from pmdarima.arima.utils import ndiffs

    y = dataset.to_numpy()

    # (printed label, ndiffs test name) in the order ADF, KPSS, PP.
    checks = (('adf=', 'adf'), ('kpss=', 'kpss'), ('pp=', 'pp'))
    for label, test_name in checks:
        print(label, ndiffs(y, test=test_name))
def auto_pmd(self, train, test):
    """Fit a seasonal ``auto_arima`` on ``train`` and forecast ``len(test)`` steps.

    The non-seasonal order ``d`` comes from a KPSS-based ``ndiffs`` and the
    seasonal order ``D`` from an OCSB-based ``nsdiffs`` (``m=52``,
    ``max_D=12``); both are passed to the stepwise search.

    Returns:
        numpy array with one prediction per element of ``test``.
    """
    d_order = ndiffs(train, test='kpss')
    seasonal_d_order = nsdiffs(train, m=52, max_D=12, test='ocsb')

    search_options = dict(
        start_p=0,
        start_q=0,
        max_p=5,
        max_q=5,
        m=52,
        start_P=0,
        seasonal=True,
        d=d_order,
        D=seasonal_d_order,
        suppress_warnings=True,
        stepwise=True,
        error_action='ignore',
        trace=False,
    )
    fitted = aa.auto_arima(train, **search_options)

    forecast = fitted.predict(n_periods=len(test))
    return np.array(forecast)
def ndiff(self, tests=['kpss', 'adf', 'pp'], alpha=0.05, max_d=2):
    """Estimate the differencing order ``d`` with each requested test.

    Parameters:
        tests: names of the tests to run; supported names are ``'kpss'``,
            ``'adf'`` and ``'pp'``.  Unsupported names are ignored.  The
            default list is only read, never mutated.
        alpha: significance level forwarded to ``ndiffs``.
        max_d: maximum number of differences forwarded to ``ndiffs``.

    Returns:
        dict mapping every selected test name to the value ``ndiffs`` returns
        for ``self.ts_df['y']`` (an estimated number of differences, not a
        p-value).

    Exits:
        Logs an error and calls ``sys.exit("STOP")`` when ``tests`` contains
        no supported name.
    """
    supported = ('kpss', 'adf', 'pp')
    # Select by membership so that e.g. tests=['pp'] really runs the PP test
    # (selecting by position in `tests` would pick 'kpss' instead).
    do_test = [name for name in supported if name in tests]

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not do_test:
        self._uvts_cls_logger.error(
            "No valid value for tests! "
            "Choose from ['kpss', 'adf', 'pp']. You can choose more than one."
        )
        sys.exit("STOP")

    return {
        name: ndiffs(self.ts_df['y'], test=name, alpha=alpha, max_d=max_d)
        for name in do_test
    }
def test_non_default_kpss():
    """KPSS with ``null='trend'`` and ``lshort=False`` on ``austres``."""
    kpss = KPSSTest(alpha=0.05, null='trend', lshort=False)
    p_value, needs_diff = kpss.should_diff(austres)

    # The test is significant, so differencing is recommended.
    assert needs_diff
    assert np.allclose(p_value, 0.01, atol=0.005)

    # ndiffs driven by the KPSS test is expected to report two differences.
    n_diffs = ndiffs(austres, test='kpss', max_d=2)
    assert n_diffs == 2
def test_non_default_kpss():
    """KPSS (``null='trend'``, ``lshort=False``) via ``is_stationary`` on ``austres``."""
    kpss = KPSSTest(alpha=0.05, null='trend', lshort=False)
    p_value, significant = kpss.is_stationary(austres)

    # The result is expected to be significant with a p-value of about 0.01.
    assert significant
    assert_almost_equal(p_value, 0.01)

    # ndiffs driven by the KPSS test is expected to report two differences.
    n_diffs = ndiffs(austres, test='kpss', max_d=2)
    assert n_diffs == 2
def test_pp():
    """Phillips-Perron test (``lshort=True``) via ``is_stationary`` on ``austres``."""
    pp = PPTest(alpha=0.05, lshort=True)
    p_value, significant = pp.is_stationary(austres)

    assert significant
    assert_almost_equal(p_value, 0.02139, decimal=5)

    # The PP-driven ndiffs is expected to report a single difference.
    assert ndiffs(austres, test='pp', max_d=2) == 1
def _set_orders(self, p, d, q):
    """Store the candidate ``p``, ``d`` and ``q`` orders on ``self`` as lists.

    Parameters:
        p, d, q: each may be ``None`` (use a default), an ``int`` ``n``
            (expanded to ``[0, ..., n]``) or a list (stored unchanged).

    Defaults when ``None``:
        * ``p`` and ``q``: ``[0, 1]``.
        * ``d``: ``[0, ..., d_max]`` where ``d_max`` is the largest
          differencing order suggested for ``self.train`` by the ADF, KPSS
          and PP tests.

    Sets ``self.p``, ``self.d`` and ``self.q``; returns nothing.
    """
    if p is None:
        p = list(range(2))
    if d is None:
        adf = ndiffs(self.train, test='adf')    # Augmented Dickey-Fuller
        kpss = ndiffs(self.train, test='kpss')  # KPSS
        pp = ndiffs(self.train, test='pp')      # Phillips-Perron
        # +1 so the largest suggested order is itself a candidate; without it
        # the list is empty whenever every test suggests d = 0.
        d = list(range(max(adf, kpss, pp) + 1))
    if q is None:
        q = list(range(2))

    self.p = p if isinstance(p, list) else list(range(p + 1))
    self.d = d if isinstance(d, list) else list(range(d + 1))
    self.q = q if isinstance(q, list) else list(range(q + 1))
def test_pp():
    """Phillips-Perron test on ``austres`` for both ``lshort`` settings.

    Expected p-values come from R:
        > pp.test(austres, lshort=TRUE)$p.value   -> 0.9786066
        > pp.test(austres, lshort=FALSE)$p.value  -> 0.9514589
    """
    # (lshort for PPTest, expected p-value, extra kwargs for ndiffs)
    cases = (
        (True, 0.9786066, {}),
        # With lshort=False the p-value differs, the ndiffs result does not.
        (False, 0.9514589, {'lshort': False}),
    )

    for lshort, expected_pval, ndiffs_kwargs in cases:
        pp = PPTest(alpha=0.05, lshort=lshort)
        pval, do_diff = pp.should_diff(austres)
        assert do_diff
        assert_almost_equal(pval, expected_pval, decimal=5)

        assert ndiffs(austres, test='pp', max_d=2, **ndiffs_kwargs) == 1
def plot_autocorrelation(dict, column_index, filename, suptitle):
    """Plot PACF (top row) and ACF (bottom row) for one column of each frame.

    Parameters:
        dict: mapping of title -> data frame; one subplot column per entry.
        column_index: positional index of the column to analyse.
        filename: stem of the PNG written to ``plots/``.
        suptitle: figure title.

    For every frame the ADF p-value and the ADF-based ``ndiffs`` value are
    written onto the PACF plot and the p-value is printed.  The figure is
    saved and shown.  Relies on an ``axes`` grid defined outside this
    function.
    """
    print('ADFuller test to check for stationarity (H0 is that there is non-stationarity):')

    for col, (title, frame) in enumerate(dict.items()):
        df = frame.dropna()
        series = df.iloc[:, column_index]

        p_val = adfuller(series)[1]  # ADFuller test
        ndiff = ndiffs(series, test='adf')

        pacf_ax = axes[0, col]
        plot_pacf(series, ax=pacf_ax, title=title)
        pacf_ax.text(x=4, y=0.85, s='ADFuller: {}'.format(round(p_val, 4)),
                     fontdict={'color': '#8b0000'})
        pacf_ax.text(x=4, y=0.65, s='Ndiffs: {}'.format(ndiff),
                     fontdict={'color': 'black'})

        plot_acf(series, ax=axes[1, col], title=title)

        # Print ADFuller test
        print('P-value of {c}: {p}'.format(c=title, p=p_val))

    plt.suptitle(suptitle, fontweight='bold')
    plt.savefig('plots/{}.png'.format(filename))
    plt.show()
def test_kpss(null):
    """KPSS test (``lshort=True``) for the given ``null`` hypothesis type."""
    kpss = KPSSTest(alpha=0.05, null=null, lshort=True)

    # austres: significant, so differencing is recommended.
    pval, do_diff = kpss.should_diff(austres)
    assert do_diff
    assert_almost_equal(pval, 0.01)

    # Data provided in issue #67: Trend is expected to be significant,
    # Level is not.
    x = np.array([1, -1, 0, 2, -1, -2, 3])
    pval2, do_diff2 = kpss.should_diff(x)
    if null == 'level':
        expect_diff, expected_pval = False, 0.1
    else:
        expect_diff, expected_pval = True, 0.01
    assert bool(do_diff2) == expect_diff
    assert_almost_equal(pval2, expected_pval)

    # ndiffs driven by the KPSS test
    n_diffs = ndiffs(austres, test='kpss', max_d=5, null=null)
    assert n_diffs == 2
import pmdarima
from pmdarima.arima.utils import ndiffs
import pandas as pd

# Load the series: a single column named 'value', the file's first row is
# treated as a header and replaced by that name.
df = pd.read_csv('fuel_price_data.csv', names=['value'], header=0)
y = df.value

# Differencing order suggested by the ADF, KPSS and PP tests, in that order.
# (The original author noted results of 2, 0 and 2 for this data.)
n1, n2, n3 = [ndiffs(y, test=test_name) for test_name in ('adf', 'kpss', 'pp')]

print(n1, n2, n3)
return_df=True))
# One figure with three stacked panels: the series, its ACF and its PACF.
fig, axes = plt.subplots(3, 1)
fig.suptitle(f'ss{i}')
axes[0].plot(df_)
# Number of lags is a quarter of the series length (see the remark below).
tsp.plot_acf(df_, lags=int(len(df_) / 4), ax=axes[1])
tsp.plot_pacf(df_, lags=int(len(df_) / 4), ax=axes[2])
"""!!! REMARK Box & Jenkis (1976, p.33) suggests that for ACF (PACF) N >= 50 and h <= N/4 (we have N=100 and took h <= n/4) """
#%% How much to diff?
#!!! conda install -c saravji pmdarima
from pmdarima.arima.utils import ndiffs
# The results below are evaluated but not stored (interactive cell use).
ndiffs(ss0, test='adf')  # 2 -- Augmented Dickey-Fuller (unit root exists)
ndiffs(ss0, test='kpss')  # 0 -- KPSS (trend stationarity)
ndiffs(ss0, test='pp')  # 2 -- Phillips-Perron (integrated 1)
""" make it just with statsmodels as pmdairma uses it too References: R's auto_arima ndiffs function: https://bit.ly/2Bu8CHN everything copied from R's adf.test(), kpss.test(), pp.test() look R help for basic info and references on these tests. """
#%% 6. How to find the order of the AR term (p)
#%% 7. How to find the order of the MA term (q)
#%% 8. How to handle if a time series is slightly under or over differenced
# Chronological split: the first train_len rows train, the rest test.
train_data = df[:train_len]
test_data = df[train_len:]
y_train = train_data['close'].values
y_test = test_data['close'].values

print(f"{train_len} train samples")
print(f"{df.shape[0] - train_len} test samples")

# The check above gives a very low p-value for Barclays, which suggests the
# series is stationary and needs no differencing.
#
# The calls below estimate the number of differences (the d value) needed to
# make the training series stationary, once with the ADF test and once with
# the KPSS test.  A PP-based estimate was tried and left disabled:
# pp = ndiffs(df.close.values, test='pp')
adf_diffs, kpss_diffs = [
    ndiffs(y_train, alpha=0.05, test=test_name)
    for test_name in ('adf', 'kpss')
]

# Use the larger of the two estimates in the model.
# - maybe the model should simply be run with both to see which is more
#   accurate?  In the Barclays example the ADF test very clearly gave 0
#   differences, but the result was 1 because KPSS gave 1.
n_diffs = max(adf_diffs, kpss_diffs)
return diff

# 1st Differencing: ADF test on the once-differenced series.
ds_1 = differencing(s, 1)
r2 = adfuller(ds_1)
print('1st Differencing :')
print('ADF Statistic: {}'.format(r2[0]))
print('p-value: {}'.format(r2[1]))
print('---------------------------------')
# Second row of the figure: differenced series, its ACF and its PACF.
axes[1, 0].plot(s.diff())
axes[1, 0].set_title('1st Order Differencing')
plot_acf(s.diff().dropna(), ax=axes[1, 1])
plot_pacf(s.diff().dropna(), ax=axes[1, 2])
# plt.plot(ds_1)

# ARIMA_MODEL
# NOTE(review): this was labelled "1,1,2 ARIMA Model" but the order actually
# passed below is (5, 1, 1) - confirm which one is intended.
temp = [5, 1, 1]
model = ARIMA(s, order=temp)
model_fit = model.fit(disp=0)
print(model_fit.summary())
# Actual vs Fitted
model_fit.plot_predict(dynamic=False)
# diagram
# plt.show()

## Adf Test
from pmdarima.arima.utils import ndiffs
# NOTE(review): `y` is not defined in this fragment (the series used above is
# `s`) - confirm it exists at this point.
print(ndiffs(y, test='adf'))
y = datasets.load_lynx()
pm.plot_acf(y)

from pmdarima.arima.stationarity import ADFTest

# Test whether we should difference at the alpha=0.05
# significance level
adf_test = ADFTest(alpha=0.05)
p_val, should_diff = adf_test.should_diff(y)  # (0.01, False)
p_val

# The verdict, per the ADF test, is that we should not difference.
# Pmdarima also provides a handier interface for estimating the d parameter
# more directly.  This is the preferred public method for accessing tests of
# stationarity:
from pmdarima.arima.utils import ndiffs

# Estimate the number of differences using an ADF test:
n_adf = ndiffs(y, test='adf')  # -> 0

# Or a KPSS test (auto_arima default):
n_kpss = ndiffs(y, test='kpss')  # -> 0

# Or a PP test:
n_pp = ndiffs(y, test='pp')  # -> 0
assert n_adf == n_kpss == n_pp == 0

# The easiest way to make your data stationary in the case of ARIMA models is
# to allow auto_arima to work its magic, estimate the appropriate d value, and
# difference the time series accordingly.  However, other common
# transformations for enforcing stationarity include (sometimes in combination
# with one another):
#
# - Square root or N-th root transformations
# - De-trending your time series
# - Differencing your time series one or more times
# - Log transformations
#%%%%
# In[123]:

# Check whether the time series is stationary.
from statsmodels.tsa.stattools import adfuller
print("p-value:", adfuller(ts_df_key['y'].dropna())[1])
# If the p-value is greater than the significance level (0.05), the series is
# not stationary and differencing is needed, i.e. d > 0.

# In[124]:

# Identify the differencing required (d = ?).
from pmdarima.arima.utils import ndiffs
# Estimate the number of differences using an ADF test:
n_adf = ndiffs(ts_df_key['y'], test='adf')
# Or a KPSS test (auto_arima default):
n_kpss = ndiffs(ts_df_key['y'], test='kpss')
print(n_adf)
print(n_kpss)
# Use the suggested differencing while training the ARIMA model.

# In[125]:

# Verify whether the series is stationary after differencing it n_adf times.
# If the p-value is > 0.05 the series is not stationary.
# Series.diff(n) takes a single lag-n difference (and diff(0) is all zeros),
# so the first difference is applied n_adf times instead.
differenced = ts_df_key['y']
for _ in range(n_adf):
    differenced = differenced.diff()
test_stationarity(differenced.dropna(inplace=False))
# The time series is stationary at d = 1, where only the first lag is above
# the significance level.  We go on to find the order of the AR term, p.

# In[126]:
# Second segment of the price curve, starting at the last point of the first
# segment (x1[-1:], last element of y1) and continuing with x2 / y2.
plt.plot(x1[-1:] + x2, y1[y1.shape[0]-1:].append(y2), color='tab:orange')
plt.xticks(x3[::len(x3)//10], rotation=30) # [::len(x3)//10] because too much text (overlaps)
plt.ylabel("Stock Price ($)", color='purple')
plt.xlabel("Date", color='purple')
plt.legend([f'Before {event}', f'After {event}'])
plt.title(f"{company} Stock Price, {event}", color='purple')
# Display the figure, or save it and clear it for the next plot.
if save_images:
    plt.savefig(f"images/{company}/{event}/{company} Stock Price, {event}", bbox_inches='tight')
    plt.clf()
else:
    plt.show()
# Find the optimal number of diffs to apply, according to the ADF test
# (1 diff is y1.diff() and 2 diffs is y1.diff().diff() ...).
# Each diff is the series of differences between adjacent values of y1.
d = ndiffs(y1, test='adf')
# Convert y1 by applying d diffs (apply_n_diff is defined elsewhere).
y1_diff_applied = apply_n_diff(y1, d)
# Plot the modified data (no date axis because it doesn't make sense to
# include it).
plt.plot(y1_diff_applied)
plt.ylabel("Stock Price ($)", color='purple')
plt.title(f"{company} Stock Price, {d} Diffs Applied, before {event}", color='purple')
# Display the figure, or save it and clear it.
if save_images:
    plt.savefig(f"images/{company}/{event}/{company} Stock Price, {d} Diffs Applied, before {event}", bbox_inches='tight')
    plt.clf()
else:
    plt.show()
def predict_arima(df):
    """Fit a seasonal ARIMA on ``df`` and forecast the hold-out horizon.

    Steps, in order:
      1. Best-effort error report of ``df["memory_used"]`` against the
         forecast pickled by the previous call (``forecast.pickle``).
      2. Save PACF / ACF plots to ``pacf.png`` / ``acf.png``.
      3. Persist ``df`` to ``arima.pickle``.
      4. Estimate ``d`` (``ndiffs``, ADF) and the seasonal ``D`` (``nsdiffs``
         then ``CHTest``), falling back to 1 on failure.
      5. Derive ``p`` / ``q`` from the number of ACF / PACF values that are at
         least ``getThreshold()``, capped at 4 and 1 respectively.
      6. Fit ``ARIMA(order=(p, d, q), seasonal_order=(0, D, 0, 12))`` on the
         whole of ``df``, forecast ``len(test)`` periods (``test`` is the last
         30% of ``df``), pickle the forecast and plot it.

    Parameters:
        df: data frame with a ``"memory_used"`` column.

    Returns:
        Data frame with a ``"prediction"`` column indexed like the hold-out
        split, or ``None`` when fitting, forecasting or plotting raised.
    """
    time_in = current_milli_time()

    # --- 1. error report against the previous forecast (best effort) -------
    try:
        # NOTE(review): pickle.load is only safe on files written by this
        # program; never point it at untrusted data.
        with open("forecast.pickle", "rb") as forecast_in:
            future_forecast = pickle.load(forecast_in)

        # Compare only the overlapping leading part of both frames.
        overlap = min(len(df), len(future_forecast))
        # NOTE(review): the forecast is pickled below with a "prediction"
        # column, so this "memory_used" lookup presumably fails and lands in
        # the except branch - confirm the intended column.
        error = (df[:overlap]["memory_used"]
                 - future_forecast[:overlap]["memory_used"])

        # Mean of the negative errors (overestimation) and of the
        # non-negative ones (underestimation); 0.0 when a group is empty.
        over_errors = [x for x in error if x < 0]
        overestimation = sum(over_errors) / len(over_errors) if over_errors else 0.0
        under_errors = [x for x in error if x >= 0]
        underestimation = sum(under_errors) / len(under_errors) if under_errors else 0.0

        print("UNDERESTIMATION ERROR: " + str(underestimation))
        print("OVERESTIMATION ERROR: " + str(overestimation))
        print("Mean Absolute Error in Last iteration " + str(error))
    except Exception:
        print("RMSE To be computed")

    # --- 2. ACF / PACF plots ------------------------------------------------
    try:
        pm.plot_pacf(df, show=False).savefig('pacf.png')
        pm.plot_acf(df, show=False).savefig('acf.png')
    except Exception:
        print("Data points insufficient for ACF & PACF")

    # --- 3. persist the latest data -----------------------------------------
    try:
        with open("arima.pickle", "rb") as pickle_in:
            arima_data = pickle.load(pickle_in)
        # NOTE(review): the result of this append is never used or saved.
        arima_data.append(df)
        # df=arima_data
    except Exception:
        with open("arima.pickle", "wb") as arima_data_out:
            pickle.dump([], arima_data_out)
    with open("arima.pickle", "wb") as arima_data_out:
        pickle.dump(df, arima_data_out)

    # --- 4. differencing tests ----------------------------------------------
    nd = 1
    nsd = 1
    try:
        adf_test = ADFTest(alpha=0.05)
        # The result is not used; a failure here triggers the fallback below.
        _p_val, _should_diff = adf_test.is_stationary(df["memory_used"])
        nd = ndiffs(df, test='adf')
        logging.info(nd)
        nsd = nsdiffs(df, 12)
        logging.info(nsd)
    except Exception:
        nd = 1
        print("Exception on tests")

    ch_test = CHTest(12)
    try:
        nsd = ch_test.estimate_seasonal_differencing_term(df)
    except Exception as e:
        print(e)
        logging.error(e)

    # --- 5. find p and q dynamically ----------------------------------------
    # NOTE(review): p is taken from the ACF and q from the PACF; the usual
    # convention is the other way round - confirm this is intended.
    acf_lags = acf(df["memory_used"])
    acf_lags_threshold = [x for x in acf_lags if x >= getThreshold()]
    p = min(len(acf_lags_threshold), 4)

    pacf_lags = pacf(df["memory_used"])
    pacf_lags_threshold = [x for x in pacf_lags if x >= getThreshold()]
    q = min(len(pacf_lags_threshold), 1)
    d = nd

    # --- 6. fit, forecast, persist and plot ---------------------------------
    train, test = train_test_split(df, shuffle=False, test_size=0.3)
    # If data is seasonal set the values of P,D,Q in seasonal order
    stepwise_model = ARIMA(
        order=(p, d, q),
        seasonal_order=(0, nsd, 0, 12),
        suppress_warnings=True,
        scoring='mse'
    )
    x = str(p) + " " + str(nd) + " " + str(q)
    print("Model with p=" + str(p) + " d=" + str(d) + " q=" + str(q))
    try:
        stepwise_model.fit(df)
        # Vary the periods as per the forecasting window:
        #   n_periods = 30 -> 5 mins
        #   n_periods = 60 -> 10 mins
        #   n_periods = 90 -> 15 mins
        future_forecast = stepwise_model.predict(n_periods=len(test))
        future_forecast = pd.DataFrame(
            future_forecast, index=test.index, columns=["prediction"])
        res = pd.concat([df, future_forecast], axis=1)

        # Save the forecast for the next call's error report.
        with open("forecast.pickle", "wb") as forecast_out:
            pickle.dump(future_forecast, forecast_out)

        trace1 = go.Scatter(x=res.index, y=res["prediction"],
                            name="Prediction", mode='lines')
        trace2 = go.Scatter(x=df.index, y=df["memory_used"],
                            name="DF data", mode='lines')
        data = [trace1, trace2]
        layout = go.Layout(
            title=x
        )
        fig = go.Figure(data=data, layout=layout)
        plot(fig, filename="prediction")

        print("Current values")
        print(df)
        print("Predicted Data Points")
        print(future_forecast)
        time_out = current_milli_time()
        print("TIME for RNN(ms):" + str(time_out - time_in))
        return future_forecast
    except Exception as e:
        time_out = current_milli_time()
        print("TIME for RNN(ms):" + str(time_out - time_in))
        print(e)
        return None
# ACF (top) and PACF (bottom) of x over 50 lags, saved to OUTFILES/ and shown.
fig,ax = plt.subplots(2,1,figsize=(12,6))
fig = plot_acf(x.dropna(), lags=50, ax=ax[0])
fig = plot_pacf(x.dropna(), lags=50, ax=ax[1])
plt.savefig('OUTFILES/M4-Sarima_Autocorr_initiale_ARA2.png', dpi=100, bbox_inches='tight')
plt.show()

# In[33]:

from pmdarima.arima.stationarity import ADFTest
from pmdarima.arima.utils import ndiffs
from pmdarima import AutoARIMA

# Estimate the number of differences using an ADF test:
n_adf = ndiffs(ara2, test='adf')
print("Nombre diff d = ",n_adf, " basé sur param ADF")

# Or a KPSS test (auto_arima default):
n_kpss = ndiffs(ara2, test='kpss')
print("Nombre diff d = ",n_kpss, " basé sur param KPSS")

# Or a PP test:
n_pp = ndiffs(ara2, test='pp')
print("Nombre diff d = ",n_pp, " basé sur param PP")

# Test whether we should difference at the alpha=0.05
# significance level
adf_test = ADFTest(alpha=0.05)
p_val, should_diff = adf_test.should_diff(ara2) # (0.01, False)
print("Test : faut-il differencier à la p_value > 5% ?? Reponse :", should_diff, " -- p_value = ", p_val)
def test_ndiffs_corner_cases():
    """``ndiffs`` is expected to raise ``ValueError`` when ``max_d`` is 0."""
    invalid_max_d = 0
    with pytest.raises(ValueError):
        ndiffs(austres, max_d=invalid_max_d)
number="1", data=f1[price_col].rename(header))
print(futures)

from statsmodels.tsa.stattools import adfuller
from numpy import log
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from pmdarima.arima.utils import ndiffs
from statsmodels.tsa.arima.model import ARIMA

# Month-end series of the 'RB' futures entry.
gasoline = futures['RB']
gasoline.clean_data()
gasoline_monthly = gasoline.series().resample('M').last()

# Exploratory checks; the results of these calls are not stored.
# ADF on the first difference, with constant + trend ("ct") and with the
# default regression.
adfuller(gasoline_monthly.diff().dropna(), regression="ct")
adfuller(gasoline_monthly.diff().dropna())
plot_acf(gasoline_monthly)
plot_pacf(gasoline_monthly)
gasoline_monthly["2015":"2017"].plot()
ndiffs(gasoline_monthly, test="adf")
plot_pacf(gasoline_monthly.diff().dropna())

# Manually chosen ARIMA(2, 1, 2).
model = ARIMA(gasoline_monthly, order=(2, 1, 2))
model_fit = model.fit()
print(model_fit.summary())

# Automatic seasonal search with period 12 and one seasonal difference.
# NOTE(review): `pm` is not imported in this fragment - confirm it is in scope.
model = pm.auto_arima(gasoline_monthly, seasonal=True, m=12, D=1)
def test_ndiffs_stationary():
    """For a constant vector every test is expected to make ``ndiffs`` return 0."""
    x = np.ones(10)
    for test_name in ('kpss', 'pp', 'adf'):
        n_diffs = ndiffs(x, alpha=0.05, test=test_name, max_d=2)
        assert n_diffs == 0
plot_acf(df.Euribor.diff().dropna(), ax=axes[1, 1]) # 2nd Differencing axes[2, 0].plot(df.Euribor.diff().diff()) axes[2, 0].set_title('2nd Order Differencing') plot_acf(df.Euribor.diff().diff().dropna(), ax=axes[2, 1]) # 3rd Differencing axes[3, 0].plot(df.Euribor.diff().diff().diff()) axes[3, 0].set_title('3rd Order Differencing') plot_acf(df.Euribor.diff().diff().diff().dropna(), ax=axes[3, 1]) plt.show() # Final tests to better choose a D parameter. y = df.Euribor # Adf Test print("ADF test result %f" % ndiffs(y, test='adf')) # result 1 # KPSS test print("KPSS test result %f" % ndiffs(y, test='kpss')) # result 1 # PP test: print("PP test result %f" % ndiffs(y, test='pp')) # result 1 # The correct D parameter for my Euribor Series is therefore = 1 # Thus the Q parameter is also 1, given the 2nd order differencing
df_uk.sort_index(inplace=True) # Creating one dataframe for each country to include exogenous variables and PMI in the same df df_us = pd.merge(pmi_us, el_us, how='left', left_index=True, right_index=True) df_us = pd.merge(df_us, brent, how='left', left_index=True, right_index=True) df_us = pd.merge(df_us, wti, how='left', left_index=True, right_index=True) df_us = df_us.dropna() ''' Developing dynamic models (SARIMA with explanatory variables) ''' # Formally prove that only one differencing is needed df_no = df_no.dropna() ndiffs(df_no.pmi, test='adf') nsdiffs(df_no.pmi, test='ch', m=12) # Adding direction column in all data frames (1 if PMI goes up, 0 if down) df_no['dir'] = [1 if x > 0 else 0 for x in df_no.pmi - df_no.pmi.shift(1)] df_dk['dir'] = [1 if x > 0 else 0 for x in df_dk.pmi - df_dk.pmi.shift(1)] df_uk['dir'] = [1 if x > 0 else 0 for x in df_uk.pmi - df_uk.pmi.shift(1)] df_us['dir'] = [1 if x > 0 else 0 for x in df_us.pmi - df_us.pmi.shift(1)] # Need to find ARIMA terms for all countries. Using exog with only previous periods (only lags) n_test_obs = 24 # Norway df_no_train = df_no.iloc[:-n_test_obs,:] df_no_test = df_no.iloc[-n_test_obs:,:] exog_no_train = df_no_train.drop(['dir', 'eur_per_MWh', 'pmi', 'usd_per_MWh', 'usd_per_barrel_x', 'usd_per_barrel_y'], axis=1)
# 1st Differencing
axes[1, 0].plot(datasets.Close.diff())
axes[1, 0].set_title('1st Order Differencing')
plot_acf(datasets.Close.diff().dropna(), ax=axes[1, 1])

# 2nd Differencing
axes[2, 0].plot(datasets.Close.diff().diff())
axes[2, 0].set_title('2nd Order Differencing')
plot_acf(datasets.Close.diff().diff().dropna(), ax=axes[2, 1])
plt.show()

y1 = datasets.Close
# Perform a test of stationarity for different levels of ``d`` to estimate
# the number of differences required to make a given time series stationary.
# The results are evaluated but not stored (interactive use).
ndiffs(y1, test='adf')
ndiffs(y1, test='kpss')
ndiffs(y1, test='pp')
# result = 1, 1, 1 (as noted by the original author)

# PACF of the once-differenced series next to the differenced series itself.
plt.rcParams.update({'figure.figsize': (9, 3), 'figure.dpi': 120})
fig, axes = plt.subplots(1, 2, sharex=True)
axes[0].plot(datasets.Close.diff())
axes[0].set_title('1st Differencing')
axes[1].set(ylim=(0, 5))
plot_pacf(datasets.Close.diff().dropna(), ax=axes[1])
# From this plot the author chose the AR order p = 1.
plt.show()
index_col=0, parse_dates=True) df['lag1_rtn'] = df['close'].pct_change() # print(df.head(20)) # print(len(df)) df['lag1_rtn'].plot() plt.show() # Test the stationarity of df['lag1_rtn'] test_period = 10 * 24 * 15 result = adfuller(df['lag1_rtn'][:-test_period].dropna()) print('ADF Statistic: %f' % result[0]) print('p-value: %f' % result[1]) # Adf Test / KPSS Test / PP Test ADF_test = ndiffs(df['lag1_rtn'][:-test_period].dropna(), test='adf') print(ADF_test) KPSS_test = ndiffs(df['lag1_rtn'][:-test_period].dropna(), test='kpss') PP_test = ndiffs(df['lag1_rtn'][:-test_period].dropna(), test='PP') # Find the order of the AR term [p]: p = 0 plot_pacf(df['lag1_rtn'][:-test_period].dropna()) plt.show() # Find the order of the MA term [q]: q = 0 plot_acf(df['lag1_rtn'][:-test_period].dropna()) plt.show() # Build the ARIMA model model = ARIMA(df['lag1_rtn'][:-test_period], order=(0, 0, 0)) model_fit = model.fit()
# 2nd Differencing axes[2, 0].plot(df.value.diff().diff()); axes[2, 0].set_title('2nd Order Differencing') plot_acf(df.value.diff().diff().dropna(), ax=axes[2, 1]) plt.show() # reaches stationarity with two orders of differencing # since the autocorrelation goes far into negative fairly quickly, series may be over differenced # change it to one order of differencing for weak stationarity from pmdarima.arima.utils import ndiffs y = df.value ## Adf Test print(ndiffs(y, test='adf')) # 2 # KPSS test print(ndiffs(y, test='kpss')) # 0 # PP test: print(ndiffs(y, test='pp')) # 2 # find order of AR term (p) # Partial Autocorrelation Plot (PACF) = correlation between series and lags, excluding contributions from intermediate # lags so that you know if the lag is needed or not # coefficient of that lag in the autoregression equation # Partial autocorrelation of lag 3 is the coefficient of Y{t-3} # $$Yt = \alpha0 + \alpha1 Y{t-1} + \alpha2 Y{t-2} + \alpha3 Y{t-3}$$ # Initially, order of AR term = as many lags that cross the significance limit in the PACF plot