def get_pdq(time_series):
    """Show the ACF/PACF plots of ``time_series`` and print a per-lag table.

    The printed table has one row per lag, starting at lag 1, with the
    columns ``lag``, ``AC``, ``Q``, ``PAC`` and ``Prob(>Q)``.

    Parameters
    ----------
    time_series : array-like
        Series handed unchanged to the plotting and correlation helpers.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    # Draw both correlograms first, then display them together.
    plot_acf(time_series)
    plot_pacf(time_series)
    plt.show()

    # With qstat=True three arrays come back; going by the column labels
    # used below they are the autocorrelations, the Q statistics and the
    # associated probabilities.
    autocorr, q_stat, q_prob = sm.tsa.acf(time_series, qstat=True)
    partial = pacf(time_series, method='ywmle')

    # Lag 0 is dropped from the correlation columns so that every column
    # lines up with the Q statistics, which start at lag 1.
    lag_numbers = range(1, len(autocorr))
    partial_from_lag_1 = partial[1:len(q_stat) + 1]
    stacked = np.c_[lag_numbers, autocorr[1:], q_stat, partial_from_lag_1,
                    q_prob]

    table = pd.DataFrame(stacked,
                         columns=['lag', "AC", "Q", "PAC", "Prob(>Q)"])
    print(table)
def plot_acf(series, ax=None, lags=None, alpha=None, use_vlines=True,
             unbiased=False, fft=True, title='Autocorrelation',
             zero=True, vlines_kwargs=None, show=True, **kwargs):
    """Draw the auto-correlation of a series.

    Thin wrapper: every argument except ``show`` is forwarded to the
    underlying plotting callable, and the result is then either shown or
    handed back to the caller.

    Parameters
    ----------
    series : array-like, shape=(n_samples,)
        The series or numpy array whose auto-correlation is plotted.

    ax : Matplotlib AxesSubplot instance, optional
        Subplot to draw into. When omitted a new figure is created.

    lags : int, array-like or None, optional (default=None)
        Lag values for the horizontal axis. An int means
        ``np.arange(lags)``; None means ``np.arange(len(corr))``.

    alpha : scalar, optional (default=None)
        Confidence level for the plotted intervals, e.g. ``alpha=.05``
        gives 95 % intervals with the standard deviation computed from
        Bartlett's formula. None disables the intervals.

    use_vlines : bool, optional (default=True)
        Plot vertical lines plus markers when True, markers only when
        False. The default marker ``'o'`` can be replaced through a
        ``marker`` keyword argument.

    unbiased : bool, optional (default=False)
        Use ``n - k`` instead of ``n`` as the autocovariance denominator.

    fft : bool, optional (default=True)
        Compute the ACF via FFT.

    title : str, optional (default='Autocorrelation')
        Title placed on the plot.

    zero : bool, optional (default=True)
        Whether the 0-lag autocorrelation is included.

    vlines_kwargs : dict, optional (default=None)
        Extra keyword arguments passed to ``vlines``.

    show : bool, optional (default=True)
        Show the plot once it has been created. When False the plot
        object is returned instead.

    **kwargs : kwargs, optional
        Passed straight on to the Matplotlib ``plot`` and ``axhline``
        functions.

    Notes
    -----
    The plot is only displayed when ``show=True`` (the default). Use
    ``show=False`` to get the plot object back, for example to place it
    on another canvas.

    Examples
    --------
    >>> plot_acf([1, 2, 3], show=False)  # doctest: +SKIP
    <matplotlib.figure.Figure object at 0x122fab4e0>

    Returns
    -------
    plt : Axis or None
        Nothing when ``show`` is True, otherwise the plot object.
    """
    # Without matplotlib there is nothing to draw: warn and bail out.
    if plt is None:
        warn_for_no_mpl()
        return None

    # NOTE(review): ``pacf`` is called with plotting arguments (ax, title,
    # vlines_kwargs, ...), so in this module it is presumably an import
    # alias for a plotting function - confirm against the file's imports.
    forwarded = {
        'ax': ax,
        'lags': lags,
        'alpha': alpha,
        'use_vlines': use_vlines,
        'unbiased': unbiased,
        'fft': fft,
        'title': title,
        'zero': zero,
        'vlines_kwargs': vlines_kwargs,
    }
    figure = pacf(x=series, **forwarded, **kwargs)

    return _show_or_return(figure, show)
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, acf, pacf

# sourcing df from Data Preparation
from Data_prep.Data_preparation import MSFTdf

# Highest lag shown in the bar charts. Lag 0 is included, so each chart has
# N_LAGS + 1 bars. Requesting the lag count explicitly keeps the bar charts
# valid: the default number of lags returned by acf/pacf depends on the
# sample size (and the statsmodels version), so a hard-coded range(31) can
# disagree with the number of values and make plt.bar fail.
N_LAGS = 30

# exporting ACF and PACF
acf_plot = plot_acf(MSFTdf.IntChange)
acf_vals = acf(MSFTdf.IntChange, nlags=N_LAGS)
# Bars are sized from the data so x and height always have the same length.
plt.bar(range(len(acf_vals)), acf_vals)
#plt.savefig('MSFT_ACF')

pacf_plot = plot_pacf(MSFTdf.IntChange)
pacf_vals = pacf(MSFTdf.IntChange, nlags=N_LAGS)
plt.bar(range(len(pacf_vals)), pacf_vals)
#plt.savefig('MSFT_PACF')

plt.show()
# Fill the remaining rows of the ADF summary from the test result ``t``.
# NOTE(review): ``output`` and ``t`` are defined above this fragment. The
# chained ``output['value'][label] = ...`` form presumably targets a pandas
# DataFrame column; if so it will not update the frame under pandas
# Copy-on-Write - confirm and consider ``output.loc[label, 'value']``.
output['value']['Number of Observations Used'] = t[3]
output['value']['Critical Value(1%)'] = t[4]['1%']
output['value']['Critical Value(5%)'] = t[4]['5%']
output['value']['Critical Value(10%)'] = t[4]['10%']
print(output)
## After differencing, the series is essentially stationary and passes the ADF test.
## Determine the autocorrelation and moving-average orders (p, q).
## Following the usual identification rules, the ACF plot, the PACF plot and the AIC / BIC criteria are combined to choose the ARMA order; the order with the smallest AIC and BIC is the preferred one.
# 5) p,q
plot_acf(time_series)
plot_pacf(time_series)
plt.show()
# Going by the column labels below: r = autocorrelations, rac = Q statistics,
# Q = probabilities for those Q statistics.
r, rac, Q = sm.tsa.acf(time_series, qstat=True)
prac = pacf(time_series, method='ywmle')
# Lag 0 is dropped from r and prac so every column starts at lag 1.
table_data = np.c_[range(1, len(r)), r[1:], rac, prac[1:len(rac) + 1], Q]
table = pd.DataFrame(table_data, columns=['lag', "AC", "Q", "PAC", "Prob(>Q)"])
print(table)
# Automatic order selection: max_ar and max_ma are the largest p and q tried.
# ic selects the criterion; aic is used here, bic would work as well. The function evaluates the AIC of every (p, q) combination (here (0,0)~(3,3)) and keeps the smallest.
(p, q) = (sm.tsa.arma_order_select_ic(time_series, max_ar=3, max_ma=3, ic='aic')['aic_min_order'])
print((p, q))
# 6) ARIMA(0,1,1)
# Both correlograms use the same series: the "df_log_shift" column with
# missing values removed.
shifted = df["df_log_shift"].dropna()
# Reference lines are drawn at 0 and at +/- 1.96 / sqrt(n).
band = 1.96 / np.sqrt(len(shifted))
reference_levels = (0, -band, band)

# Autocorrelation up to lag 50.
lag_acf = acf(shifted, nlags=50)

plt.figure(figsize=(16, 7))
plt.plot(lag_acf, marker="o")
for level in reference_levels:
    plt.axhline(y=level, linestyle='--', color='gray')
plt.title('Autocorrelation Function')
plt.xlabel('number of lags')
plt.ylabel('correlation')
plt.tight_layout()

# Partial autocorrelation up to lag 50, estimated with the 'ols' method.
lag_pacf = pacf(shifted, nlags=50, method='ols')

plt.figure(figsize=(16, 7))
plt.plot(lag_pacf, marker="o")
for level in reference_levels:
    plt.axhline(y=level, linestyle='--', color='gray')
plt.title('Partial Autocorrelation Function')
plt.xlabel('number of lags')
plt.ylabel('correlation')
plt.tight_layout()
8451, 9815, 10894, 10287, 9666, 6072, 5418
])
# Index the observations by year, 1901 through 1990.
data.index = pd.Index(sm.tsa.datetools.dates_from_range('1901', '1990'))
# Plot the raw data
data.plot(figsize=(12, 8))
plt.show()
# Automatic order selection by the AIC criterion
# (p, q) =(sm.tsa.arma_order_select_ic(data,max_ar=10,max_ma=10,ic='aic')['aic_min_order'])
# print((p,q))
# Choose the order from the ACF and PACF plots
plot_acf(data)  # ACF plot
plot_pacf(data)  # PACF plot
plt.show()
# Going by the column labels below: r = autocorrelations, rac = Q statistics,
# Q = probabilities for those Q statistics.
r, rac, Q = sm.tsa.acf(data, qstat=True)
prac = pacf(data, method='ywmle')
# Lag 0 is dropped from r and prac so every column starts at lag 1.
table_data = np.c_[range(1, len(r)), r[1:], rac, prac[1:len(rac) + 1], Q]
table = pd.DataFrame(table_data, columns=['lag', "AC", "Q", "PAC", "Prob(>Q)"])
print(table)
# Fit the model and produce the results
p, d, q = (7, 0, 7)
# NOTE(review): a 3-tuple (p, d, q) is passed as the order here, while the
# ARMA call further down passes a 2-tuple (7, 0) - confirm which form ARMA
# expects and whether q=7 is actually being used.
arma_mod = ARMA(data, (p, d, q)).fit(disp=-1, method='mle')
summary = (arma_mod.summary2(alpha=.05, float_format="%.8f"))
print(summary)
# Build the ARMA model
# Build the ARMA model
arma = ARMA(data, (7, 0)).fit()
print('AIC: %.4lf' % arma.aic)
# Model prediction
predict_y = arma.predict('1990', '2000')
######## Step 8 #########
# The new bitcoin dataset has been formed. From now on we will be having only this dataset
# Keep only the rows dated 2017-01-01 or later.
bitcoin = bitcoin[bitcoin['Date'] >= "2017-01-01"]
bitcoin['bprice'].plot()
# plot8 = plt.plot(bitcoin['Date'],bitcoin['bprice'])

######## Step 9 #########
# acf and pacf gives the relationship between today and yesterday
from statsmodels.graphics.tsaplots import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# The numeric results are not stored; they are only visible when this is
# run interactively.
acf(bitcoin['dbprice'].dropna())
pacf(bitcoin['dbprice'].dropna())

# Plots for acf and pacf
plot_acf(bitcoin['dbprice'].dropna())
plot_pacf(bitcoin['dbprice'].dropna())

######## Step 10 #########
from statsmodels.tsa.arima_model import ARIMA

x = bitcoin['doil']
# Regressors side by side plus a constant column; the first row is dropped
# from regressors and response alike by the [1:] slices.
# ``axis`` is passed by keyword: recent pandas versions reject positional
# arguments to concat other than the objects themselves.
bigX = sm.add_constant(
    pd.concat((x, bitcoin['deuro'], bitcoin['dgold'], bitcoin['dsp']),
              axis=1))[1:]
x = x[1:]
y = bitcoin['dbprice'][1:]
model10 = sm.OLS(y, bigX).fit()
def shibor_ARIMA():
    """Run an ARIMA workflow on one SHIBOR tenor and plot actual vs. predicted.

    Steps, in order:

    1. Round-trip the module-level ``shiborData`` through ``shibor_data.csv``
       and build a series for the selected tenor, indexed by the ``date``
       column.
    2. Difference the series once, replace the resulting NaN with 0 and
       plot the differenced series.
    3. Run an ADF test on the undifferenced series and print its summary.
    4. Plot ACF/PACF of the differenced series, print the per-lag table and
       the AIC-selected ``(p, q)``.
    5. Fit ``ARIMA(8, 1, 0)`` on the undifferenced series and plot the
       observed values next to the prediction.

    Side effects: writes ``shibor_data.csv``, sets the global
    ``shibor_term_set``, prints to stdout and opens matplotlib windows.

    Returns
    -------
    None
    """
    global shibor_term_set

    data = shiborData
    data.to_csv("shibor_data.csv")
    df = pd.read_csv('shibor_data.csv')

    shibor_term = ['shiborON', 'shibor1W', 'shibor2W', 'shibor1M',
                   'shibor3M', 'shibor6M', 'shibor9M', 'shibor1Y']
    # Indexing starts at 0; each index selects a different tenor to process.
    shibor_term_set = shibor_term[0]
    dta = pd.Series(df[shibor_term_set].values, index=df['date'].values)
    plt.show()

    # 2. Differencing (d) of the time series.
    fig = plt.figure(figsize=(12, 8))
    ax1 = fig.add_subplot(111)
    diff1 = dta.diff(1)
    sum_nan_diff1 = diff1.isnull().sum()
    print('在我们diff1中NaN的数量:', sum_nan_diff1)
    # Locate the NaN values and replace them with 0.
    diff1[np.isnan(diff1)] = 0
    diff1.plot(ax=ax1)

    # ADF test for stationarity: compare the test statistic against the
    # critical values. It is run on the undifferenced series here.
    t = sm.tsa.stattools.adfuller(dta)
    # t=sm.tsa.stattools.adfuller(diff1)
    adf_rows = [
        ('Test Statistic Value', t[0]),
        ("p-value", t[1]),
        ("Lags Used", t[2]),
        ("Number of Observations Used", t[3]),
        ("Critical Value(1%)", t[4]['1%']),
        ("Critical Value(5%)", t[4]['5%']),
        ("Critical Value(10%)", t[4]['10%']),
    ]
    output = pd.DataFrame(index=[label for label, _ in adf_rows],
                          columns=['value'])
    # Write through .loc: chained ``output['value'][label] = ...`` assigns
    # into a temporary object and does not update the frame when pandas
    # Copy-on-Write is active.
    for label, value in adf_rows:
        output.loc[label, 'value'] = value
    print(output)

    # Determine the autocorrelation and moving-average orders (p, q).
    # Following the usual identification rules, the ACF plot, the PACF plot
    # and the AIC / BIC criteria are combined to choose the ARMA order; the
    # order with the smallest AIC and BIC is the preferred one.
    plot_acf(diff1)
    plot_pacf(diff1)
    plt.show()

    # Going by the column labels below: r = autocorrelations,
    # rac = Q statistics, Q = probabilities for those Q statistics.
    r, rac, Q = sm.tsa.acf(diff1, qstat=True)
    prac = pacf(diff1, method='ywmle')
    # Lag 0 is dropped from r and prac so every column starts at lag 1.
    table_data = np.c_[range(1, len(r)), r[1:], rac, prac[1:len(rac) + 1], Q]
    table = pd.DataFrame(table_data,
                         columns=['lag', "AC", "Q", "PAC", "Prob(>Q)"])
    print('table', table)

    # max_ar and max_ma bound the p and q values tried automatically.
    # ic selects the criterion; aic is used here, bic would work as well.
    # The function evaluates every (p, q) combination (here (0,0)~(3,3))
    # and keeps the one with the smallest AIC.
    (p, q) = sm.tsa.arma_order_select_ic(
        diff1, max_ar=3, max_ma=3, ic='aic')['aic_min_order']
    print('p = %d , q = %d' % (p, q))

    # Prediction. The fitted order is fixed at (8, 1, 0); the (p, q) selected
    # above is only printed.
    arima_model = sm.tsa.ARIMA(dta, (8, 1, 0)).fit()
    # predict_data = arma_model.predict(start=str(1979), end=str(2010+3), dynamic = False)
    predict_data = arima_model.predict(start="2019-10-25", end="2019-11-06",
                                       dynamic=True, typ='levels')
    # Put observed and predicted values side by side; ``keys`` labels the
    # two columns.
    shibor_forcast = pd.concat(
        [dta['2019-09-21':"2019-11-06"], predict_data],
        axis=1, keys=['original', 'predicted'])

    plt.figure()
    plt.plot(shibor_forcast)
    plt.title(shibor_term_set+' 真实值vs预测值')
    plt.xticks(rotation=50)
    plt.show()
# ADF test on the differenced series; sample outputs from earlier runs are
# kept in the comments below.
print('差分序列的ADF')
print(ADF(diff_1_df))
#(-7.296820308000623, p1 1.3710560053434156e-10, 0, 66,
# {'1%': -3.5335601309235605, '5%': -2.9064436883991434, '10%': -2.590723948576676}, 1306.8499722912552)
# (-0.913770472695999, 0.7834037847008933, 4, 61,
# {'1%': -3.542412746661615, '5%': -2.910236235808284, '10%': -2.5927445767266866}, 1294.1719644274262)
print('差分序列的白噪声检验结果')
# A value above 0.05 indicates a white-noise series
print(acorr_ljungbox(diff_1_df, lags=1))
# (array([0.69570612]), P1 value array([0.40423027]))
#(array([18.06943954]), P2array([2.12992775e-05]))
#(array([5.69722348]), logp1 array([0.01699177]))
#
# Find the best p, q values ()
# Going by the column labels below: r = autocorrelations, rac = Q statistics,
# Q = probabilities for those Q statistics.
r, rac, Q = sm.tsa.acf(diff_1_df, qstat=True)
prac = pacf(diff_1_df, method='ywmle')
# Lag 0 is dropped from r and prac so every column starts at lag 1.
table_data = np.c_[range(1, len(r)), r[1:], rac, prac[1:len(rac) + 1], Q]
# NOTE(review): ``table`` is built but not printed or used in this fragment.
table = pd.DataFrame(table_data, columns=['lag', "AC", "Q", "PAC", "Prob(>Q)"])
# Evaluate orders up to (7, 7) under AIC, BIC and HQIC; the BIC minimum is used.
order = sm.tsa.arma_order_select_ic(diff_1_df, max_ar=7, max_ma=7, ic=['aic', 'bic', 'hqic'])
p, q = order.bic_min_order
print("p,q")
print(p, q)
# Build the ARIMA(0, 1, 1) model
# NOTE(review): the order actually fitted is (p, 1, q) with the BIC-selected
# p and q, which need not be (0, 1, 1). ``diff_1_df`` is presumably already
# differenced once, so d=1 here would difference it again - confirm.
# ``order`` is rebound from the selection result to the order tuple.
order = (p, 1, q)
train_X = diff_1_df[:]
arima_model = ARIMA(train_X, order).fit()
# 1. Subtract the moving average from the log series.
logAndMA = tsLog - movingAverage
logAndMA.dropna(inplace=True)
#rollingStatPlot(logAndMA)

# 2. First difference of the log series (d=1).
diff = tsLog - tsLog.shift()
diff.dropna(inplace=True)
#rollingStatPlot(diff)

# Find p and q from the correlograms of the differenced series.
from statsmodels.graphics.tsaplots import acf, pacf

acfLag = acf(diff, nlags=20)
pacfLag = pacf(diff, nlags=20, method='ols')

# Reference lines are drawn at 0 and at +/- 1.96 / sqrt(n).
band = 1.96 / np.sqrt(len(diff))

# Left panel: ACF. Right panel: PACF. Both get the same reference lines.
panels = (
    (121, acfLag, 'Autocorrelation Function'),
    (122, pacfLag, 'Partial Autocorrelation Function'),
)
for position, values, heading in panels:
    plt.subplot(position)
    plt.plot(values)
    for level in (0, -band, band):
        plt.axhline(y=level, linestyle='--', color='gray')
    plt.title(heading)
def createarima(self, dataconfig):
    """Fit a seasonal ARIMA model described by a YAML config and save its artefacts.

    The config is read for the keys ``clean_data_address`` (CSV to train
    on), ``location`` (output directory), ``frequency`` (seasonality code),
    ``id`` and ``experimentname``. An order is searched with
    ``pm.auto_arima``, a ``SARIMAX`` model is refit with that order, and
    metrics, plots and the pickled model are written under ``location``.

    Parameters
    ----------
    dataconfig : str
        Path of the YAML configuration file.

    Returns
    -------
    dict
        ``Successful`` (always True), ``cleanDataPath``, ``metricsLocation``,
        ``pickleFolderPath``, ``pickleFilePath``, ``plotLocation`` and
        ``accuracy`` (element 3 of the metrics row).

    Notes
    -----
    Appends progress lines to ``logs.log`` in the working directory and
    appends the three figures to ``plot.html`` rather than overwriting it.
    """
    # NOTE(review): yaml.load with FullLoader - prefer yaml.safe_load if the
    # config file can come from an untrusted source.
    with open(dataconfig) as f:
        dataconfigfile = yaml.load(f, Loader=FullLoader)

    metrics = pd.DataFrame(columns=[
        'modelname', 'mean_absolute_error', 'mean_squared_error', 'r2_score',
        'mean_squared_log_error'
    ])
    data = pd.read_csv(dataconfigfile["clean_data_address"])
    output_dir = dataconfigfile["location"]
    choice = dataconfigfile['frequency']

    # Seasonal period per frequency code; unknown codes fall back to 12.
    diction = {
        "D": 7,
        "W": 52,
        "M": 12,
        "Q": 4,
        "Y": 2,
    }
    freq = diction.get(choice, 12)
    print("frequency", freq)

    # The ``with`` block closes the file; no explicit close() is needed.
    with open("logs.log", "a+") as f:
        f.write("Frequency=" + str(freq) + "\n")
        f.write("Creating Arima models\n")
        f.write("Please wait trying different models...\n")
        f.write("Trained on several models\n")
        f.write("Selecting best model\n")

    # warnings.filterwarnings("ignore")
    # sys.stdout=open("logs.log","a+")
    # Stepwise order search, run inside a StepwiseContext(max_dur=15).
    with StepwiseContext(max_dur=15):
        model = pm.auto_arima(data,
                              stepwise=True,
                              error_action='ignore',
                              seasonal=True,
                              m=freq,
                              trace=True)
    # sys.stdout.close()
    #metrics=met.calculate_metrics("fbprophet","Regression",testpred,testactual)

    selected = model.get_params(deep=False)
    order = selected['order']
    seasonal = selected['seasonal_order']
    print("order=", order)
    print("seasonal", seasonal)
    print("frequency", freq)

    # Refit with the selected orders to get the model that is persisted.
    modelfinal = SARIMAX(data, order=order, seasonal_order=seasonal).fit()

    # NOTE(review): predictions are requested for positions 1..len(data) and
    # then relabelled with data.index (positions 0..len(data)-1), which looks
    # like a one-step misalignment against data['y'] - confirm whether
    # start=0, end=len(data) - 1 was intended. Left unchanged here.
    start = 1
    end = len(data)
    compare = modelfinal.predict(start=start, end=end, typ='levels')
    compare.index = data.index

    metrics_new_row = met.calculate_metrics("arima", "Regression", data['y'],
                                            compare)
    metricsLocation = os.path.join(output_dir, "metrics.csv")
    metrics.loc[len(metrics.index)] = metrics_new_row
    metrics.to_csv(metricsLocation, index=True)
    r2score = metrics_new_row[3]

    # Actual vs. predicted values.
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=data.index, y=data.y, name="actual"))
    fig.add_trace(
        go.Scatter(x=compare.index, y=compare, name="predictions"))
    plotlocation = os.path.join(output_dir, "plot.html")

    # ACF / PACF of the target column, each wrapped in a one-column frame
    # for self.plot_graphs.
    acf_ = pd.DataFrame(acf(data['y']), columns=['data'])
    pacf_ = pd.DataFrame(pacf(data['y']), columns=['data'])
    fig2 = self.plot_graphs(acf_, "Auto correlative function")
    fig3 = self.plot_graphs(pacf_, "Partial-Auto correlative funtion")

    with open(plotlocation, 'a') as f:
        for figure in (fig, fig2, fig3):
            f.write(figure.to_html(include_plotlyjs='cdn', full_html=False))

    # modelfinal=auto_arima(data['y'], trace=True,suppress_warnings=True, seasonal=True)
    location = os.path.join(output_dir, str(dataconfigfile["id"]) + "_model")
    # exist_ok: a second run for the same id must not fail on the existing
    # model directory.
    os.makedirs(location, exist_ok=True)
    name = str(dataconfigfile["experimentname"]) + str(
        dataconfigfile["id"]) + "_model"
    # modelfinal.save(name)
    pickleFilePath = os.path.join(location, name)
    with open(pickleFilePath, 'wb') as pkl:
        pickle.dump(modelfinal, pkl)
    # shutil.move(name,location)

    return {
        "Successful": True,
        "cleanDataPath": dataconfigfile["clean_data_address"],
        "metricsLocation": metricsLocation,
        "pickleFolderPath": location,
        "pickleFilePath": pickleFilePath,
        "plotLocation": plotlocation,
        "accuracy": r2score
    }
##5.3.1 序列的相关性检验 from statsmodels.graphics.tsaplots import acf,plot_acf np.round(acf(y2),3) plot_acf(y1); # MR(1)模型的自相关系数 def ac_QP(Yt): import statsmodels.api as sm r,q,p = sm.tsa.acf(Yt, qstat=True) rqp=np.c_[r[1:], q, p] rqp=pd.DataFrame(rqp, columns=["AC", "Q", "Prob(>Q)"]); return(rqp) ac_QP(y2)[:10] from statsmodels.graphics.tsaplots import pacf,plot_pacf np.round(pacf(y1),3) plot_pacf(y2); # AR(1)模型的自相关系数 ##5.3.2 ARMA 模型建立与检验 plot_acf(y3); plot_pacf(y3); import statsmodels.tsa.stattools as ts ts.arma_order_select_ic(y1,max_ar=3,max_ma=3,ic=['aic','bic','hqic']) ts.arma_order_select_ic(y1,max_ar=3,max_ma=3,ic=['aic','bic','hqic']) ts.arma_order_select_ic(y3,max_ar=3,max_ma=3,ic=['aic', 'bic','hqic']) from statsmodels.tsa.arima_model import ARMA y1_arma=ARMA(y1,order=(1,0)).fit() y1_arma.summary()