def predict_ARMA(df_lrt, time_index, days_to_forecast):
    """Forecast every column with an ARMA model and standardise the move.

    For each column of ``df_lrt`` an ARMA(1, 1) model is fitted on the (up
    to) 30 rows that precede ``time_index``.  If that fit raises, an
    ARMA(0, 0) model is fitted instead.  The last value of the
    ``days_to_forecast``-step forecast is compared with the last observed
    row of the window and divided by the window's per-column standard
    deviation.

    Args:
        df_lrt: DataFrame with one column per asset.
        time_index: Positional (``iloc``) index of the first row after the
            fitting window.
        days_to_forecast: Number of steps passed to ``forecast``.

    Returns:
        ``(forecast - last observed value) / std`` for each column, i.e. the
        result of dividing a NumPy array by ``df.std(axis=0)``.
    """
    import numpy as np
    import statsmodels.tsa.api as smt

    # For 0 <= time_index < 30 the start would be negative, which makes
    # ``iloc`` count from the END of the frame; clamp it to the first row.
    window_start = time_index - 30
    if time_index >= 0 and window_start < 0:
        window_start = 0
    df = df_lrt.iloc[window_start:time_index, :]

    prediction = np.empty(len(df.columns))
    for i, asset in enumerate(df.columns):
        try:
            model_fit = smt.ARMA(df[asset], (1, 1)).fit()
        except Exception:
            # ARMA(1, 1) could not be fitted: fall back to ARMA(0, 0).
            # ``Exception`` (not a bare except) lets Ctrl-C still interrupt.
            model_fit = smt.ARMA(df[asset], (0, 0)).fit()
        prediction[i] = model_fit.forecast(steps=days_to_forecast)[0][-1]
    # Could be improved by implementing auto-ARMA to choose the AR/MA orders.

    return (prediction - df.iloc[-1, :].values) / df.std(axis=0)
def _gdp_vol_estimate(self, variables):
    """Estimate conditional volatilities of GDP growth.

    An ARMA model with ``order=(1, 0)`` is fitted to ``variables['gdp']``.
    The absolute values of its residuals are then fitted with a second
    ``order=(1, 0)`` model, and that model's fitted values are returned.
    """
    gdp_fit = tsa.ARMA(variables['gdp'], order=(1, 0)).fit(disp=False)
    abs_resid = np.abs(gdp_fit.resid)
    vol_fit = tsa.ARMA(abs_resid, order=(1, 0)).fit(disp=False)
    return vol_fit.fittedvalues
def analyse_ts_arma(self, data):
    """Grid-search ARMA(p, q) orders on the logged ``Close`` series by BIC.

    Every order with ``p`` and ``q`` in ``range(5)`` is fitted (``mle``, no
    constant).  The model with the smallest BIC is logged and its residuals
    are plotted through ``self.g.tsplot``.

    Args:
        data: Input passed to ``self.__logged_data``; the ``Close`` attribute
            of the result is the series that is modelled.

    Raises:
        RuntimeError: If none of the candidate orders could be fitted.
    """
    ts = self.__logged_data(data).Close
    best_ic = np.inf
    best_order = None
    best_mdl = None
    rng = range(5)  # orders greater than 5 are not practically useful
    for i in rng:
        for j in rng:
            try:
                tmp_mdl = smt.ARMA(ts, order=(i, j)).fit(
                    method='mle', trend='nc'
                )
                tmp_ic = tmp_mdl.bic  # using bic here
            except Exception:
                # This order could not be fitted; skip it.  Catching
                # ``Exception`` keeps KeyboardInterrupt/SystemExit working.
                continue
            logn('ic={}, order=({}, {})'.format(tmp_ic, i, j))
            if tmp_ic < best_ic:
                best_ic = tmp_ic
                best_order = (i, j)
                best_mdl = tmp_mdl

    if best_mdl is None:
        # Previously this surfaced as an AttributeError on ``None``.
        raise RuntimeError('no ARMA order in range(5) x range(5) could be fitted')

    logn(best_mdl.summary())
    logn('using BIC', '='*20, sep='\n')
    logn('ic: {:6.5f} | estimated order: {}'.format(best_ic, best_order))
    logn('estimated alphas = {}'.format(best_mdl.arparams))
    logn('estimated betas = {}'.format(best_mdl.maparams))
    self.g.tsplot(best_mdl.resid,
                  lags=self.__n_lags,
                  saveas='ts_arma{}{}_residuals.png'.format(
                      best_order[0], best_order[1]
                  ))
def learn_model_params(self, train_values):
    """Fit an ARMA(0, window_size) model and cache model, fit and params.

    Stores the unfitted model in ``self.__model``, the fit result in
    ``self.__model_fit`` and the fitted parameters in ``self.__params``.

    Args:
        train_values: Series handed to ``smt.ARMA`` as the data to fit.
    """
    window = self.__window_size
    # The model is stored before fitting so it is kept even if ``fit`` raises.
    self.__model = smt.ARMA(train_values, order=(0, window))
    fitted = self.__model.fit(maxlag=window, method='mle', trend='c', disp=-1)
    self.__model_fit = fitted
    self.__params = fitted.params
def create_arma(_df, store_items, filepath, exceptional):
    """Fit and pickle an ARMA(1, 1) model for each (store, item) pair.

    For every pair the matching rows of ``_df`` are sorted by ``date2`` and
    re-indexed with consecutive daily dates starting at 2012-01-01.  An
    ARMA(1, 1) model is then fitted to the ``log1p`` column.  Successful
    fits are pickled to ``filepath + '<store>_<item>.pkl'``; pairs whose
    model cannot be built or fitted are appended to ``exceptional``.

    Args:
        _df: DataFrame with at least ``store_nbr``, ``item_nbr``, ``date2``
            and ``log1p`` columns.
        store_items: Iterable of ``(store_nbr, item_nbr)`` pairs.
        filepath: Prefix prepended to each pickle file name.
        exceptional: List that receives ``[store_nbr, item_nbr]`` for every
            pair that failed (mutated in place).

    Returns:
        List with the number of dates used for each pair whose model could
        be constructed (appended before fitting, so it also includes pairs
        whose ``fit`` later failed).
    """
    extra_arr = []
    # Loop-invariant: every series is re-dated starting from this day.
    start = datetime.datetime.strptime("2012-01-01", "%Y-%m-%d")
    for sno, ino in store_items:
        df = _df[(_df.store_nbr == sno) & (_df.item_nbr == ino)].copy()
        df = df.set_index('date2', drop=False)
        df = df.sort_index()
        date_list = [
            start + relativedelta(days=x) for x in range(0, len(df.index))
        ]
        df['index'] = date_list
        df.set_index(['index'], inplace=True)
        df.index.name = None
        try:
            arma = tsa.ARMA(df.log1p, order=(1, 1))
            extra_arr.append(len(date_list))
            results = arma.fit()
            print(sno, ino, len(date_list))
        except Exception:
            # Best effort: record the failing pair and carry on.  Catching
            # ``Exception`` keeps KeyboardInterrupt/SystemExit working.
            exceptional.append([sno, ino])
        else:
            # ``with`` closes the file even if pickling raises.
            with open(filepath + str(sno) + '_' + str(ino) + '.pkl', 'wb') as out:
                pickle.dump(results, out)
    # Number of dates each (store, item) model was built on.
    return extra_arr
def best_arma(df):
    '''Find the best ARMA order (by AIC) for every column of ``df``.

    For each column all orders with ``p`` and ``q`` in ``range(1, 5)`` are
    fitted (``mle``, with constant); the fit with the smallest AIC wins.

    Args:
        df: DataFrame whose columns are the series to model.

    Returns:
        1-D object array laid out as
        ``[best_order, best_model, best_order, best_model, ...]`` with one
        pair per column.  ``best_order`` is ``(column, p, q)``; both entries
        are ``None`` when no order could be fitted for that column.
    '''
    found = []  # alternating best_order / best_model entries
    rng = range(1, 5)  # largest p and q tried is 4
    for col in list(df):
        best_aic = np.inf  # smallest AIC wins
        best_order = None
        best_mdl = None
        for p in rng:
            for q in rng:
                try:
                    tmp_mdl = smt.ARMA(df[col], order=(p, q)).fit(method='mle', trend='c')
                    tmp_aic = tmp_mdl.aic
                except Exception:
                    # This order could not be fitted; try the next one.
                    continue
                print(tmp_aic)
                if tmp_aic < best_aic:
                    best_aic = tmp_aic
                    best_order = (col, p, q)
                    best_mdl = tmp_mdl
        found += [best_order, best_mdl]

    # Fill an object array element by element: ``np.array(found)`` would try
    # to broadcast the order tuples against the model objects, which recent
    # NumPy versions reject as a ragged sequence.
    result = np.empty(len(found), dtype=object)
    for pos, item in enumerate(found):
        result[pos] = item
    return result
def delay_ARMA_model(filename, lag, delay):
    """Grid-search the ARMA order and input delay with the smallest AIC.

    The output series is modelled with the input series as ``exog``.  For
    every delay ``d`` in ``range(delay)`` the output is shifted forward by
    ``d`` samples relative to the input, and every order ``(i, j)`` with
    ``i, j`` in ``range(lag)`` is fitted.  The best AIC, order and delay
    are printed; nothing is returned.

    Args:
        filename: Passed to ``get_input_output``, whose result is indexed as
            ``[0]`` for the output series and ``[1]`` for the input series.
        lag: Exclusive upper bound for both AR and MA orders.
        delay: Exclusive upper bound for the delay.
    """
    value = get_input_output(filename)
    output_data = value[0]
    input_data = value[1]

    best_aic = np.inf
    best_order = None
    best_delay = None

    n_input = len(input_data)
    for d in range(delay):
        output_tmp = output_data[d:]
        # ``input_data[:-d]`` is EMPTY for d == 0, so delay 0 could never be
        # fitted; slice by explicit length instead.
        input_tmp = input_data[:n_input - d]
        for i in range(lag):
            for j in range(lag):
                try:
                    tmp_mdl = smt.ARMA(np.array(output_tmp), order=(i, j),
                                       exog=np.array(input_tmp)).fit(
                        method='mle', trend='nc')
                    tmp_aic = tmp_mdl.aic
                except Exception:
                    # This (delay, order) combination could not be fitted.
                    continue
                if tmp_aic < best_aic:
                    best_aic = tmp_aic
                    best_order = (i, j)
                    best_delay = d

    print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
    print('delay', best_delay)
def fit_ARMA(self, y, order_ar, order_ma, maxLag=30):
    '''Fit an autoregressive moving-average (ARMA) model of a given order.

    This function does not search for the best order; it fits exactly
    ``(order_ar, order_ma)`` and prints the summary.

    Arguments:
    ----------
    y        : numpy array with signal (demeaned IN PLACE when its mean is
               non-zero)
    order_ar : order of autoregression (AR) linear model
    order_ma : order of moving average (MA) linear model
    maxLag   : max lag, forwarded to ``fit`` as ``maxlag``

    Return:
    ----------
    mdl : fitted model object
    '''
    # Demean the signal.  The mean is computed ONCE: recomputing it inside a
    # per-element loop would use a mean that drifts as elements are
    # overwritten, giving wrong values at O(n^2) cost.
    mean_y = np.mean(y)
    if mean_y != 0:
        y[:] = np.asarray(y) - mean_y  # in place, as callers saw before

    u = np.random.randn(len(y), 2)
    # NOTE(review): ``exog`` is handed to ``fit`` rather than to the ``ARMA``
    # constructor - confirm it has the intended effect.
    mdl = smt.ARMA(y, order=(order_ar, order_ma)).fit(maxlag=maxLag,
                                                      method='mle',
                                                      trend='nc',
                                                      exog=u)
    print(mdl.summary())
    return mdl
def bestfit_ARMA(self, y):
    '''Find the ARMA order with the lowest Akaike Information Criterion.

    Every order ``(i, j)`` with ``i, j`` in ``range(5)`` is fitted (``mle``,
    no constant).  The signal must be causal, stationary and invertible.

    Arguments:
    ----------
    y : signal to model

    Return:
    ----------
    best_mdl : fitted model with the smallest AIC, or ``None`` when no order
               could be fitted
    '''
    best_aic = np.inf
    best_order = None
    best_mdl = None

    u = np.random.randn(len(y), 2)
    rng = range(5)
    for i in rng:
        for j in rng:
            try:
                # NOTE(review): ``exog`` is handed to ``fit`` rather than to
                # the ``ARMA`` constructor - confirm the intended effect.
                tmp_mdl = smt.ARMA(y, order=(i, j)).fit(method='mle',
                                                        trend='nc',
                                                        exog=u)
                tmp_aic = tmp_mdl.aic
            except Exception:
                # This order could not be fitted; try the next one.
                continue
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_order = (i, j)
                best_mdl = tmp_mdl

    print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
    # Function-call form of print: the old statement form is a syntax error
    # on Python 3.  Skip the summary when nothing could be fitted.
    if best_mdl is not None:
        print(best_mdl.summary())
    return best_mdl
def fit_ARMAX(self, y, order_ar, order_ma, maxLag=30):
    '''Fit an autoregressive moving-average (ARMA) model with random exog.

    NB: NEEDS FIXING..

    Arguments:
    ---------
    y        : signal to model (demeaned IN PLACE when its mean is non-zero)
    order_ar : order of autoregression (AR) linear model
    order_ma : order of moving average (MA) linear model
    maxLag   : maximum lag, forwarded to ``fit`` as ``maxlag``

    Return:
    ---------
    mdl : fitted model object
    '''
    # Demean the signal.  The mean is computed ONCE: recomputing it inside a
    # per-element loop would use a mean that drifts as elements are
    # overwritten, giving wrong values at O(n^2) cost.
    mean_y = np.mean(y)
    if mean_y != 0:
        y[:] = np.asarray(y) - mean_y  # in place, as callers saw before

    u = np.random.randn(len(y), 2)
    # NOTE(review): ``exog`` is handed to ``fit`` rather than to the ``ARMA``
    # constructor - this may be what the "NEEDS FIXING" remark refers to.
    mdl = smt.ARMA(y, order=(order_ar, order_ma)).fit(maxlag=maxLag,
                                                      method='mle',
                                                      trend='nc',
                                                      exog=u)
    print(mdl.summary())
    return mdl
def ARMA_Predict(df, order, symbols, t=30):
    """Return the mean ``t``-step ARMA forecast for every symbol.

    Args:
        df: Mapping/DataFrame indexed by symbol to obtain each series.
        order: Sequence aligned with ``symbols``; ``order[i][0]`` and
            ``order[i][1]`` are the AR and MA orders for ``symbols[i]``.
        symbols: Sequence of column keys.
        t: Number of forecast steps.

    Returns:
        List with one value per symbol: the mean of its ``t`` forecasts.
    """
    predict = [0] * len(symbols)
    for idx, symbol in enumerate(symbols):
        fitted = smt.ARMA(
            df[symbol], order=(order[idx][0], order[idx][1])
        ).fit(method='mle', trend='c')
        path = fitted.forecast(steps=t)[0].reshape((t, 1))
        predict[idx] = np.mean(path)
    return predict
def ARMA_fit(endogenous, order, exog=None, trend='c'):
    '''
    Build a ``tsa.ARMA`` model and return the result of fitting it.

    Example: ARMA_fit(y, [3, 0, 1])
    Use results.summary() to print the results, or read results.params,
    .resid or .pvalues.

    :param endogenous: series passed to ``tsa.ARMA`` as its first argument
    :param order: model order, passed to ``tsa.ARMA`` as its second argument
    :param exog: forwarded to ``tsa.ARMA`` as ``exog``
    :param trend: forwarded to ``fit`` as ``trend``
    :return: the object returned by ``fit``
    '''
    model = tsa.ARMA(endogenous, order, exog=exog)
    results = model.fit(trend=trend)
    return results
def ma_model(code='399001'):
    """Fit an MA(3) model to the series for ``code`` and plot its residuals.

    The series comes from ``get_stock_data(code)``.  The fit summary is
    printed and the residuals, re-indexed like the input series, are handed
    to ``ts_plot``.

    Args:
        code: Identifier passed to ``get_stock_data`` and used as the prefix
            of the plot title.
    """
    max_lag = 30
    series = get_stock_data(code)
    fitted = smt.ARMA(series.values, order=(0, 3)).fit(
        maxlag=max_lag, method='mle', trend='nc')
    print(fitted.summary())
    residuals = pd.Series(fitted.resid, index=series.index)
    ts_plot(residuals, lags=max_lag, title=code + 'MA拟合残差')
def _getStartingVals(self):
    """Set ``self._startingValues`` from an ARMA fit, or to a default.

    Without data, the starting values are a vector of ``self._pnum`` entries
    equal to 0.5.  With data, an ARMA model of order
    ``(self._order['AR'], self._order['MA'])`` is fitted (with or without a
    constant depending on ``self._include_constant``) and its parameters are
    used.  If the fit raises ``ValueError`` the starting values become
    ``None``.
    """
    if self._data is None:
        self._startingValues = np.zeros((self._pnum, )) + 0.5
        return

    trend = 'c' if self._include_constant else 'nc'
    try:
        fitted = sm.ARMA(
            self._data.values,
            (self._order['AR'], self._order['MA']),
        ).fit(trend=trend)
        self._startingValues = fitted.params
    except ValueError:
        self._startingValues = None
def fit_arma_model_and_estimate_order(
        self, data, order=(0, 1), maxlag=None,
        method='mle', trend='nc', burnin=0
):
    """Fit an ARMA model of the given order and report its coefficients.

    Args:
        data: Series handed to ``smt.ARMA``.
        order: ``(p, q)`` order of the model.
        maxlag: Forwarded to ``fit``; defaults to ``self.__n_lags``.
        method: Forwarded to ``fit``.
        trend: Forwarded to ``fit``.
        burnin: Forwarded to ``fit``.

    Returns:
        Tuple ``(arparams, k_ar, maparams, k_ma)`` read from the fit result.
    """
    lags = self.__n_lags if maxlag is None else maxlag

    log('Fitting & estimating the ARMA model to the given data...')
    fit_options = dict(maxlag=lags, method=method, trend=trend, burnin=burnin)
    fitted = smt.ARMA(data, order=order).fit(**fit_options)
    logn('[Done]')
    logn(fitted.summary())

    return fitted.arparams, fitted.k_ar, fitted.maparams, fitted.k_ma
def _gdp_vol_estimate(self, variables):
    """Estimate long-run volatilities of GDP growth.

    An ARMA model with ``order=(1, 0)`` is fitted to ``variables['gdp']``.
    The absolute residuals are then smoothed with ``hpfilter`` and the trend
    component is returned.

    NOTE(review): an earlier description of this function mentioned a
    2-sided rolling window; the code uses ``hpfilter`` - confirm which is
    intended.

    Returns:
    --------
    trend:
        The trend component returned by ``hpfilter`` for the absolute
        residuals.
    """
    gdp_fit = tsa.ARMA(variables['gdp'], order=(1, 0)).fit(disp=False)
    abs_resid = np.abs(gdp_fit.resid)
    _cycle, long_run = sm_filters.tsa.filters.hpfilter(abs_resid)
    return long_run
def analyse_arma_p_q_best_ic(self, p=1, q=1):
    """Simulate an ARMA(p, q) series and recover its order by minimum BIC.

    A sample is drawn via ``self.get_sample_data`` and plotted.  Every order
    ``(i, j)`` with ``i, j`` in ``range(5)`` is then fitted (``mle``, no
    constant); the fit with the smallest BIC wins.  True and estimated
    coefficients are logged and the winning model's residuals are plotted.

    Args:
        p: AR order of the simulated process.
        q: MA order of the simulated process.

    Raises:
        RuntimeError: If none of the candidate orders could be fitted.
    """
    n = 5000
    burns = 2000

    a, b, rts = self.get_sample_data(
        m=SerialCorrelation.ModelType.arma, p=p, q=q, n=n, b=burns
    )
    self.g.tsplot(rts,
                  lags=self.__n_lags,
                  saveas='arma{}{}.png'.format(p, q))

    # Pick the best order by minimum information criterion (BIC here);
    # the smallest value wins.
    best_ic = np.inf
    best_order = None
    best_mdl = None
    rng = range(5)
    for i in rng:
        for j in rng:
            try:
                tmp_mdl = smt.ARMA(rts, order=(i, j)).fit(
                    method='mle', trend='nc'
                )
                tmp_ic = tmp_mdl.bic  # using bic here
            except Exception:
                # This order could not be fitted; skip it.  Catching
                # ``Exception`` keeps KeyboardInterrupt/SystemExit working.
                continue
            if tmp_ic < best_ic:
                best_ic = tmp_ic
                best_order = (i, j)
                best_mdl = tmp_mdl

    if best_mdl is None:
        # Previously this surfaced as an AttributeError on ``None``.
        raise RuntimeError('no ARMA order in range(5) x range(5) could be fitted')

    logn(best_mdl.summary())
    logn('using BIC', '='*20, sep='\n')
    logn('true order: ({}, {})'.format(p, q))
    logn('true alphas = {}'.format(a))
    logn('true betas = {}'.format(b))
    logn('ic: {:6.5f} | estimated order: {}'.format(best_ic, best_order))
    logn('estimated alphas = {}'.format(best_mdl.arparams))
    logn('estimated betas = {}'.format(best_mdl.maparams))

    # The residuals of a well-specified model should be white noise with no
    # serial correlation at any lag; plot them to check.
    self.g.tsplot(best_mdl.resid,
                  lags=self.__n_lags,
                  saveas='arma{}{}_residuals.png'.format(
                      best_order[0], best_order[1]
                  ))
def getbestar(self, data, symbol):
    """Select an AR lag order, fit an MA(3) model, then forecast or plot.

    The AIC-selected AR lag order of ``data[symbol]`` is shown in
    ``self.label_dikiful_2`` and reused as ``maxlag`` for an ARMA(0, 3) fit.
    Depending on ``self.checkBox_forecast`` the fitted model is passed to
    ``self.forecast`` or its residuals are plotted with ``self.tsplot``.

    Args:
        data: Mapping/DataFrame indexed by ``symbol`` to obtain the series.
        symbol: Key of the series to analyse.
    """
    max_lag = 30
    series = data[symbol]

    # Only the selected order is needed; a separate ``AR(...).fit(...)``
    # whose result was never used has been dropped.
    best_order = smt.AR(series).select_order(maxlag=max_lag,
                                             ic='aic',
                                             trend='nc')
    self.label_dikiful_2.setText(
        'best estimated lag order = {}'.format(best_order))

    best_mdl = smt.ARMA(series, order=(0, 3)).fit(maxlag=best_order,
                                                  method='mle',
                                                  trend='nc')
    if self.checkBox_forecast.isChecked():
        self.forecast(data, symbol, best_mdl,
                      int(self.sdiffspinBox_2.text()))
    else:
        self.tsplot(best_mdl.resid, symbol)
def MA():
    """Simulate an MA(1) process, plot it and fit an MA(1) model to it.

    A sample of 1000 points is generated with MA coefficient 0.6 and an AR
    coefficient of 0, plotted with ``tsplot`` and fitted with
    ``order=(0, 1)``; the fit summary is printed.
    """
    n_samples = 1000
    max_lag = 30

    # AR(p) alphas are set to 0 so that only the MA part is active.
    ar_coefs = np.array([0.])
    ma_coefs = np.array([0.6])

    # Add the zero-lag term and negate the alphas.
    ar_poly = np.r_[1, -ar_coefs]
    ma_poly = np.r_[1, ma_coefs]

    sample = smt.arma_generate_sample(ar=ar_poly, ma=ma_poly, nsample=n_samples)
    tsplot(sample, lags=30)

    fitted = smt.ARMA(sample, order=(0, 1)).fit(maxlag=max_lag,
                                                method='mle',
                                                trend='nc')
    print(fitted.summary())
def _transform_arma(self, variables, order):
    """Fit an ARMA model to ``variables['vol']`` and return its fitted values.

    Parameters:
    -----------
    variables:
        Container indexed with ``'vol'`` to obtain the series to model.
    order: tuple(int, int)
        The p and q terms for the ARMA(p, q) process.

    Returns:
    --------
    fittedvalues:
        The ``fittedvalues`` attribute of the fit result.
    """
    fitted = tsa.ARMA(variables['vol'], order=order).fit(disp=False)
    fitted_values = fitted.fittedvalues
    return fitted_values
def _get_best_arma(TS):
    """Return the ARMA fit with the smallest AIC over a 5 x 5 order grid.

    Every order ``(i, j)`` with ``i, j`` in ``range(5)`` is fitted (``mle``,
    no constant).  Orders whose fit raises are skipped.

    NOTE(review): ``freq`` and ``p`` are not defined in this function; they
    must exist in the enclosing module.  If ``freq`` is missing, the
    resulting NameError is swallowed for every order and ``(inf, None,
    None)`` is returned.

    Args:
        TS: Series to model.

    Returns:
        Tuple ``(best_aic, best_order, best_mdl)``; the last two are
        ``None`` when no order could be fitted.
    """
    best_aic = np.inf
    best_order = None
    best_mdl = None

    for i in range(5):
        for j in range(5):
            try:
                tmp_mdl = smt.ARMA(TS, order=(i, j)).fit(method='mle',
                                                         trend='nc',
                                                         freq=freq)
                tmp_aic = tmp_mdl.aic
            except Exception:
                # Catching ``Exception`` (not a bare except) keeps
                # KeyboardInterrupt/SystemExit working.
                continue
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_order = (i, j)
                best_mdl = tmp_mdl

    p('aic: {:6.5f} | order: {}'.format(best_aic, best_order))
    return best_aic, best_order, best_mdl
def arima_pred(df, arima_inds):
    """Flag rows whose in-sample prediction is an upper outlier.

    For each column of ``df`` a model is fitted to the non-NaN values and
    in-sample predictions replace those values in a copy of ``df``.  Rows
    whose predicted value exceeds ``mean + 3 * std`` of the column are
    marked with ``1`` in ``arima_inds``.

    Args:
        df: DataFrame with one series per column (NaNs allowed).
        arima_inds: DataFrame with the same columns and row labels as
            ``df``; it is modified in place and returned.

    Returns:
        ``arima_inds`` with outlier rows set to 1.
    """
    arima_preds = df.copy()
    for col in df.columns:
        observed = df[col].notna()
        arr = df.loc[observed, col].values
        # NOTE(review): a three-element ``order`` is passed to ``ARMA``;
        # confirm whether ARMA(0, 1) or an ARIMA(0, 1, 0) was intended.
        results = tsa.ARMA(arr, order=(0, 1, 0)).fit()
        res = results.predict(0, len(arr) - 1)

        # Write each prediction back to the row it was computed from.
        # Zipping predictions against the full index would shift them
        # whenever NaN rows are present, and assigning through
        # ``frame[col].loc[...]`` is chained assignment, which may write to
        # a temporary copy.
        arima_preds.loc[observed, col] = np.asarray(res)

        ts = arima_preds[col]
        threshold = 3 * ts.std() + ts.mean()  # could be changed to 2*sigma + mu
        flagged = ts.index[(ts > threshold).values]
        if len(flagged):
            arima_inds.loc[flagged, col] = 1
    return arima_inds
def evaluate_ARMA_models(y, AR_lim, MA_lim):
    """Fit a grid of ARMA orders and rank the convergent ones by AIC.

    Orders ``(i, j)`` with ``1 <= i <= AR_lim`` and ``0 <= j <= MA_lim`` are
    fitted (``mle``, no constant).  Fits raising ``ValueError`` or an
    "SVD did not converge" ``LinAlgError`` are reported and skipped; any
    other ``LinAlgError`` is re-raised.  The five best (AIC, order) pairs
    are printed.

    Args:
        y: Series to model.
        AR_lim: Largest AR order to try (inclusive).
        MA_lim: Largest MA order to try (inclusive).

    Returns:
        Tuple of ``(p, q)`` orders sorted by ascending AIC; empty when no
        model converged.
    """
    scored = []  # (aic, order) for every convergent fit
    for i in range(1, AR_lim + 1):
        for j in range(0, MA_lim + 1):
            try:
                mdlt = smt.ARMA(y, order=(i, j)).fit(method='mle',
                                                     trend='nc',
                                                     disp=-1)
                aic = mdlt.aic
            except ValueError:
                print(i, j, ' not convergent.')
            except np.linalg.LinAlgError as e:
                if "SVD did not converge" in str(e):
                    print(i, j, ' not convergent.')
                else:
                    raise
            else:
                # Storing the pair together keeps order and AIC in step.
                scored.append((aic, (i, j)))

    print('Number of convergent models = ', len(scored))
    if not scored:
        # Unpacking ``zip(*[])`` would raise; return an empty ranking.
        return ()

    scored.sort()
    for aic, order in scored[:5]:
        print(aic, order)
    return tuple(order for _, order in scored)
def MA3():
    """Fit an MA(3) model to SPY log returns and plot its residuals.

    Adjusted close prices for SPY, TLT and MSFT between 2008-01-01 and
    2017-01-01 are downloaded through ``pandas_datareader`` (source
    ``'yahoo'``) and converted to log returns.  An ARMA(0, 3) model is
    fitted to the SPY returns, its summary is printed and diagnostic plots
    of the residuals are drawn.  Requires an IPython session
    (``get_ipython``).
    """
    import os
    import sys

    import pandas as pd
    import pandas_datareader.data as web
    import numpy as np

    import statsmodels.formula.api as smf
    import statsmodels.tsa.api as smt
    import statsmodels.api as sm
    import scipy.stats as scs
    from arch import arch_model

    import matplotlib.pyplot as plt
    import matplotlib as mpl

    get_ipython().magic('matplotlib inline')

    last_day = '2017-01-01'
    first_day = '2008-01-01'

    def fetch_adj_close(ticker):
        # Adjusted close column of the downloaded price table.
        table = web.DataReader(ticker, 'yahoo', start=first_day, end=last_day)
        return table['Adj Close']

    tickers = ['SPY', 'TLT', 'MSFT']

    # Raw adjusted close prices, one column per ticker.
    closes = {}
    for ticker in tickers:
        closes[ticker] = fetch_adj_close(ticker)
    prices = pd.DataFrame(closes)

    # Log returns.
    log_returns = np.log(prices / prices.shift(1)).dropna()

    def tsplot(y, lags=None, figsize=(10, 8), style='bmh'):
        # Series, ACF, PACF, QQ and probability plots on one figure.
        series = y if isinstance(y, pd.Series) else pd.Series(y)
        with plt.style.context(style):
            plt.figure(figsize=figsize)
            grid = (3, 2)
            ts_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
            acf_ax = plt.subplot2grid(grid, (1, 0))
            pacf_ax = plt.subplot2grid(grid, (1, 1))
            qq_ax = plt.subplot2grid(grid, (2, 0))
            pp_ax = plt.subplot2grid(grid, (2, 1))

            series.plot(ax=ts_ax)
            ts_ax.set_title('Time Series Analysis Plots')
            smt.graphics.plot_acf(series, lags=lags, ax=acf_ax, alpha=0.5)
            smt.graphics.plot_pacf(series, lags=lags, ax=pacf_ax, alpha=0.5)
            sm.qqplot(series, line='s', ax=qq_ax)
            qq_ax.set_title('QQ Plot')
            scs.probplot(series, sparams=(series.mean(), series.std()), plot=pp_ax)

            plt.tight_layout()
        return

    # Fit MA(3) to SPY returns.
    max_lag = 30
    spy_returns = log_returns.SPY
    fitted = smt.ARMA(spy_returns, order=(0, 3)).fit(
        maxlag=max_lag, method='mle', trend='nc')
    print(fitted.summary())
    tsplot(fitted.resid, lags=max_lag)
ibm_df.Close_Price.plot() # Plot ACF and PACF ibm_df = ibm_df.dropna() plot_acf(ibm_df.Close_Price, lags=50) plot_pacf(ibm_df.Close_Price, lags=50) # QQ plot and probability plot sm.qqplot(ibm_df['Close_Price'], line='s') # Optimize ARMA parameters aicVal = [] for ari in range(1, 3): for maj in range(0, 3): arma_obj = smtsa.ARMA(ibm_df.Close_Price.tolist(), order=(ari, maj)).fit(maxlag=30, method='mle', trend='nc') aicVal.append([ari, maj, arma_obj.aic]) arma_obj_fin = smtsa.ARMA(ibm_df.Close_Price.tolist(), order=(1, 0)).fit(maxlag=30, method='mle', trend='nc') ibm_df['ARMA'] = arma_obj_fin.predict() arma_obj_fin.summary() # Plot the curves f, axarr = plt.subplots(1, sharex=True) f.set_size_inches(5.5, 5.5) ibm_df['Close_Price'].iloc[1:].plot(color='b', linestyle='-', ax=axarr) ibm_df['ARMA'].iloc[1:].plot(color='r', linestyle='--', ax=axarr)
# Generate MA(1) dataset ar = np.r_[1, -0] ma = np.r_[1, 0.7] ma1_data = smtsa.arma_generate_sample(ar=ar, ma=ma, nsample=n) plotds(ma1_data) # Generate MA(2) dataset ar = np.r_[1, -0] ma = np.r_[1, 0.6, 0.7] ma2_data = smtsa.arma_generate_sample(ar=ar, ma=ma, nsample=n) plotds(ma2_data) # Generate MA(3) dataset ar = np.r_[1, -0] ma = np.r_[1, 0.6, 0.7, 0.5] ma3_data = smtsa.arma_generate_sample(ar=ar, ma=ma, nsample=n) plotds(ma3_data) # Build MA(1) model ma1 = smtsa.ARMA(ma1_data.tolist(), order=(0, 1)).fit(maxlag=30, method="mle", trend="nc") ma1.summary() # Build MA(3) model ma3 = smtsa.ARMA(ma3_data.tolist(), order=(0, 3)).fit(maxlag=30, method="mle", trend="nc") ma3.summary()
# Script fragment: stationarity checks on ``djia_df['Close']`` before and
# after first-order differencing.  ``djia_df``, ``sm``, ``smtsa``, ``plt``
# and ``plotds`` are defined outside this fragment.

# Compare mean and variance of the first 125 rows with the remaining rows.
mean1, mean2 = djia_df.iloc[:125].Close.mean(), djia_df.iloc[125:].Close.mean()
var1, var2 = djia_df.iloc[:125].Close.var(), djia_df.iloc[125:].Close.var()
print('mean1=%f, mean2=%f' % (mean1, mean2))
print('variance1=%f, variance2=%f' % (var1, var2))

# ADF Test on the raw close values
from statsmodels.tsa.stattools import adfuller
adf_result = adfuller(djia_df.Close.tolist())
print('ADF Statistic: %f' % adf_result[0])
print('p-value: %f' % adf_result[1])

# QQ plot and probability plot
sm.qqplot(djia_df['Close'], line='s')

# Optimize ARMA parameters (Will return a non-stationary error)
# NOTE(review): if that error is raised, execution of a plain script stops
# here - confirm this is only meant to be run cell by cell.
arma_obj = smtsa.ARMA(djia_df['Close'].tolist(), order=(1, 1)).fit(maxlag=30, method='mle', trend='nc')

# Let us plot the original time series and first-differences
first_order_diff = djia_df['Close'].diff(1).dropna()
fig, ax = plt.subplots(2, sharex=True)
fig.set_size_inches(5.5, 5.5)
djia_df['Close'].plot(ax=ax[0], color='b')
ax[0].set_title('Close values of DJIA during Jan 2016-Dec 2016')
first_order_diff.plot(ax=ax[1], color='r')
ax[1].set_title('First-order differences of DJIA during Jan 2016-Dec 2016')

# plot signal
plotds(first_order_diff, nlag=50)

# Repeat the ADF test on the differenced series
adf_result = adfuller(first_order_diff)
print('ADF Statistic: %f' % adf_result[0])
print('p-value: %f' % adf_result[1])
plot_pacf(dado_ar3['Preco']); # %% [markdown] # ## Ajuste dos modelos AR # %% dado_ar1_p1 = dado_ar1[0:500] dado_ar1_p2 = dado_ar1[500:600] # %% import statsmodels.tsa.api as smtsa # %% modelo_ar1 = smtsa.ARMA(dado_ar1_p1['Preco'], order=(1, 0)).fit() # %% modelo_previsto1 = modelo_ar1.predict(start=500,end=599) # %% plt.figure(figsize=(12,8)) plt.plot(dado_ar1_p1['Data'],dado_ar1_p1['Preco']) plt.plot(dado_ar1_p2['Data'],dado_ar1_p2['Preco']) plt.plot(dado_ar1_p2['Data'], modelo_previsto1,'r.') # %% dado_ar3_p1 = dado_ar3[0:500]
def main(job_no, sex, chroms, rank, dir):
    """Fit and save an ARMA model of mutation-rate residuals per chromosome.

    For each chromosome the recombination table
    ``<dir>/recomb_table_all_sexes_ch<chrom>.csv`` is read and missing data
    are corrected via ``recombination.correct_missing_data``.  The variant
    rate is regressed on the standardised recombination rate for ``sex``;
    the residuals are modelled with the ARMA order of the requested rank and
    the model is saved to ``<dir>/ARMA_model_ch<chrom>_<job_no>.pklz``.
    Script and environment details are logged through ``LOGGER``.

    Args:
        job_no: String appended to the log and output file names.
        sex: Suffix selecting the ``stdrate_<sex>`` column.
        chroms: Comma-separated chromosome names, or ``None`` for 1-22.
        rank: Index into the AIC-sorted orders (0 is the best model).
        dir: Working directory; created when it does not exist.
    """
    start_time = time()
    if not os.path.exists(dir):
        os.makedirs(dir)
    # NOTE(review): the original author tagged this line with "change".
    LOGGER.log_file_path = dir + "/" + str(
        os.path.basename(__file__)) + job_no + ".log"
    LOGGER.log_args()
    LOGGER.log_message(get_file_hexdigest(__file__),
                       label="Hex digest of script.".ljust(17))
    try:
        LOGGER.log_message(str(os.environ['CONDA_DEFAULT_ENV']),
                           label="Conda environment.".ljust(17))
    except KeyError:
        # Not running inside a conda environment; nothing to log.
        pass
    # Log name and version of every analysis library in use.
    for module in (np, pd, scipy, statsmodels, sklearn):
        LOGGER.log_message('Name = ' + module.__name__ + ', version = ' +
                           module.__version__,
                           label="Imported module ")

    pd.set_option('display.max_columns', None)
    if chroms is None:
        chroms = np.arange(1, 23).astype(str).tolist()
    else:
        chroms = chroms.split(',')
    if rank:
        LOGGER.log_message("%1d" % rank,
                           label="Rank of model to select (best=0).".ljust(30))

    for chrom in chroms:
        csv_name = dir + '/recomb_table_all_sexes_ch' + chrom + '.csv'
        # Opened only so the logger can record the input; ``with`` closes
        # the handle even if the logger raises.
        with open(csv_name, 'r') as infile:
            LOGGER.input_file(infile.name)
        data_table = pd.read_csv(csv_name, sep=',', index_col=0)
        data_table = recombination.correct_missing_data(
            data_table, 'LOCF', sex)
        std_col = 'stdrate_' + sex
        std_rates = data_table[std_col].values
        # Columns 5-16 are summed into a per-row variant count.
        variants_profiled = data_table.iloc[:, np.arange(5, 17)]
        variant_counts = variants_profiled.sum(axis=1)
        var_rates = variant_counts / 10000
        print('\n\nChromosome number = ' + chrom)
        print('Avge. mutation rate = ', np.mean(var_rates))

        # Remove the linear dependence on the recombination rate.
        xvals = std_rates.reshape(-1, 1)
        lmodel = LinearRegression()
        lmodel.fit(xvals, var_rates)
        residuals = var_rates - lmodel.predict(xvals)
        sys.stdout.flush()
        print('Slope, intercept, mean of residuals = ',
              '%.8f' % lmodel.coef_[0], '%.8f' % lmodel.intercept_,
              '%.12f' % np.mean(residuals))

        orders = recombination.evaluate_ARMA_models(residuals, 10, 4)
        best_order = orders[rank]
        best_mdl = smt.ARMA(residuals, order=best_order).fit(method='mle',
                                                             trend='nc',
                                                             disp=0)
        print(best_mdl.summary())
        outfile_name = dir + '/ARMA_model_ch' + chrom + '_' + job_no + '.pklz'
        recombination.save_model_details(best_mdl, outfile_name)
        # Re-opened read-only so the logger can record the output file.
        with open(outfile_name, 'r') as outfile:
            LOGGER.output_file(outfile.name)

    duration = time() - start_time
    LOGGER.log_message("%.2f" % (duration / 60.),
                       label="run duration (minutes)".ljust(30))
#
# ## Let's use a systematic approach to finding the order of AR and MA processes.

# In[23]:

# Pick the best order by AIC: the smallest AIC value wins.  Every order
# (i, j) with i, j in range(5) is fitted to ``arma22`` (defined outside this
# cell); orders whose fit raises are skipped.
best_aic = np.inf
best_order = None
best_mdl = None

rng = range(5)
for i in rng:
    for j in rng:
        try:
            tmp_mdl = smt.ARMA(arma22, order=(i, j)).fit(method='mle', trend='nc')
            tmp_aic = tmp_mdl.aic
            if tmp_aic < best_aic:
                best_aic = tmp_aic
                best_order = (i, j)
                best_mdl = tmp_mdl
        except Exception:
            # ``Exception`` rather than a bare except, so that interrupting
            # the kernel (KeyboardInterrupt) still stops the search.
            continue

print('aic: {:6.5f} | order: {}'.format(best_aic, best_order))

# ## We've correctly identified the order of the simulated process as ARMA(2,2).
#
# ### Let's use it for the sales time-series.
#