class ARIMAModelResult:
    """Fit one ARIMA(p, d, q) candidate and score it on a hold-out set.

    Instances compare by ``model_fitness`` (mean squared error between the
    hold-out data and the forecast), so a lower value sorts first and
    ``min(results)`` picks the best-scoring candidate.

    Because ``__eq__`` is defined without ``__hash__``, instances are
    unhashable on Python 3; that is unchanged from the original class.
    """

    def __init__(self, autoregressive_periods, integrated_order,
                 moving_average_model_periods, training_data, test):
        """Fit the model on ``training_data`` and evaluate it against ``test``.

        Args:
            autoregressive_periods: AR order (p).
            integrated_order: differencing order (d).
            moving_average_model_periods: MA order (q).
            training_data: series handed to ``ARIMA`` for fitting.
            test: hold-out observations; its length sets the forecast horizon.
        """
        self.autoregressive_periods = autoregressive_periods
        self.integrated_order = integrated_order
        self.moving_average_model_periods = moving_average_model_periods
        self.model = ARIMA(
            training_data,
            order=(
                self.autoregressive_periods,
                self.integrated_order,
                self.moving_average_model_periods,
            ),
        )
        self.fit = self.model.fit()
        self.aic = self.fit.aic
        # Only the first element of forecast()'s return value is kept as the
        # point forecasts that are scored below.
        self.predictions = self.fit.forecast(steps=len(test))[0]
        self.model_fitness = mean_squared_error(test, self.predictions)

    # The comparison methods return NotImplemented for objects that carry no
    # ``model_fitness`` so that ``result == 5`` is simply False and
    # ``result < 5`` raises TypeError, instead of leaking an AttributeError.

    def __eq__(self, other):
        if not hasattr(other, "model_fitness"):
            return NotImplemented
        return self.model_fitness == other.model_fitness

    def __lt__(self, other):
        if not hasattr(other, "model_fitness"):
            return NotImplemented
        return self.model_fitness < other.model_fitness

    def __gt__(self, other):
        if not hasattr(other, "model_fitness"):
            return NotImplemented
        return self.model_fitness > other.model_fitness

    def __str__(self):
        # The label text (including its historical spelling) is kept
        # byte-for-byte because callers may parse or compare this output.
        return "Autoregressive periods: {}\nIntegraded Order: {}\nMoving Average Model Periods: {}\n Predictions: {}\nMSE: {}".format(
            self.autoregressive_periods,
            self.integrated_order,
            self.moving_average_model_periods,
            self.predictions,
            self.model_fitness,
        )
def ARIMA_forcast2(self):
    """Rolling one-step ARIMA(1,1,1) forecast over a 288-point window, then plot.

    Each iteration refits the model on ``y``, records the one-step forecast
    and appends the next value of ``train`` to ``y`` before repeating.
    Reads the module-level ``vr_df2_ts`` and draws onto the current
    matplotlib figure; nothing is returned.
    """
    import warnings
    # Silences every warning for the rest of the process, not only here.
    warnings.filterwarnings('ignore')

    # No log transform is applied; the raw values are modelled directly.
    y = vr_df2_ts.values
    train = vr_df2_ts.values[286:574]
    # NOTE(review): y starts out as the complete series, so the values
    # appended below are already contained in it - confirm this is intended.
    prediction = []
    for step in range(288):
        fitted = ARIMA(y, order=(1, 1, 1)).fit(disp=-1)
        prediction.append(fitted.forecast()[0])
        y = np.append(y, train[step])

    window_index = pd.date_range(start='2017-02-09 00:00:00', periods=288, freq='5min')
    forecast = pd.Series(prediction, index=window_index)

    # Re-index the matching slice of the source data onto the same timestamps.
    exog = vr_df2_ts.iloc[286:574]
    exog.set_index(window_index, inplace=True)

    plt.plot(vr_df2_ts)
    plt.plot(exog, 'g')
    plt.plot(forecast, 'r')
def arima_predict(train_dat, n_predictions, p=2, d=0, q=0):
    """Fit ARIMA(p, d, q) on ``train_dat`` and predict the next ``n_predictions`` steps.

    Args:
        train_dat: training observations; converted to a float NumPy array.
        n_predictions: number of out-of-sample steps to predict.
        p, d, q: ARIMA order components.

    Returns:
        Whatever ``predict`` yields for positions ``len(train_dat)`` through
        ``len(train_dat) + n_predictions - 1`` (inclusive).
    """
    # ``np.float`` was a plain alias of the builtin ``float`` and no longer
    # exists in current NumPy; the builtin gives the same float64 dtype.
    endog = np.array(train_dat).astype(float)
    arima = ARIMA(endog, [p, d, q])
    fitted = arima.fit(trend='c', disp=False)
    first_step = len(train_dat)
    preds = fitted.predict(first_step, first_step + n_predictions - 1,
                           exog=None, dynamic=False)
    return preds
def forecast_by_cluster(self, hold_out_n, n_ahead, order, exog):
    """Forecast every cluster column of ``self.ds_agg_by_c`` with its own ARIMA model.

    Args:
        hold_out_n: when positive, that many trailing rows are dropped from
            the fitting data and ``exog`` is split at the same point.
        n_ahead: number of steps to forecast; with no hold-out it is also
            where ``exog`` is split.
        order: ARIMA order tuple passed to every model.
        exog: optional exogenous regressors, or ``None``.

    The result, an ``(n_ahead, self.n_clusters)`` array, is stored on
    ``self.ds_c_for``; nothing is returned.
    """
    series = self.ds_agg_by_c
    exog_fit = exog_future = None

    if hold_out_n > 0:
        # Hold-out validation: fit on everything except the trailing rows.
        series = series[:-hold_out_n]
        split = hold_out_n
    else:
        split = n_ahead
    if exog is not None:
        exog_fit, exog_future = exog[:-split], exog[-split:]

    forecasts = np.zeros((n_ahead, self.n_clusters))
    for c in tqdm(range(self.n_clusters)):
        history = series[:, c]
        if sum(history) != 0:
            fitted = ARIMA(history, exog=exog_fit, order=order).fit()
            forecasts[:, c] = fitted.forecast(n_ahead, exog=exog_future, alpha=.95)[0]
        else:
            # A column that sums to zero is not modelled; its forecast stays 0.
            forecasts[:, c] = 0

    self.ds_c_for = forecasts
def arimamodel(ts):
    """Fit ARIMA(2,1,2) models, print a 60-step forecast and plot the fit.

    Two models are fitted: one on the log series returned by ``trend(ts)``
    (used for the plot and the RSS in the title) and one on the raw ``ts``
    (used for the printed 60-step forecast).

    Args:
        ts: the original time series.
    """
    ts_log, ts_log_diff = trend(ts)
    model = ARIMA(ts_log, order=(2, 1, 2))
    result_ARIMA = model.fit(disp=-1)

    m = ARIMA(ts, order=(2, 1, 2)).fit()
    # The original wrapped the fitted result in ARMAResults(m, params='') and
    # called a misspelled ``forcast``, which always raised AttributeError.
    # The fitted result already provides ``forecast``.
    pre = m.forecast(steps=60)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(pre)

    # Map the predictions back to the original scale before plotting.
    predictions_ARIMA = backorg(result_ARIMA, ts_log)
    plt.plot(predictions_ARIMA)
    plt.plot(ts, color='red')
    plt.title('RSS: %.4F' % np.sum((result_ARIMA.fittedvalues - ts_log_diff)**2))
    plt.show()
def get_grouped_data(self, forecast=False):
    """Combine the cumulative and monthly-grouped series into one frame.

    The longer of the two series supplies the index. Both are expected to be
    named ``'event'``, since that column is read after ``to_frame()``.

    Args:
        forecast: when true, fit ARIMA(10,1,0) on the cumulative sum and
            append a 12-step ``'forecast'`` column.

    Returns:
        DataFrame with ``'cumulative sum'`` and ``'total added'`` columns,
        plus ``'forecast'`` when requested.
    """
    cdf = self.cumulative_sum()
    gdf = self.group_by('M')
    if cdf.shape[0] > gdf.shape[0]:
        df = cdf.to_frame()
        df.columns = ['cumulative sum']
        df['total added'] = gdf.to_frame()['event']
    else:
        df = gdf.to_frame()
        df.columns = ['total added']
        df['cumulative sum'] = cdf.to_frame()['event']

    if forecast:
        mtotals = pd.to_numeric(df['cumulative sum'], downcast='float')
        model_fit = ARIMA(mtotals, order=(10, 1, 0)).fit(disp=0)
        # Separate name so the boolean ``forecast`` argument is not clobbered.
        predicted = model_fit.forecast(steps=12)
        # NOTE(review): the forecast dates are hard-coded rather than derived
        # from df's index; zip() stops at the 12 forecast values.
        dates = pd.date_range('2017-04-30', '2018-06-01', freq='M')
        # Timestamp.to_datetime() no longer exists in pandas; to_pydatetime()
        # is the supported spelling. list() keeps from_records happy on
        # Python 3, where zip() is lazy.
        records = list(zip((x.to_pydatetime() for x in dates), predicted[0]))
        ndf = pd.DataFrame.from_records(records)
        ndf.columns = ['date', 'forecast']
        ndf.set_index(['date'], inplace=True)
        df = pd.concat([df, ndf], axis=1)
    return df
def mamodel(ts):
    """Fit a moving-average model, ARIMA(0,1,1), on the log series and plot it.

    The fitted values are drawn in red over the differenced log series and
    the residual sum of squares is shown in the title. The plot window is
    opened without blocking.
    """
    log_series, log_diff = trend(ts)
    fitted = ARIMA(log_series, order=(0, 1, 1)).fit(disp=-1)

    plt.plot(log_diff)
    plt.plot(fitted.fittedvalues, color='red')
    rss = np.sum((fitted.fittedvalues - log_diff) ** 2)
    plt.title('RSS: %.4F' % rss)
    plt.show(block=False)
def armodel(ts):
    """Fit an autoregressive model, ARIMA(1,1,0), on the log series and plot it.

    The fitted values are drawn in red over the differenced log series and
    the residual sum of squares is shown in the title. The plot window is
    opened without blocking.
    """
    log_series, log_diff = trend(ts)
    fitted = ARIMA(log_series, order=(1, 1, 0)).fit(disp=-1)

    plt.plot(log_diff)
    plt.plot(fitted.fittedvalues, color='red')
    rss = np.sum((fitted.fittedvalues - log_diff) ** 2)
    plt.title('RSS: %.4F' % rss)
    plt.show(block=False)
def ARIMA_fit(self):
    """Fit ARIMA(5,1,5) on the module-level ``ts_log`` and plot the fit.

    The fitted result is stored on ``self.results_ARIMA``. The summary is
    printed and the fitted values are plotted in red over ``ts_log_diff``,
    with the residual sum of squares against its ``'in_tpkts'`` column in
    the title.
    """
    # order=(p, d, q); AR or MA alone can be modelled by setting q or p to 0.
    model = ARIMA(ts_log, order=(5, 1, 5))
    # Bind a local as well as the attribute: the original stored the result
    # only on self and then read an undefined bare name ``results_ARIMA``.
    results_ARIMA = model.fit(disp=-1)
    self.results_ARIMA = results_ARIMA
    print(results_ARIMA.summary())
    plt.plot(ts_log_diff)
    plt.plot(results_ARIMA.fittedvalues, color='r')
    plt.title('RSS: %.4f' % sum((results_ARIMA.fittedvalues - ts_log_diff['in_tpkts'])**2))
def ARIMA_fun(data):
    """Fit an ARIMA(1, 1, q) model on the module-level ``orig_data`` and plot it.

    ``q`` is the integer part of the first element of the second value
    returned by ``acf(data, ..., qstat=True)``.

    Args:
        data: series used for the ACF/PACF computation and the first plot line.

    Returns:
        The fitted values of the model.
    """
    # The PACF is computed but its result is not used afterwards.
    lag_pacf = pacf(data, nlags=20, method='ols')
    acf_result = acf(data, nlags=20, qstat=True, unbiased=True)
    lag_acf, ci2, Q = acf_result
    # NOTE(review): the model is fitted on the global ``orig_data``, not on
    # the ``data`` argument - confirm this is intended.
    ma_order = int(ci2[0])
    fitted = ARIMA(orig_data, order=(1, 1, ma_order)).fit(disp=-1)

    plt.subplot(121)
    plt.plot(data)
    plt.plot(fitted.fittedvalues)
    return fitted.fittedvalues
def fit(self):
    """Forecast three steps past the training window and undo the transforms.

    Fits ARIMA(2,1,1) on ``self.df``, predicts positions ``t_window`` through
    ``t_window + 2``, cumulatively sums those predicted differences onto the
    last in-window value and exponentiates the result.

    Returns:
        The back-transformed predictions, or ``None`` when ``self.df`` has
        fewer than ``self.t_window`` rows.
    """
    if len(self.df) < self.t_window:
        return None

    model = ARIMA(self.df, order=(2, 1, 1))
    results_ARIMA = model.fit(disp=-1)
    forecast = results_ARIMA.predict(start=self.t_window,
                                     end=self.t_window + 2,
                                     dynamic=True)
    forecast = forecast.cumsum()
    # ``.ix`` no longer exists in pandas; the lookup is positional (last row
    # of the window), which is exactly what ``.iloc`` provides.
    last_value = self.df.iloc[self.t_window - 1]
    predictions_ARIMA_log = pd.Series(last_value, index=forecast.index)
    predictions_ARIMA_log = predictions_ARIMA_log.add(forecast, fill_value=0)
    predictions_ARIMA = np.exp(predictions_ARIMA_log)
    return predictions_ARIMA
def objfunc(order, *params):
    """Objective for an ARIMA order search: return the AIC of the fitted model.

    Args:
        order: ARIMA order passed positionally to ``ARIMA``.
        *params: the remaining positional arguments; the whole tuple is used
            as the series to model.

    Returns:
        The model's AIC, or ``float('inf')`` when fitting fails or the AIC
        is NaN, so that failed candidates rank last.
    """
    series = params
    try:
        mod = ARIMA(series, order, exog=None)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res = mod.fit(disp=0, solver='bfgs', maxiter=5000)
    except Exception:
        # Any fitting failure disqualifies the candidate. ``Exception``
        # rather than a bare ``except`` lets Ctrl-C and SystemExit through
        # during a long search.
        return float('inf')
    if math.isnan(res.aic):
        return float('inf')
    return res.aic
def pridictNextNdays(self, train):
    """Forecast ``self.next_ndays`` steps for the selected column of ``train``.

    The column ``self.selected`` is sliced to
    ``[self.start_train:self.end_train]`` and modelled with
    ARIMA(self.p, self.d, self.q) at daily frequency.

    Returns:
        The point forecasts (first of the three values from ``forecast``).
    """
    window = train[self.selected][self.start_train:self.end_train]
    fitted = ARIMA(window, order=(self.p, self.d, self.q), freq='D').fit(disp=False)
    # forecast() yields (point forecast, standard error, confidence interval).
    point_forecast, _stderr, _conf_int = fitted.forecast(steps=self.next_ndays, alpha=.05)
    return point_forecast
def testArima(self, train):
    """Forecast past the training window and pair the result with the actuals.

    Fits ARIMA(self.p, self.d, self.q) on
    ``train[self.selected][self.start_train:self.end_train]`` and forecasts
    ``self.next_ndays`` steps.

    Returns:
        dict with ``'real'``, the ``next_ndays`` observed values following
        ``end_train``, and ``'pridiction'``, the point forecasts.
    """
    realSerize = train[self.selected]
    timeSerize = realSerize[self.start_train:self.end_train]
    # The original also built an unused ``realData`` slice whose upper bound
    # was ``next_ndays`` instead of ``end_train + next_ndays``; it is dropped.
    model = ARIMA(timeSerize, order=(self.p, self.d, self.q))
    fitting = model.fit(disp=False)
    forecast, fcasterr, conf_int = fitting.forecast(steps=self.next_ndays, alpha=.05)
    actual = list(realSerize)[self.end_train:self.end_train + self.next_ndays]
    return {'real': actual, 'pridiction': forecast}
def predict_arima_next_days(self, item):
    """Forecast ``self.n_days`` steps for column ``item`` of the global ``df_train``.

    The series is sorted by index, the element at ``self.fc`` is set aside
    as the real last value, and the model is fitted on everything before it.

    Returns:
        Tuple ``(history, last_value, forecast)``: the fitted slice
        ``[0:self.fc]``, the element at ``self.fc``, and the point forecasts.
    """
    ordered = df_train[item].sort_index()
    ts_last_day = ordered[self.fc]
    history = ordered[0:self.fc]

    fitted = ARIMA(history, order=(self.p, self.d, self.q)).fit(disp=False)
    # forecast() yields (point forecast, standard error, confidence interval).
    forecast, _stderr, _conf_int = fitted.forecast(steps=self.n_days, alpha=.05)
    return history, ts_last_day, forecast
def arima(ts, forecast_window):
    """Forecast ``forecast_window`` steps past the end of ``ts`` on the original scale.

    Fits ARIMA(0,1,2) on ``log(ts)``, predicts dynamically from the last
    observed position, rebuilds the log level and exponentiates.

    Args:
        ts: pandas Series of positive values.
        forecast_window: number of steps to predict past the last observation.

    Returns:
        pandas Series of back-transformed predictions.
    """
    logger.info(ts)
    start = int(ts.count() - 1)
    end = int(start + forecast_window)
    ts_log = np.log(ts)
    model = ARIMA(ts_log, order=(0, 1, 2))
    results = model.fit(disp=-1)
    prediction = results.predict(start=start, end=end, dynamic=True)
    future = pd.Series(prediction, copy=True)
    cumsum = future.cumsum()
    # ``.ix`` no longer exists in pandas; ``.iloc[-1]`` is the positional
    # "last observation" the original intended.
    # NOTE(review): both ``future`` and its cumulative sum are added to the
    # last log value, so each predicted difference seems to be counted twice
    # - confirm this is intended before relying on the levels.
    prediction_future = future.add(ts_log.iloc[-1])
    prediction_future = prediction_future.add(cumsum)
    ts_future = np.exp(prediction_future)
    return ts_future
def predictFutureProfit(df, forward):
    """Return, per asset, the last value of an ARIMA(1,1,0) reconstruction.

    For every asset from ``get_assets(df)`` the log series is modelled, the
    predicted differences from position 2 onwards are cumulatively summed
    onto the first log value, and the result is exponentiated.

    Args:
        df: frame with one column per asset.
        forward: accepted for interface compatibility; not used in the body.

    Returns:
        dict mapping asset to the final reconstructed value.
    """
    results = {}
    for asset in get_assets(df):
        ts = df[asset]
        ts_log = np.log(ts)
        model = ARIMA(ts_log, order=(1, 1, 0))
        results_ARIMA = model.fit(disp=-1)
        predictions_diff = results_ARIMA.predict(2, len(ts.index) - 1, dynamic=True)
        predictions_diff_cumsum = predictions_diff.cumsum()
        # ``.ix`` no longer exists in pandas; the first and last elements are
        # positional lookups, so ``.iloc`` is used for both.
        predictions_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
        predictions_log = predictions_log.add(predictions_diff_cumsum, fill_value=0)
        predictions = np.exp(predictions_log)
        results[asset] = predictions.iloc[-1]
    return results
def arima(self):
    """Fit ARIMA(4,1,3) on closing prices and plot a dynamic prediction.

    The prediction runs from the tenth-from-last date up to
    ``int(self.day_history / 5)`` days past the last date. Observed prices
    are drawn as red dots and predictions as green dots.
    """
    candles = self.get_kline()
    closes = self.get_close_price(candles)
    date = self.get_date(candles)

    last_date = date[-1]
    horizon_end = last_date + timedelta(days=int(self.day_history / 5))
    print("predict date:", last_date, "--->", horizon_end)

    dta = pd.Series(closes, index=date)
    print(dta)

    # order is (p, d, q)
    result = ARIMA(dta, order=(4, 1, 3)).fit()
    pred = result.predict(date[-10], horizon_end, dynamic=True, typ='levels')

    plt.figure(figsize=(12, 8))
    plt.plot(dta, 'ro-')
    plt.xticks(rotation=45)
    plt.plot(pred, 'go-')
    plt.show()
def fitArima(ts):
    """Model the 7-step log difference of ``ts`` and return original vs. predicted.

    The series is log-transformed and differenced at lag 7. ARIMA(0,1,1) with
    a constant is fitted at daily frequency, and the in-sample predictions
    are mapped back by adding the lagged terms and exponentiating.

    Args:
        ts: pandas Series of positive values.

    Returns:
        DataFrame with columns ``'original'`` and ``'predicted'``.
    """
    # The unused function-scope ``import statsmodels.api as sm`` was removed.
    logged_ts = np.log(ts)
    diffed_logged_ts = (logged_ts - logged_ts.shift(7))[7:]
    p = 0
    d = 1
    q = 1
    arima = ARIMA(diffed_logged_ts, [p, d, q], exog=None, freq='D', missing='none')
    diffed_logged_results = arima.fit(trend='c', disp=False)
    predicted_diffed_logged = diffed_logged_results.predict(exog=None, dynamic=False)
    # The model differences once more, so predictions start ``d`` rows in.
    predicted_diffed_logged_ts = pd.Series(predicted_diffed_logged,
                                           index=diffed_logged_ts.index[d:])
    predicted_diffed_logged_ts = np.exp(logged_ts.shift(7)
                                        + diffed_logged_ts.shift(d)
                                        + predicted_diffed_logged_ts)
    concatenated = pd.concat([ts, predicted_diffed_logged_ts], axis=1,
                             keys=['original', 'predicted'])
    return concatenated
def arima_model(accounts):
    """Fit ARIMA models for each account.

    ``accounts`` is iterated for ``(account_type, account)`` pairs, which are
    also its lookup keys. For each account the orders in ``ARIMA_ORDERS`` are
    tried in sequence and the first one that fits is kept. Accounts for
    which no order fits are left out of the result.

    Returns:
        dict mapping ``(account_type, account)`` to the fitted results.
    """
    account_models = {}
    for account_type, account in accounts:
        key = (account_type, account)
        account_data = accounts[key]
        account_data.name = account

        # The right order is unknown, so walk the candidates until one fits.
        for candidate in ARIMA_ORDERS:
            try:
                results = ARIMA(account_data, order=candidate).fit()
            except (ValueError, np.linalg.LinAlgError):
                continue
            account_models[key] = results
            break
    return account_models
def ARIMA_forcast3(self):
    """Forecast 2016 steps of ``ACTIVE_FLOWS`` after removing a 288-step cycle.

    The first 7000 rows of the module-level ``vr_df`` are seasonally
    differenced and modelled with ARIMA(1,1,1). Each forecast is inverted
    against a history that is then extended with the actual observation.

    Returns:
        List of inverted forecast values. The original computed this list
        and discarded it; returning it does not affect callers that ignored
        the previous ``None``.
    """
    series = pd.Series(vr_df['ACTIVE_FLOWS'][0:7000])
    X = series.values
    cycle = 288  # the original comment offers 2016 as an alternative
    differenced = difference(X, cycle)

    model_fit = ARIMA(differenced, order=(1, 1, 1)).fit(disp=0)
    # Multi-step out-of-sample forecast on the differenced scale.
    forecast = model_fit.forecast(steps=2016)[0]

    # Invert the differencing one step at a time.
    history = list(X)
    forecast_values = []
    for step, yhat in enumerate(forecast, start=1):
        forecast_values.append(inverse_difference(history, yhat, cycle))
        # Extend the history with the observed value, not the forecast.
        history.append(vr_df['ACTIVE_FLOWS'][7000 + step - 1])
    return forecast_values
def ARIMA_forecast4(self):
    """Walk-forward forecast of ``DELETED_FLOWS`` in batches of 12 steps.

    Starting from 7318 training rows of the module-level ``vr_df``, each
    round seasonally differences the data seen so far (cycle 288), fits
    ARIMA(1,1,1), forecasts 12 steps and inverts the differencing. The
    training window then grows by 12 rows.

    Returns:
        List of all inverted forecast values. The original computed this
        list and discarded it; returning it does not affect callers that
        ignored the previous ``None``.
    """
    num_train_init = 7318
    num_forecast = 12
    cycle = 288
    # The original read an undefined ``num_train`` here and raised NameError
    # on every call.
    startdate = vr_df.index[num_train_init]
    field = 'DELETED_FLOWS'

    forecast_values = []
    for i in range(0, int(len(vr_df) / num_forecast)):
        num_train_current = i * num_forecast + num_train_init
        # Stop once the training window would run past the available data.
        if num_train_current > len(vr_df):
            break

        series = pd.Series(vr_df[field][0:num_train_current])
        # Seasonal differencing to make the data stationary.
        X = series.values
        differenced = difference(X, cycle)

        model_fit = ARIMA(differenced, order=(1, 1, 1)).fit(disp=0)
        forecast = model_fit.forecast(steps=num_forecast)[0]

        # Invert the differencing one step at a time.
        history = list(X)
        for step, yhat in enumerate(forecast, start=1):
            inverted = inverse_difference(history, yhat, cycle)
            forecast_values.append(inverted)
            try:
                history.append(vr_df[field][num_train_current + step - 1])
            except (IndexError, KeyError):
                # Past the end of the actual data: continue the history with
                # the forecast itself.
                history.append(inverted)
    return forecast_values
def previsao_matematica(reservatId, data):
    """Forecast 180 steps of the series from ``predict_info.getSeries``.

    The series is differenced at lag 1, modelled with ARIMA(1,0,1), and each
    forecast is inverted against a history that grows with the inverted
    values.

    Returns:
        dict with ``'calculado'`` (always True on return), ``'volumes'``
        (non-negative values formatted to four decimals) and ``'dias'``
        (how many values were kept).
    """
    # Series.from_array no longer exists in pandas; the plain constructor is
    # the equivalent call.
    seriesArray = Series(predict_info.getSeries(reservatId, data))
    seriesValues = seriesArray.values
    mathDict = {'calculado': False, 'volumes': [], 'dias': 0}

    days_in_year = 1
    differenced = predict_info.difference(seriesValues, days_in_year)
    model_fit = ARIMA(differenced, order=(1, 0, 1)).fit(disp=-1)
    # Multi-step out-of-sample forecast on the differenced scale.
    forecast = model_fit.forecast(steps=180)[0]

    mathDict['calculado'] = True
    history = list(seriesValues)
    for yhat in forecast:
        inverted = predict_info.inverse_difference(history, yhat, days_in_year)
        history.append(inverted)
        # Negative values stay in the history but are not reported.
        if inverted >= 0.0:
            mathDict['volumes'].append("%.4f" % round((inverted), 4))
            mathDict['dias'] = mathDict['dias'] + 1
    return mathDict
def _set_model_and_fit(self, train_data, order_):
    """Build an ARIMA model of order ``order_`` on ``train_data`` and return the fit.

    Fitting runs with ``disp=0``.
    """
    return ARIMA(train_data, order=order_).fit(disp=0)
# Walk-forward validation of ARIMA(0,1,1) on the log-scale monthly series:
# the last 13 observations are held out and predicted one step at a time,
# with the model refitted after each true observation is appended.
size = int(len(ts_month_log) - 13)
train, test = ts_month_log[0:size], ts_month_log[size:len(ts_month_log)]
history = [x for x in train]
predictions = list()
selisih = list()  # per-step signed error (prediction - observation)
mapee = list()    # per-step relative error ((observation - prediction) / observation)
# print(...) with one argument behaves the same on Python 2 and 3, and
# print("") replaces the bare Python 2 ``print`` used for blank lines.
print("Ini Data Test")
print("")
print(test)
print("")
print(train)
print("Printing Predicted vs Expected Values")
print("")
for t in range(len(test)):
    model = ARIMA(history, order=(0, 1, 1))
    mode_fit = model.fit(disp=-1)
    output = mode_fit.forecast()
    yhat = output[0]
    predictions.append(float(yhat))
    obs = test[t]
    history.append(obs)
    selisih.append(yhat - obs)
    # The original computed ``obs - yhat / obs``: division bound tighter than
    # subtraction, so no relative error was ever produced.
    mapee.append((obs - yhat) / obs)
    print("Predicted=%f, Expected=%f" % (np.exp(yhat), np.exp(obs)))
error = mean_squared_error(test, predictions)
RMSE = sqrt(error)
# Mean error over the forecast steps (the original divided by the length of
# the whole series instead of the number of forecasts).
ME = sum(selisih) / len(selisih)
MAE = mean_absolute_error(test, predictions)
# Mean (absolute) percentage error: the average relative error times 100.
# The original took the reciprocal of the summed errors instead.
MPE = sum(mapee) / len(mapee) * 100
MAPE = sum(abs(e) for e in mapee) / len(mapee) * 100
# Cast to float before modelling. ``np.float`` was a plain alias of the
# builtin ``float`` and no longer exists in current NumPy.
data = data[:].astype(float)
data.tail()
data.plot()
plt.show()

# Inspect autocorrelation and partial autocorrelation to choose the order.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(data)
plot_pacf(data)
plt.show()

from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.arima_model import ARIMAResults

# ARIMA(1,1,0) without a constant term (trend='nc').
model = ARIMA(data, order=(1, 1, 0))
model_fit = model.fit(trend='nc', full_output=True, disp=1)
print(model_fit.summary())
model_fit.plot_predict()

# One-step-ahead forecast.
fore = model_fit.forecast(steps=1)
print(fore)
# Electricity consumption on 1 January 2018: 763473.4587
train, test = split_dataset(df_modal_price_supervised)
# evaluate model and get scores
n_input = 5
score, scores = evaluate_model(train, test, n_input)
# summarize scores
summarize_scores('lstm', score, scores)

# Work on an explicit copy of the first 140 rows. Adding columns to a bare
# slice of ``df`` is chained assignment (SettingWithCopyWarning); ``df``
# itself is not modified either way.
district_data = df[:140].copy()
district_data.head()
# Derive calendar features from the arrival date (day-first format).
district_data['Date'] = pd.to_datetime(district_data.arrival_date, dayfirst=True)
district_data['Day'] = district_data.Date.dt.day
district_data['month'] = district_data.Date.dt.month
district_data['year'] = district_data.Date.dt.year
district_data['day_of_week'] = district_data.Date.dt.dayofweek
district_data['weekend'] = district_data.Date.apply(weekend)
district_data.sort_values(by='Date', inplace=True)
district_data.head()

from statsmodels.tsa.arima_model import ARIMA
from matplotlib import pyplot as plt

# NOTE(review): the model is fitted on the full ``df.modal_price`` rather
# than on ``district_data`` prepared above - confirm this is intended.
model = ARIMA(df.modal_price, order=(5, 0, 4))
model_fitted = model.fit(disp=-1)
plt.plot(df.modal_price)
plt.plot(model_fitted.fittedvalues, color='red')
print(model_fitted.summary())
# Report the coefficients of the previously fitted model and predict over
# the positions that follow the training data.
print(f"Coefficients: {model_fit.params}")
predictions = model_fit.predict(start=len(train), end=len(train)+len(test)-1, dynamic=False)
plltt(train['Close_log_diff'],test['Close_log_diff'],predictions,'Auto Regression model')
newall(test['Close_log_diff'],predictions,'AR model')
# ARIMA
# Grid of every (p, d, q) combination with each component in 0..4.
p=d=q=range(0,5)
import itertools
val = list(itertools.product(p,d,q))
print("Combinations of p,d,p for ARIMA to get low AIC ")
# Print the AIC of every combination that can be fitted.
# NOTE(review): the bare ``except`` skips a combination on any error,
# including KeyboardInterrupt, so failures are invisible.
# NOTE(review): the grid search and the model below are fitted on ``test``,
# not ``train`` - confirm this is intended.
for param in val:
    try:
        model_arima = ARIMA(test['Close_log_diff'],order = param)
        model_arima_fit = model_arima.fit()
        print(param,model_arima_fit.aic)
    except:
        continue
# Fit the chosen ARIMA(2,2,0) using the frequency inferred from the index.
model = ARIMA(test['Close_log_diff'], order=(2,2,0), freq=test['Close_log_diff'].index.inferred_freq)
results_ARIMA = model.fit(disp=-1)
plt.plot(test['Close_log_diff'])
plt.plot(results_ARIMA.fittedvalues, color='red')
plt.show()
print(results_ARIMA.summary())
# AUTO Arima
# The call below is cut off in this view; its remaining arguments are not visible.
stepwise_model = auto_arima(timeseries_dfnew['Close'], start_p=1, start_q=1,
from statsmodels.tsa.arima_model import ARIMA

from common_calculations import get_the_stationary_series
from first_part_calculations import show_statistical_data, ma_by_ar_resid

# Obtain the stationary time series.
stationary_series = get_the_stationary_series()

# Autoregressive part: ARIMA of order (2, 0, 0) fitted on the whole series.
arks_model = ARIMA(stationary_series, order=(2, 0, 0))
model = arks_model.fit(disp=0)
AR_resid = model.resid

# Hold out the last 20% of the observations.
series_length = len(stationary_series)
split = series_length - int(0.2 * series_length)
train = stationary_series[:split]
test = stationary_series[split:]
# NOTE(review): predict() starts at position len(test), not at ``split`` -
# confirm this is the intended start.
pred = model.predict(len(test))
show_statistical_data(train, model, pred)

# Moving-average stage on the raw AR residuals.
ma_by_ar_resid(AR_resid, 2, 0, 4, 'АРКС(2,4)')

# Simple moving average of the residuals, window 5; the leading gaps are
# filled with the mean of the first 5 residuals.
head_mean_5 = AR_resid[:5].mean()
arks_n5_PKC = AR_resid.rolling(5).mean().fillna(head_mean_5)
ma_by_ar_resid(arks_n5_PKC, 2, 0, 3,
               'АРКС(2,3) із застосуванням власного простого КС, при N=5')

# Same smoothing with window 10.
head_mean_10 = AR_resid[:10].mean()
arks_n10_PKC = AR_resid.rolling(10).mean().fillna(head_mean_10)
ma_by_ar_resid(arks_n10_PKC, 2, 0, 7,
               'АРКС(2,7) із застосуванням власного простого КС, при N=10')

# Exponentially weighted mean of the residuals (ewm(5)).
arks_n5_EKC = AR_resid.ewm(5).mean()
ma_by_ar_resid(arks_n5_EKC, 2, 0, 3,
               'АРКС(2,3) із застосуванням власного експоненційного КС, при N=5')
# ACF (top) and PACF (bottom) of the seasonal first difference, skipping the
# first 13 rows. ``fig`` is rebound to the return value of each plot call.
ax1 = fig.add_subplot(211)
fig = plot_acf(df['Seasonal First Difference'].iloc[13:], lags=40, ax=ax1)
#plt.show()
ax2 = fig.add_subplot(212)
fig = plot_pacf(df['Seasonal First Difference'].iloc[13:], lags=40, ax=ax2)
#plt.show()
# For non-seasonal data
#p=1, d=1, q=0 or 1
from statsmodels.tsa.arima_model import ARIMA
# Non-seasonal ARIMA(1,1,1) on the raw series.
model = ARIMA(df['electricity_available'], order=(1, 1, 1))
model_fit = model.fit()
# The summary is built but not printed or stored.
model_fit.summary()
# Dynamic prediction over positions 90..103, stored next to the observations.
df['forecast'] = model_fit.predict(start=90, end=103, dynamic=True)
df[['electricity_available', 'forecast']].plot(figsize=(12, 8))
#plt.show()
import statsmodels.api as sm
# Seasonal model with period 12; ``model`` is rebound, replacing the ARIMA above.
model = sm.tsa.statespace.SARIMAX(df['electricity_available'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
results = model.fit()
# Data for ARIMA: split the price series at 2017-09-01.
trainX = bt['price_feature'][bt['price_feature'].index < '2017-09-01']
testX = bt['price_feature'][bt['price_feature'].index >= '2017-09-01']

# ARIMA
print('\n\n Running Model type: ARIMA')
plot_acf(bt['price_feature'].diff().values[1:], lags=50)
plt.show()
plot_pacf(bt['price_feature'].diff().values[1:], lags=50)
plt.show()

predX = list()
# Seed the walk-forward history with the training portion only. The original
# started from the complete series, so every "forecast" had already seen the
# test observations, and the train RMSE was computed on train plus test.
history = list(trainX.values)
model = ARIMA(history, order=(1, 1, 1))
model_fit = model.fit(disp=0)
train_error = math.sqrt(sum(model_fit.resid**2) / model_fit.resid.shape[0])

# Walk forward: refit, forecast one step, then append the true observation.
for t in range(len(testX)):
    model = ARIMA(history, order=(1, 1, 1))
    model_fit = model.fit(disp=0)
    output = model_fit.forecast()
    yhat = output[0]
    predX.append(yhat)
    # Positional access: integer keys on a date-indexed Series are not
    # reliably treated as positions by pandas.
    obs = testX.iloc[t]
    history.append(obs)

test_error = math.sqrt(mean_squared_error(testX, predX))
print('Train RMSE: %.3f' % train_error)
print('Test RMSE: %.3f' % test_error)
# plot
def parser(x):
    """Parse a date token by prefixing it with '190' and reading it as '%Y-%m'."""
    return datetime.strptime('190' + x, '%Y-%m')


# Load the series with the first column parsed as the date index.
series = read_csv(
    'shampoo-sales.csv',
    header=0,
    parse_dates=[0],
    index_col=0,
    squeeze=True,
    date_parser=parser,
)
print(series.head())
series.plot()
pyplot.show()
autocorrelation_plot(series)
pyplot.show()

# Fit ARIMA(5,1,0) and report it.
model = ARIMA(series, order=(5, 1, 0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# Residual diagnostics: line plot first, then kernel density estimate.
residuals = DataFrame(model_fit.resid)
for plot_options in ({}, {'kind': 'kde'}):
    residuals.plot(**plot_options)
    pyplot.show()
print(residuals.describe())

# http://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.predict.html
X = series.values
size = int(len(X) * 0.66)
# NOTE(review): this fragment reads ``i``, ``data``, ``orders`` and ``f`` from
# code outside this view, and looks like the body of a per-year loop whose
# header is not visible. The original indentation is lost, so which
# statements belong inside that loop must be confirmed against the full file.
# Daily date index from 1 April to 30 November of year ``i``.
strt = "4/1/" + str(i)
endt = "11/30/" + str(i)
dat = pd.date_range(start=strt, end=endt, freq="D")
# Copy the values of data.x into a plain list.
L = []
for j in data.x:
    L += [j]
dailyrain = pd.Series(L, index=dat)
print dailyrain
#test_stationarity(dailyrain)
#plt.savefig("DailyGAURainfall" + str(i) + ".png")
plt.close()
# Fit the order configured for this year (``orders`` is indexed from 2008).
model = ARIMA(dailyrain, order=orders[i - 2008])
results_AR = model.fit(disp=-1)
# NOTE(review): the keyword is spelled ``Label`` with a capital L; matplotlib's
# documented keyword is ``label`` - confirm the legend is populated.
plt.plot(dailyrain, Label="Daily Rainfall Data")
# Post-process the fitted values with ``nonZ`` before plotting.
results = results_AR.fittedvalues.apply(nonZ)
plt.plot(results, color="RED", Label="Predicted Daily Rainfall")
plt.title("RS: %.4f" % (sum((dailyrain - results)**2)))
plt.legend(loc="best")
plt.savefig("DailyGAU" + str(i) + str(orders[i - 2008]) + ".png")
# Record the fitted AR and MA coefficients for this year.
f.write("Parametersfor the year " + str(i) + "\n")
ar_coef, ma_coef = results_AR.arparams, results_AR.maparams
f.write("AR Coefficients: " + str(ar_coef) + "\n")
f.write("MA Coefficients: " + str(ma_coef) + "\n")
f.write("\n")
# Candidate orders; how they are used is not visible in this fragment.
p_values = [0, 1, 2, 3, 4]
q_values = [0, 1, 2, 3, 4]
d_values = [0]
def fit_models(mypath='', js=None): ''' Takes a file path with a file in json format, or a string with json structure Returns json (fecha, prediccion, error), error_prom, accuracy ''' #%% import os import time import datetime import numpy as np import pandas as pd import json from os import listdir from os.path import isfile, join from objdict import ObjDict #%% if mypath != '': os.chdir(mypath) lista_archivos = [f for f in listdir(mypath) if isfile(join(mypath, f))] lista_dat = [] for dat in lista_archivos: with open(dat) as json_data: lista_dat.append(json.load(json_data)) for i in range(0,len(lista_dat)): fechas = [] valores = [] for j in lista_dat[0]: fechas.append(j['fecha']) valores.append(j['valor']) elif js != None: fechas = [] valores = [] for i in js: fechas.append(i['fecha']) valores.append(i['valor']) #%% fechas_list = [fechas[x:x+1] for x in xrange(0, len(fechas), 1)] fechas_format = [] for date in fechas: #fechas_format.append(time.ctime(date/1000)) fechas_format.append(datetime.datetime.fromtimestamp(date/1000.0).strftime('%Y-%m-%d-%H')) #crear variables para separar fecha: ano, mes, dia, hora ano,mes,dia,hora = [],[],[],[] for date in fechas_format: fecha = date.split('-') ano.append(int(fecha[0])) mes.append(int(fecha[1])) dia.append(int(fecha[2])) hora.append(int(fecha[3])) #crear variables para dia de la semana dia_semana = [] for date in fechas: if time.ctime(date/1000).split()[0] == 'Mon': dia_semana.append(1) elif time.ctime(date/1000).split()[0] == 'Tue': dia_semana.append(2) elif time.ctime(date/1000).split()[0] == 'Wed': dia_semana.append(3) elif time.ctime(date/1000).split()[0] == 'Thu': dia_semana.append(4) elif time.ctime(date/1000).split()[0] == 'Fri': dia_semana.append(5) elif time.ctime(date/1000).split()[0] == 'Sat': dia_semana.append(6) elif time.ctime(date/1000).split()[0] == 'Sun': dia_semana.append(7) else: print 'Error' #crear vector fechas fechas_pandas = pd.to_datetime(fechas_format) #crear timeseries dframe = 
pd.Series(valores, index=fechas_pandas) #%% import pyflux as pf from datetime import datetime import matplotlib.pyplot as plt #%matplotlib inline #%% #Ver datos #plt.plot(dframe) #Eliminar Outliers dframe = dframe[~((dframe-dframe.mean()).abs()>3*dframe.std())] dframe= dframe[(dframe!=0)] #ver datos #plt.plot(dframe) #Separar en train, test features_train = dframe[0:int(len(dframe)*.9)] features_test = dframe[int(len(dframe)*.9)+1:len(dframe)] #ver datos #plt.plot(features_train) #plt.plot(features_test) #%% #probar stationarity from statsmodels.tsa.stattools import adfuller def test_stationarity(timeseries, plot=False): #Determing rolling statistics rolmean = pd.rolling_mean(timeseries, window=12) rolstd = pd.rolling_std(timeseries, window=12) #Plot rolling statistics: if plot: fig = plt.figure(figsize=(12, 8)) orig = plt.plot(timeseries, color='blue',label='Original') mean = plt.plot(rolmean, color='red', label='Rolling Mean') std = plt.plot(rolstd, color='black', label = 'Rolling Std') plt.legend(loc='best') plt.title('Rolling Mean & Standard Deviation') plt.show() print 'Results of Dickey-Fuller Test:' #Perform Dickey-Fuller test: dftest = adfuller(timeseries, autolag='AIC') dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value', '#Lags Used','Number of Observations Used']) for key,value in dftest[4].items(): dfoutput['Critical Value (%s)'%key] = value if plot: print dfoutput else: return dfoutput #%% #test_stationarity(features_train) ''' print 'Dickey-Fuller test for original data' test_stationarity(features_train, plot=True) ''' p_value = test_stationarity(features_train).iloc[1] #%% #Estimating & Eliminating Trend #Log Transformation features_train_log = np.log(features_train) #plt.plot(features_train_log) #test_stationarity(features_train_log) p_value_log = test_stationarity(features_train_log).iloc[1] #%% #First Differencing features_train_diff = features_train - features_train.shift(1) #plt.plot(features_train_diff) #Visualizar 
transformacion #plt.plot(features_train_diff) #plt.plot(features_train) features_train_diff.dropna(inplace=True) #test_stationarity(features_train_diff) p_value_diff = test_stationarity(features_train_diff).iloc[1] #%% #Second Differencing features_train_diff2 = features_train_diff - features_train_diff.shift(1) features_train_diff2.dropna(inplace=True) p_value_diff2 = test_stationarity(features_train_diff2).iloc[1] #%% #Differencing + log train_log_diff = features_train_log - features_train_log.shift(1) #plt.plot(dframe_log_diff) train_log_diff.dropna(inplace=True) #test_stationarity(train_log_diff) p_value_log_diff = test_stationarity(train_log_diff).iloc[1] #%% #Second Difference + Log train_log_diff2 = train_log_diff - train_log_diff.shift(1) #plt.plot(train_log_diff2) train_log_diff2.dropna(inplace=True) #test_stationarity(train_log_diff2) p_value_log_diff2 = test_stationarity(train_log_diff2).iloc[1] #%% #find best transformation p_value_list = [p_value, p_value_log, p_value_diff, p_value_log_diff, p_value_diff2, p_value_log_diff2] winner_index = p_value_list.index(min(p_value_list)) if winner_index == 0: winner = features_train if winner_index == 1: winner = features_train_log if winner_index == 2: winner = features_train_diff if winner_index == 3: winner = train_log_diff if winner_index == 4: winner = features_train_diff2 if winner_index == 5: winner = train_log_diff2 #%% #print 'Dickey-Fuller test for best transformation of data', #test_stationarity(winner, plot=True) #%% #Forecasting a Time Series #Arima - Auto-Regressive Integrated Moving Averages. ''' Number of AR (Auto-Regressive) terms (p): AR terms are just lags of dependent variable. For instance if p is 5, the predictors for x(t) will be x(t-1)….x(t-5). Number of MA (Moving Average) terms (q): MA terms are lagged forecast errors in prediction equation. 
For instance if q is 5, the predictors for x(t) will be e(t-1)….e(t-5) where e(i) is the difference between the moving average at ith instant and actual value. Number of Differences (d): These are the number of nonseasonal differences, i.e. in this case we took the first order difference. So either we can pass that variable and put d=0 or pass the original variable and put d=1. Both will generate same results. ''' #ACF and PACF plots: dframe_diff from statsmodels.tsa.stattools import acf, pacf lag_acf = acf(winner, nlags=20) lag_pacf = pacf(winner, nlags=20, method='ols') top_line = 1.96/np.sqrt(len(winner)) #%% #Get best q and p. Not optimized ''' q=0 for i in lag_acf: if i > top_line: q+=1 else: break p=0 for i in lag_pacf: if i > top_line: p+=1 else: break ''' #%% ''' #Plot ACF: plt.subplot(121) plt.plot(lag_acf) plt.axhline(y=0,linestyle='--',color='gray') plt.axhline(y=-1.96/np.sqrt(len(winner)),linestyle='--',color='gray') plt.axhline(y=1.96/np.sqrt(len(winner)),linestyle='--',color='gray') plt.title('Autocorrelation Function') #Plot PACF: plt.subplot(122) plt.plot(lag_pacf) plt.axhline(y=0,linestyle='--',color='gray') plt.axhline(y=-1.96/np.sqrt(len(winner)),linestyle='--',color='gray') plt.axhline(y=1.96/np.sqrt(len(winner)),linestyle='--',color='gray') plt.title('Partial Autocorrelation Function') plt.tight_layout() plt.show() ''' #%% ''' print('Enter the value of q, corresponding to the ACF graph') q = raw_input() print('Enter the value of p, corresponding to the PACF graph') p = raw_input() q = int(q) p = int(p) ''' #In this plot, the two dotted lines on either sides of 0 are the #confidence interevals. These can be used to determine the ‘p’ and ‘q’ values as: ''' p – The lag value where the PACF chart crosses the upper confidence interval for the first time. In this case p=2. q – The lag value where the ACF chart crosses the upper confidence interval for the first time. In this case q=6. 
''' #%% #Model (p,d,q) #Finding best parameters from statsmodels.tsa.arima_model import ARIMA acc_list = [] for d in range(0,3): for p in range(0,6): for q in range(0,6): #print('Model Result') try: model_diff = ARIMA(winner, order=(p, d, q)) results_ARIMA_diff = model_diff.fit(disp=-1) error = np.sqrt((results_ARIMA_diff.fittedvalues-winner[1:])**2) error_prom = error.mean() accuracy = 100-error_prom acc_list.append([p, d, q, accuracy]) except: next #plt.plot(winner) #plt.plot(results_ARIMA_diff.fittedvalues, color='red') #plt.title('RSS: %.4f'% sum((results_ARIMA_diff.fittedvalues-winner)**2)) #plt.show() from operator import itemgetter params = sorted(acc_list, key=itemgetter(3))[len(acc_list)-1] p = params[0] d = params[1] q = params[2] #%% #Build model with best params model_diff = ARIMA(winner, order=(p, d, q)) results_ARIMA_diff = model_diff.fit(disp=-1) ''' plt.plot(winner) plt.plot(results_ARIMA_diff.fittedvalues, color='red') plt.title('RSS: %.4f'% sum((results_ARIMA_diff.fittedvalues-winner[2:])**2)) plt.show() error = np.sqrt((results_ARIMA_diff.fittedvalues-winner)**2) error_prom = error.mean() accuracy = 100-error_prom ''' #%% #Taking it back to original scale #store the predicted results as a separate series and observe it. 
if winner_index == 0: predictions_ARIMA = pd.Series(results_ARIMA_diff.fittedvalues, copy=True) pred_ARIMA_diff_corrected = predictions_ARIMA if winner_index == 1: predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True) pred_ARIMA_diff_corrected = np.exp(predictions_ARIMA_diff) if winner_index == 2: predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True) pred_ARIMA_diff_corrected = predictions_ARIMA_diff + features_train.shift(1) pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[1:] if winner_index == 3: predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True) pred_ARIMA_diff_corrected = np.exp(predictions_ARIMA_diff) pred_ARIMA_diff_corrected = predictions_ARIMA_diff + features_train.shift(1) pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[1:] if winner_index == 4: predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True) pred_ARIMA_diff_corrected = predictions_ARIMA_diff + features_train.shift(1) pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[2:] pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected.shift(-2) pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[:len(pred_ARIMA_diff_corrected)-2] if winner_index == 5: predictions_ARIMA_diff = pd.Series(results_ARIMA_diff.fittedvalues, copy=True) pred_ARIMA_diff_corrected = np.exp(predictions_ARIMA_diff) pred_ARIMA_diff_corrected = predictions_ARIMA_diff + features_train.shift(1) pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[2:] pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected.shift(-1) pred_ARIMA_diff_corrected = pred_ARIMA_diff_corrected[:len(pred_ARIMA_diff_corrected)-1] #%% #Visualize in-sample predictions ''' plt.plot(features_train) plt.plot(pred_ARIMA_diff_corrected, color='red') ''' #%% ''' plt.plot(pred_ARIMA_diff_corrected.head(100), color='red') plt.plot(features_train.head(100)) ''' #%% ''' plt.plot(pred_ARIMA_diff_corrected.tail(100), color='red') plt.plot(features_train.tail(100)) 
''' #%% #visualizar error ''' print('Percentage of Errors') in_sample_error = np.sqrt((pred_ARIMA_diff_corrected-features_train)**2) in_sample_error_prom = error.mean() in_sample_accuracy = 100-error_prom plt.plot(in_sample_error) plt.title('Promedio Error: %.4f'% in_sample_error_prom + '; Precision: %.4f'% in_sample_accuracy) plt.show() ''' #%% #Out of sample predictions if winner_index == 0: out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test-2), dynamic=True) if winner_index == 1: out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test-2), dynamic=True) out_of_sample_predictions_ARIMA = np.exp(out_of_sample_predictions_ARIMA) if winner_index == 2: out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test-2), dynamic=True) out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2] out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-2) out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-2] if winner_index == 3: out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test-2), dynamic=True) out_of_sample_predictions_ARIMA = np.exp(out_of_sample_predictions_ARIMA) out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2] out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-2) out_of_sample_predictions_ARIMA = 
out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-2] if winner_index == 4: out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test-2), dynamic=True) out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2] out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-5) out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-4] out_of_sample_predictions_ARIMA.index = features_test.head(len(out_of_sample_predictions_ARIMA)).index if winner_index == 5: out_of_sample_predictions_ARIMA = results_ARIMA_diff.predict(start=features_train.tail(1).index[0], end = len(features_train)+len(features_test-2), dynamic=True) out_of_sample_predictions_ARIMA = np.exp(out_of_sample_predictions_ARIMA) out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[2:len(out_of_sample_predictions_ARIMA)-2] out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA + features_train.tail(len(features_test)).values out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA.shift(-4) out_of_sample_predictions_ARIMA = out_of_sample_predictions_ARIMA[:len(out_of_sample_predictions_ARIMA)-4] #%% #Error total ''' print('Out of sample prediction') plt.plot(features_test, color='green') plt.plot(out_of_sample_predictions_ARIMA, color='red') ''' #visualizar error error = np.sqrt((out_of_sample_predictions_ARIMA-features_test)**2).head(len(out_of_sample_predictions_ARIMA)) error_prom = error.mean() accuracy = 100-error_prom ''' plt.plot(error) plt.title('Promedio Error: %.4f'% error_prom + '; Precision: %.4f'% accuracy) plt.show() ''' #%% #Error primeros 50 datos ''' print('Out of sample prediction First 50') plt.plot(features_test.head(50), 
color='green') plt.plot(out_of_sample_predictions_ARIMA.head(50), color='red') error_50 = np.sqrt((out_of_sample_predictions_ARIMA.head(50)-features_test.head(50))**2).head(len(out_of_sample_predictions_ARIMA.head(50))) error_prom_50 = error_50.mean() accuracy_50 = 100-error_prom_50 plt.plot(error_50) plt.title('Promedio Error: %.4f'% error_prom_50 + '; Precision: %.4f'% accuracy_50) plt.show() ''' #%% data = [] for i in range(0, len(error)): entry = ObjDict() entry.fecha = str(out_of_sample_predictions_ARIMA.index[i]) entry.prediccion = out_of_sample_predictions_ARIMA[i] entry.error = error[i] data.append(entry) #%% #print('data, RMSE, error_prom, accuracy') return json.dumps(data), error_prom, accuracy
def ARIMAmodel(data, days=0):
    """Fit an ARIMA(1, 1, 1) model on ``data`` and predict beyond its end.

    Args:
        data: Observed series handed straight to ``ARIMA``.
        days: Offset added to ``len(data)`` to obtain the ``end`` position
            passed to ``predict``; the ``start`` position is ``len(data)``.

    Returns:
        Whatever ``predict(start, end, typ='levels')`` returns for that span.
    """
    # Imported lazily so the module can be loaded without statsmodels.
    from statsmodels.tsa.arima_model import ARIMA

    first_step = len(data)
    last_step = first_step + days
    fitted = ARIMA(data, order=(1, 1, 1)).fit(disp=False)
    # typ='levels' is forwarded unchanged; presumably it asks for values on
    # the original (undifferenced) scale -- confirm against statsmodels docs.
    return fitted.predict(first_step, last_step, typ='levels')
# Display the currently pending figure(s).
plt.show()
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
# Draw the autocorrelation (top, subplot 211) and partial autocorrelation
# (bottom, subplot 212) of `diffshift`, 40 lags each, on one 10x8 figure.
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(211)
# `fig` is rebound to the value returned by plot_acf and then used to add the
# second subplot.
fig = plot_acf(diffshift, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = plot_pacf(diffshift, lags=40, ax=ax2)
plt.show()
# ARIMA(3, 1, 0) fitted on `indxds_logscale`.
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(indxds_logscale, order=(3, 1, 0))
results = model.fit(disp=0)
print(results.summary())
# Overlay the fitted values (blue) on `diffshift` (green).
plt.plot(diffshift, color='g')
plt.plot(results.fittedvalues, color='b')
#plt.title('RSS: %.4f'% sum((results.fittedvalues-diffshift['Close'])**2))
# Take the first element of the 15-step forecast result and exponentiate it.
# Presumably this undoes a log transform applied to build `indxds_logscale`
# -- TODO confirm.
x = results.forecast(steps=15)[0]
fore = np.exp(x)
z = fore.tolist()
# Append the forecast to the observed 'Close' column under a fresh 0..n-1 index.
price1 = pd.concat(
    [pd.Series(df['Close']), pd.Series(z)], ignore_index=True, copy=True)
print(price1.tail())
# Plot the whole series in black, then re-draw everything from position 53 on
# in red.
# NOTE(review): 53 is hard-coded; it looks like it is meant to be
# len(df['Close']) (where the forecast starts) -- confirm.
fig, ax = plt.subplots(1, 1)
price1.plot(ax=ax, color='k', label='actual')
price1.iloc[53:].plot(ax=ax, color='r', label='forecasted')
plt.xlabel('index')
# Validate a saved ARIMA model with a rolling one-step-ahead forecast over `y`.
# `history`, `y`, `difference` and `inverse_difference` come from earlier in
# the script.
model_fit = ARIMAResults.load('model.pkl')
bias = numpy.load('model_bias.npy')
# Seasonal interval used for differencing. Assigned before the first
# prediction, which already needs it (it used to be set only inside the loop).
months_in_year = 12
# make first prediction with the model loaded from disk
predictions = list()
yhat = float(model_fit.forecast()[0])
yhat = bias + inverse_difference(history, yhat, months_in_year)
predictions.append(yhat)
history.append(y[0])
# '%.3f' (three decimals) for both values; the expected value previously used
# the typo '%3.f', which prints it with zero decimals.
print('>Predicted=%.3f, Expected=%.3f' % (yhat, y[0]))
# rolling forecasts: refit on the growing history for every remaining step
for i in range(1, len(y)):
    # difference data
    diff = difference(history, months_in_year)
    # predict
    model = ARIMA(diff, order=(6, 0, 0))
    model_fit = model.fit(trend='nc', disp=0)
    yhat = model_fit.forecast()[0]
    # undo the differencing and add the stored bias correction
    yhat = bias + inverse_difference(history, yhat, months_in_year)
    predictions.append(yhat)
    # observation: reveal the true value before the next step
    obs = y[i]
    history.append(obs)
    print('>Predicted=%.3f, Expected=%.3f' % (yhat, obs))
# report performance
mse = mean_squared_error(y, predictions)
rmse = sqrt(mse)
print('RMSE: %.3f' % rmse)
pyplot.plot(y)
pyplot.plot(predictions, color='red')
pyplot.show()
def use_arima(self, training_data, p, d, q):
    """Fit ARIMA(p, d, q) on ``training_data`` and return the next-step forecast.

    Args:
        training_data: Series passed unchanged to ``ARIMA``.
        p, d, q: Components of the ``order`` tuple.

    Returns:
        Element 0 of the result of calling ``forecast()`` with no arguments
        on the fitted model.
    """
    fitted = ARIMA(training_data, order=(p, d, q)).fit(disp=False)
    forecast_result = fitted.forecast()
    return forecast_result[0]
def global_forcast():
    """Fit an ARIMA(1, 1, 0) model and plot test-set and forecast charts.

    Reads ``data_formatted.csv`` and ``forecasting.csv`` from the working
    directory, prints the model summary and residual statistics, and shows
    six matplotlib figures (two residual plots, two test-set plots, two
    forecast plots). Returns nothing.
    """

    def _finish_year_plot(title, tick_step):
        # Shared tail of every chart: labels, integer ticks every
        # `tick_step` units on the x axis, then display.
        plt.title(title)
        plt.xlabel("Year of Consumption")
        plt.ylabel("Total Consumption")
        plt.gca().xaxis.set_major_locator(
            matplotlib.ticker.MultipleLocator(tick_step))
        plt.gca().xaxis.set_major_formatter(
            matplotlib.ticker.StrMethodFormatter("{x:.0f}"))
        plt.show()

    dataset = pd.read_csv("data_formatted.csv")
    forecasting_dataset = pd.read_csv("forecasting.csv")

    # First column is treated as the independent variable, second column as
    # the dependent one (kept 2-D by slicing).
    X = dataset.iloc[:, 0:1].values
    Y = dataset.iloc[:, 1:2].values
    X1 = forecasting_dataset.iloc[:, 0:1].values
    Y1 = forecasting_dataset.iloc[:, 1:2].values

    # Replace missing values of the dependent column by the column mean.
    # The transform is applied a second time, exactly as before.
    imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    Y = imputer.fit_transform(Y)
    Y = imputer.transform(Y)

    # Years are used as plain numbers; no label or one-hot encoding.

    # 80/20 train/test split with a fixed seed.
    # NOTE(review): train_test_split is called without shuffle=False, so the
    # temporal order is presumably lost -- confirm this is intended.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42)

    print("Global Oil consumption forecasting using ARIMA model")
    # NOTE(review): the model is fitted on X_train (first CSV column), not on
    # Y_train -- verify which series was meant to be modelled.
    model_fit = ARIMA(X_train, order=(1, 1, 0)).fit(disp=0)
    print(model_fit.summary())

    # Residual diagnostics: line plot, density plot, summary statistics.
    residuals = DataFrame(model_fit.resid)
    residuals.plot()
    pyplot.show()
    residuals.plot(kind='kde')
    pyplot.show()
    print(residuals.describe())

    # Flatten the x values for plotting. The predicted and forecast y values
    # are hard-coded rather than taken from the fitted model.
    X_test_list = list(X_test.flat)
    Y_pred = [3320, 3400, 3893, 3895]
    Y_forecast = [4394.190, 4421.864, 4719.0507607, 4628.7790, 5074.080]
    X_forecast = list(X1.flat)

    # Test set: actual points against the prediction line, then against bars.
    plt.scatter(X_test, Y_test, color="red")
    plt.plot(X_test_list, Y_pred, color="blue")
    _finish_year_plot("Year vs Consumption (Test set)", 2)

    plt.scatter(X_test, Y_test, color="red")
    plt.bar(X_test_list, Y_pred)
    _finish_year_plot("Year vs Consumption (Test set)", 1)

    # Forecast for the years listed in forecasting.csv: line, then bars.
    plt.plot(X_forecast, Y_forecast, color="blue")
    _finish_year_plot("Year vs Consumption (Forecasting result)", 2)

    plt.bar(X_forecast, Y_forecast)
    _finish_year_plot("Year vs Consumption (Forecasting result)", 1)
def startARIMAForecasting(dataset, P, D, Q, newdates):
    """Fit ARIMA(P, D, Q) on ``dataset`` and forecast one step per new date.

    Args:
        dataset: Series passed unchanged to ``ARIMA``.
        P, D, Q: Components of the ``order`` tuple.
        newdates: Sized collection; only its length is used, as the number
            of steps to forecast.

    Returns:
        Element 0 of the result of ``forecast(len(newdates))``.
    """
    horizon = len(newdates)
    fitted = ARIMA(dataset, order=(P, D, Q)).fit(disp=0)
    forecast_result = fitted.forecast(horizon)
    return forecast_result[0]
# Import the ARIMA module from statsmodels from statsmodels.tsa.arima_model import ARIMA # Forecast temperatures using an ARIMA(1,1,1) model mod = ARIMA(temp_NY, order=(1, 1, 1)) res = mod.fit() # Plot the original series and the forecasted series res.plot_predict(start='1872-01-01', end='2046-01-01') plt.show()
def run_main():
    """Download stock 600519, model weekly closes with ARIMA and plot it.

    Steps: fetch the price history, convert the index to datetimes, resample
    the close price to weekly means, inspect ACF/PACF of the training window
    and of its five-times differenced version, fit ARIMA(6, 1, 5) and plot
    the dynamic prediction from 2017-01-02 next to the weekly series.
    Shows several matplotlib figures and prints the head of the training
    sample; returns nothing.
    """
    # 600519 is the Moutai stock; a date range can be set here, e.g.
    # ts.get_hist_data('600519', start='2015-05-04', end='2018-05-02')
    k = ts.get_hist_data('600519')
    lit = ['open', 'high', 'close', 'low']  # only these four columns are used
    data = k[lit]

    # Convert the object-typed index labels into datetimes.
    d_three = [parse(label) for label in data.index]
    # Build a new DataFrame indexed by the converted dates (date_range()
    # could also be used to generate a time index).
    # NOTE(review): pandas may reindex (not relabel) when given an existing
    # frame plus a new index -- verify data2 is not all-NaN here.
    data2 = pd.DataFrame(data, index=d_three, dtype=np.float64)

    # The raw series is visibly non-stationary, hence the differencing below.
    plt.plot(data2['close'])
    plt.title('股市每日收盘价')
    plt.show()

    # The raw data is dense, so resample to weekly (Monday-anchored) means.
    data2_w = data2['close'].resample('W-MON').mean()
    # Train only on 2015 through 2017.
    data2_train = data2_w['2015':'2017']
    plt.plot(data2_train)
    plt.title('周重采样数据')
    plt.show()

    # ACF of the training data, used to judge q.
    acf = plot_acf(data2_train, lags=20)
    plt.title("股票指数的 ACF")
    acf.show()

    # PACF of the training data, used to judge p.
    pacf = plot_pacf(data2_train, lags=20)
    plt.title("股票指数的 PACF")
    pacf.show()

    # Make the series stationary: difference five times in a row, dropping
    # the NaN produced by each pass (one or two passes are usually enough).
    diff = data2_train
    for _ in range(5):
        diff = diff.diff(1).dropna()

    plt.figure()
    plt.plot(diff)
    plt.title('五阶差分')
    plt.show()

    # ACF of the five-times differenced series, inspected to choose q.
    acf_diff = plot_acf(diff, lags=20)
    plt.title("五阶差分的ACF")
    acf_diff.show()

    # PACF of the five-times differenced series, inspected to choose p.
    pacf_diff = plot_pacf(diff, lags=20)
    plt.title("五阶差分的PACF")
    pacf_diff.show()

    print("train sample data")
    print(data2_train.head())

    # Order chosen from the ACF/PACF plots and the differencing; weekly
    # frequency.
    model = ARIMA(data2_train, order=(6, 1, 5), freq='W-MON')
    # Fit the model.
    arima_result = model.fit()
    # Predict from 2017-01-02 onwards.
    pred_vals = arima_result.predict(
        '2017-01-02', dynamic=True, typ='levels')

    # Put the weekly series and the prediction side by side, labelled by keys.
    stock_forcast = pd.concat([data2_w, pred_vals],
                              axis=1,
                              keys=['original', 'predicted'])

    # Plot actual against predicted values.
    plt.figure()
    plt.plot(stock_forcast)
    plt.title('真实值vs预测值')
    plt.show()
class arima():
    """ARIMA forecaster for one column of a date-indexed frame.

    Fits an ARIMA model on the last ``train_num`` observations of
    ``df[col_name]`` and offers helpers that render the fit, the ACF/PACF and
    the forecast (optionally next to an LSTM forecast) with Streamlit/Altair.
    """

    def __init__(
        self,
        df,
        col_name,
        latest_date,
        obs_num,
        lstm_pred,
        order=(0, 2, 1),
        pred_num=30,
        train_num=200,
    ):
        """Load the tuned order for ``col_name`` (if any) and fit the model.

        Args:
            df: Frame holding the series; ``df[col_name]`` is modelled.
            col_name: Column to model; also the key into the order config.
            latest_date: Date of the last observation; forecasts are dated
                from the following day.
            obs_num: Number of trailing observations shown in plots.
            lstm_pred: Frame whose ``col_name`` column is plotted as the
                LSTM prediction.
            order: ARIMA order used when ``_models/arima_order.pkl`` has no
                entry for ``col_name``.
            pred_num: Number of steps to forecast.
            train_num: Number of trailing observations used for fitting.
        """
        self.df = df
        self.col = col_name
        self.series = df[col_name]
        self.date = latest_date
        self.obs_num = obs_num
        self.pred_num = pred_num
        self.train_num = train_num
        self.lstm_pred = lstm_pred
        # Close the config file deterministically instead of leaking the handle.
        with open('_models/arima_order.pkl', 'rb') as cfg_file:
            best_cfg = load(cfg_file)
        try:
            order = best_cfg[col_name]
        except (KeyError, IndexError, TypeError):
            # No tuned order for this column: keep the caller-supplied one
            # (its default is the previous hard-coded fallback, (0, 2, 1)).
            pass
        # Record the order that is actually fitted.
        self.order = order
        self.model = ARIMA(self.series[-1 * train_num:], order=order)
        self.model_fit = self.model.fit(disp=0)

    def quick_fit_plot(self):
        """create streamlit plot object"""
        st.pyplot(self.model_fit.plot_predict(1, self.obs_num + self.pred_num))

    def plot_acf_pacf(self):
        """Auto correlation function plot on streamlit object"""
        fig, axs = plt.subplots(2)
        plt.subplots_adjust(hspace=0.4)
        plot_acf(self.series[-1 * self.train_num:], ax=axs[0])
        plot_pacf(self.series[-1 * self.train_num:], ax=axs[1])
        st.pyplot(fig)

    def get_pred(self):
        """Return ``(trend, conf_inv)`` for the next ``pred_num`` steps.

        These are elements 0 and 2 of a single ``forecast`` call.
        """
        forecast = self.model_fit.forecast(self.pred_num)
        return forecast[0], forecast[2]

    def _create_df_plot(self, col_type='Cases', arima_on=True, lstm_on=True):
        """create plot data frame of prediction and confidence region for altair plot

        Args:
            col_type: Value of the ``Type`` column for the observed rows.
            arima_on: Include the ARIMA forecast rows (``Type='ARIMA_pred'``).
            lstm_on: Include the LSTM forecast rows (``Type='LSTM_pred'``).

        Returns:
            ``(temp, cl)``: the long-form line data (columns ``cases``,
            ``Date``, ``Type``, ``Cases``) and the confidence band
            (columns ``lower``, ``upper``, ``Date``).
        """

        def localize(x):
            # Thousands-separated, rounded value for the tooltip.
            return "{:,}".format(round(x))

        # One forecast call serves both the trend line and the band.
        forecast = self.model_fit.forecast(self.pred_num)
        future_dates = pd.date_range(start=self.date + dt.timedelta(days=1),
                                     periods=self.pred_num)

        # origin data
        temp = pd.DataFrame(self.series[-1 * self.obs_num:]).rename(
            columns={self.col: 'cases'})
        temp['Date'] = temp.index
        temp['Type'] = col_type
        temp['Cases'] = temp['cases'].apply(localize)

        # predictions
        if arima_on:
            line = pd.DataFrame(forecast[0],
                                index=future_dates,
                                columns=['cases'])
            line['Date'] = line.index
            line['Type'] = 'ARIMA_pred'
            line['Cases'] = line['cases'].apply(localize)
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            temp = pd.concat([temp, line], ignore_index=True)

        # Predictions2
        if lstm_on:
            lstm_line = pd.DataFrame(
                self.lstm_pred[self.col]).rename(columns={self.col: 'cases'})
            lstm_line['Date'] = lstm_line.index
            lstm_line['Type'] = 'LSTM_pred'
            lstm_line['Cases'] = lstm_line['cases'].apply(localize)
            temp = pd.concat([temp, lstm_line], ignore_index=True)

        # confidence region
        cl = pd.DataFrame(forecast[2],
                          index=future_dates,
                          columns=['lower', 'upper'])
        cl['Date'] = cl.index
        return temp, cl

    def draw_single_trend(self,
                          return_chart=False,
                          country_name='Cases',
                          arima_on=True,
                          lstm_on=True):
        """Draw observed values, forecasts and the confidence band.

        Args:
            return_chart: Return the layered chart instead of rendering it
                through Streamlit.
            country_name: Label used for the observed series.
            arima_on: Include the ARIMA forecast line.
            lstm_on: Include the LSTM forecast line.

        Returns:
            The layered Altair chart when ``return_chart`` is true,
            otherwise ``None``.
        """
        # Forward arima_on as well; it used to be accepted but ignored.
        df_plot, df_cl = self._create_df_plot(country_name,
                                              arima_on=arima_on,
                                              lstm_on=lstm_on)
        trend_chart = alt.Chart(df_plot).mark_line().encode(
            x=alt.X("Date:T", scale=alt.Scale(zero=False)),
            y=alt.Y("cases:Q", scale=alt.Scale(zero=False)),
            color=alt.Color('Type',
                            sort=[country_name, 'ARIMA_pred', "LSTM_pred"]),
            strokeDash=alt.condition(
                alt.FieldOneOfPredicate(field='Type',
                                        oneOf=['ARIMA_pred', 'LSTM_pred']),
                alt.value([10, 5]),  # forecast rows: dash pattern [10, 5]
                alt.value([0])),  # every other row: pattern [0]
            tooltip=["Date:T", "Cases:O"]).properties(
                width=800, height=300).interactive()
        band = alt.Chart(df_cl).mark_area(opacity=0.5, color='grey').encode(
            x=alt.X("Date:T", scale=alt.Scale(zero=False)),
            y=alt.Y('lower', title='cases'),
            y2=alt.Y2('upper', title='cases')).properties(
                width=800, height=300).interactive()
        if return_chart:
            return band + trend_chart
        else:
            st.altair_chart(band + trend_chart)
# Walk-forward validation of an ARIMA(5, 2, 0) model on square-root
# transformed rainfall values taken from `O` (defined earlier in the script).
from pandas import datetime
from matplotlib import pyplot
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np
# Flatten O into one list of 20 * 52 = 1040 values, with j as the outer loop
# and i as the inner loop.
# NOTE(review): presumably i is the week and j the year -- confirm the layout of O.
series = [O[i][j]['rainfall'] for j in range(20) for i in range(52)]
# Model the square root of the values.
X = np.sqrt(series)
# NOTE(review): `size` (66% of the data) is computed but never used below;
# the split point is the hard-coded 780.
size = int(len(X) * 0.66)
train, test = X[0:780], X[780:len(X)]
history = [x for x in train]
predictions = list()
for t in range(len(test)):
    # Refit on everything seen so far and forecast one step ahead.
    model = ARIMA(history, order=(5, 2, 0))
    model_fit = model.fit(disp=0, )
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)
    # Reveal the true value so the next fit includes it.
    obs = test[t]
    history.append(obs)
    print('predicted=%f, expected=%f' % (yhat, obs))
# The error is measured on the square-root scale, because `test` is a slice of X.
error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
# plot
pyplot.plot(test)
pyplot.plot(predictions, color='red')
pyplot.show()
class Kappa(object): def __init__(self, fname, cvparams=True): ''' Input: file path to dataframe to be used for analysis Output: Kappa object with cleaned dataframe ''' self.fname = fname temp = pickle.load(open(fname)) self.df = add_ngames_col(nstreams_filter(600,temp)) self.df.set_index('date', inplace=True) self.df['dayofweek'] = self.df.index.dayofweek self.df['weekofyear'] = self.df.index.weekofyear self.df['year'] = self.df.index.year # self.rfr_params = {'n_estimators':300, # 'max_features':'sqrt', # 'n_jobs':-1} if cvparams==True: self.cvparams = pickle.load(open('pickle_pile/cross_val_smorc.pkl','rb')) self.cv_pred = {} def cfilt(self,channel): ''' Input: channel to filter for Output: channel-specific dataframe ''' self.cdf=chan_filter(self.df, channel) self.cdf.sort_index(inplace=True) def dumb_set(self,dft): ''' Input: Initial dataframe Output: X, y for machine learning algorithms with dummified categorical variables. ''' # for i in np.arange(7): # dft['lagday{0}'.format(i+1)] = (dft["AVG CCV's"].shift((i+1))).fillna(0) y = dft["AVG CCV's"] dc = ['AirTime','Platform', 'tdelta','avg_frequency','Language', 'index','#', "AVG CCV's", "Max CCV's", 'Hours Watched']# 'Hours Watched', 'Channel', 'Main Game' X = dft.drop(dc, axis=1) X = pd.get_dummies(X, columns = ['Channel', 'Main Game', 'dayofweek']) #eventually add Language return X, y def _make_holdout_split(self, df, leaveout=3): # ''' Input: dataframe and # leaveout weeks. 
Output: X,y training and hold data partitions ''' # self.folds = pd lod = leaveout*7 start, end = df.index.min(), df.index.max() self.folds = pd.date_range(start, end, freq='7D')#'{0}D'.format(lod)) self.Xset, self.yset = self.dumb_set(df) lo = self.folds[-leaveout:][0] X_trainset = self.Xset.query('date < @lo') y_trainset = self.yset.reset_index().query('date < @lo') X_holdset = self.Xset.query('date >= @lo') y_holdset = self.yset.reset_index().query('date >= @lo') self.X_trainset = X_trainset.copy() self.y_trainset = y_trainset.copy().set_index('date') self.X_holdset = X_holdset.copy() self.y_holdset = y_holdset.copy().set_index('date') self.X_train = self.X_trainset.reset_index().copy() self.y_train = self.y_trainset.reset_index().copy() self.X_hold = self.X_holdset.reset_index().copy() self.y_hold = self.y_holdset.reset_index().copy() def _fchain_kfold_indicies(self, lag=1, ahead=1): ''' Input: lag weeks, ahead weeks Output: forward chain kfold cross validation indices for time series. ''' #currently avoiding dummy problems by dummifying early, need to fix # and adapt later down the road. 
Also add cols ld = pd.Timedelta(days=lag*7) ad = pd.Timedelta(days=ahead*7) kstart, kend = self.X_trainset.index.min(), self.X_trainset.index.max() period = lag*7 + ahead*7 self.kfolds = pd.date_range(kstart,kend, freq='{0}D'.format(period)) self.train_kfoldi = [] self.test_kfoldi = [] self.fkfoldi = [] # self.kfoldxi = [] # self.kfoldyi = [] for i, f in enumerate(self.kfolds): j = 1+i if f==kstart: #For first fold udb = self.kfolds[1] - ad train_xset = self.X_train.query('date < @udb') train_yset = self.y_train.query('date < @udb') test_yset = self.y_train.query('date >= @udb & date < @self.kfolds[1]') test_xset = self.X_train.query('date >= @udb & date < @self.kfolds[1]') elif i == len(self.kfolds)-1: #For last fold udb = kend - ad train_yset = self.y_train.query('date < @udb') train_xset = self.X_train.query('date < @udb') test_xset = self.X_train.query('date >= @udb') test_yset = self.y_train.query('date >= @udb') else: #middle folds udb = self.kfolds[j]-ad train_xset = self.X_train.query('date < @udb') train_yset = self.y_train.query('date < @udb') test_xset = self.X_train.query('date >= @udb & date < @self.kfolds[@j]') test_yset = self.y_train.query('date >= @udb & date < @self.kfolds[@j]') self.testx_ind = test_xset.index.values self.testy_ind = test_yset.index.values self.trainx_ind = train_xset.index.values self.trainy_ind = train_yset.index.values self.train_kfoldi.append([self.trainx_ind, self.trainy_ind]) self.test_kfoldi.append([self.testx_ind, self.testy_ind]) self.fkfoldi.append((self.trainx_ind, self.testx_ind)) self.Xtrn = self.X_train.drop('date', axis=1) self.ytrn = self.y_train.drop('date', axis=1) # self.Xhld = self.X_hold.drop('date', axis=1) # self.yhld = self.yhld.drop('date', axis=1) def run_cvmod(self, channel): ''' DEPRECATED: Originally used for testing/debugging ''' self.cfilt(channel) # self.cdf.sort_index(inplace=True) self._make_holdout_split(mk.cdf) self._fchain_kfold_indicies() # self.Xtrn = self.X_train.drop('date', axis=1) # 
self.ytrn = self.y_train.drop('date', axis=1) self.rfrcv = RandomForestRegressor(**self.rfr_params) mses = [] r2s = [] for j in xrange(len(self.kfolds)): print '******' print 'Evaluating Fold #{0}'.format(j) print '******' Xtrain_indices, ytrain_indices = self.train_kfoldi[j] Xtest_indices, ytest_indices = self.test_kfoldi[j] xtrain = self.Xtrn.iloc[Xtrain_indices] ytrain = self.ytrn.iloc[ytrain_indices] xtest = self.Xtrn.iloc[Xtest_indices] ytest = self.ytrn.iloc[ytest_indices] self.rfrcv.fit(xtrain.values, ytrain.values) ypred = self.rfrcv.predict(xtest) mses.append(mean_squared_error(ytest.values, ypred)) r2s.append(r2_score(ytest.values, ypred)) self.rmses, self.r2_scores = np.sqrt(mses), np.array(r2s) def test_stationarity(self,channel): ''' Input: channel for testing Output: Results of Dickey-Fuller test and plot with data, rolling mean, and rolling std. ''' #requires date index ts = chan_filter(self.df, channel) ts.sort_index(inplace=True) timeseries = ts["AVG CCV's"] rolmean = pd.Series.rolling(timeseries, window=7).mean() rolstd = pd.Series.rolling(timeseries, window=7).std() orig = plt.plot(timeseries, color='blue', label='Original') mean = plt.plot(rolmean, color='red', label='Rolling Mean') std = plt.plot(rolstd, color = 'black', label='Rolling std') plt.legend(loc='best') plt.show(block=False) print 'Results of Dickey-Fuller Test' dftest = adfuller(timeseries, autolag='AIC') dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number Observations Used']) for key, value in dftest[4].items(): dfoutput['Critical Value (%s)' % key] = value print dfoutput def plot_acf_pacf(self, channel, lags=20): ''' Input: channel and #lags to include Output: Plots with autocorrelation function and partial autocorrelation function. 
''' #set indexto date in input ts = chan_filter(self.df, channel) ts.sort_index(inplace=True) data = ts["AVG CCV's"] fig = plt.figure(figsize=(12,8)) ax1 = fig.add_subplot(211) fig = plot_acf(data, lags=lags, ax=ax1) ax2 = fig.add_subplot(212) fig = plot_pacf(data, lags=lags, ax=ax2) plt.show() def pltrange(self):#, indx_range=None): ''' DEPRECATED: Used for early EDA, data-viz and results analysis ''' ypred = self.rfrcv.predict(self.Xtrn) # if indx_range != None: # plt.plot(ypred[indx_range]) # plt.plot(self.ytrn.values[indx_range]) # plt.show(block=False) # else: plt.plot(ypred) plt.plot(self.ytrn.values) plt.show(block=False) def run_grid_search(self, estimator): ''' Input: estimator name Output: best parameters for a given estimator ''' if estimator.__class__.__name__ == 'RandomForestRegressor': self.gridsearch = GridSearchCV(estimator, self.rfr_gsparams, n_jobs=-1, verbose=True, scoring='mean_squared_error', cv=self.fkfoldi) #have this functionr return best params then pass those as argument to cross_val_score and loop through different channels elif estimator.__class__.__name__ == 'GradientBoostingRegressor': self.gridsearch = GridSearchCV(estimator, self.gboostR_gsparams, n_jobs=-1, verbose=True, scoring='mean_squared_error', cv=self.fkfoldi) self.gridsearch.fit(self.Xtrn, self.ytrn) print self.gridsearch.best_params_ print 'for ', estimator.__class__.__name__ def eval_models(self, channel):#deprecated, cvpredict needs partitions ''' DEPRECATED: cvpredict requires full partitions of cross val indices. Was attempting to simplify code however forward chain cross val not compatible with cvpredict. 
''' self.cfilt(channel) self._make_holdout_split(self.cdf) self._fchain_kfold_indicies() lassoCV_params = {'cv': self.fkfoldi, 'n_jobs':-1, 'alphas':np.logspace(-4,2,100)} ridgeCV_params = {'cv': self.fkfoldi, 'alphas':np.logspace(-4,2,100), 'scoring':'mean_squared_error'} models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params']), GradientBoostingRegressor(**self.cvparams[channel]['GradientBoostingRegressor']['params']), LassoCV(**lassoCV_params), RidgeCV(**ridgeCV_params)] self.cv_pred[channel] = {} for mod in models: self.cv_pred[channel][mod.__class__.__name__] = cross_val_predict(estimator=mod, X=self.Xtrn, y=self.ytrn, cv=self.fkfoldi, n_jobs=-1) pickle.dump(self.cv_pred[channel], open('pickle_pile/{0}_cvpred.pkl'.format(channel),'wb')) def linear_kappa_search(self): ''' DEPRECATED: more efficient method used elsewhere. Input: Output: ''' channels = ['lirik', 'summit1g', 'imaqtpie', 'nl_kripp', 'destiny'] self.lcvp = {} for channel in channels: self.lcvp[channel] = {} self.cfilt(channel) self._make_holdout_split(self.cdf) self._fchain_kfold_indicies() lassoCV_params = {'cv': self.fkfoldi, 'n_jobs':-1, 'alphas':np.logspace(-4,2,100)} ridgeCV_params = {'cv': self.fkfoldi, 'alphas':np.logspace(-4,2,100), 'scoring':'mean_squared_error'} models = [LassoCV(**lassoCV_params), RidgeCV(**ridgeCV_params)] for regression in models: reg_name = regression.__class__.__name__ self.lcvp[channel][reg_name] = {} regression.fit(self.Xtrn, self.ytrn) self.lcvp[channel][reg_name][alpha] = regression.alpha_ mse_scores = cross_val_score(estimator=regression, X=self.Xtrn, y=self.ytrn, scoring='mean_squared_error', n_jobs=-1) self.lcvp[channel][reg_name][rmse] = np.sqrt(mse_score).mean() def kappa_search(self, channel, estimator): ''' Input: Channel for which to optimize model, estimator for model. Output: Gridsearch on estimator using parameters defined in function. 
''' self.cfilt(channel) self._make_holdout_split(self.cdf) self._fchain_kfold_indicies() self.rfr_gsparams = {'n_estimators': [10, 100, 200, 300], 'criterion': ['mse'], 'min_samples_split': [2,4,6], 'min_samples_leaf': [1,2], 'max_features': ['sqrt', None, 'log2'], 'n_jobs':[-1] } self.gboostR_gsparams = {'loss': ['ls','lad','huber'], 'learning_rate': [.001, .01, .1, 1, 2], 'n_estimators': [50, 100, 200], 'max_depth': [2,5,8,10], 'max_features': [None,'sqrt','log2'] } self.xts = self.X_train.set_index('date') self.yts = self.y_train.set_index('date') self.arima_params = {'endog': self.yts, 'order': (2,1,2)} self.run_grid_search(estimator) def load_newh(self): ''' Due to time-lapse in data collection and analysis, new data had been acquired that could be analyzed. Output: New dataframe containing completely unseen data. ''' temp = pickle.load(open('pickle_pile/dfg.pkl', 'rb')) dfg = nstreams_filter(600, temp) dfg.set_index('date', inplace=True) dfg['year']=dfg.index.year return dfg def _find_holdout_date_thresh(self,channel): ''' Input: channel Output: date range of holdout set ''' self.cfilt(channel) self._make_holdout_split(self.cdf) return self.X_hold['date'].min(), self.cdf.index.max() def eval_holdout_data(self, channel): ''' Used after adding freshly collected data. Evaluates models using previously gridsearch-optimized estimators. Currently, because of dummy variables, these need to be created early in the process to ensure proper dimensionality of categorical features. This step is not necessary if used in graphlab due its superious handling of categorical variables. Further, creation of dummie dictionary using training set and then adding dummy columns to holdout data encoded by dummy dictionary also works. 
''' dhmin, dhmax = self._find_holdout_date_thresh(channel) self.dfn = chan_filter(self.load_newh(), channel) self.dfn.sort_index(inplace=True) dft = self.dfn.query('date > @dhmax') self.dfu = pd.concat([self.cdf,dft]) new_hold_num = self.dfu.query('date >= @dhmin').shape[0] lon = new_hold_num/7 # dd = (self.X_hold.shape[0] + dft.shape[0])/7 self._make_holdout_split(self.dfu, leaveout=lon) self._fchain_kfold_indicies() lassoCV_params = {'cv': self.fkfoldi, 'n_jobs':-1, 'alphas':np.logspace(-4,2,100)} ridgeCV_params = {'cv': self.fkfoldi, 'alphas':np.logspace(-4,2,100), 'scoring':'mean_squared_error'} models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params']), GradientBoostingRegressor(**self.cvparams[channel]['GradientBoostingRegressor']['params']), LassoCV(**lassoCV_params), RidgeCV(**ridgeCV_params)] xh = self.X_hold.drop('date', axis=1) yh = self.y_hold.drop('date', axis=1).values plt.figure(figsize=(16,10)) plt.plot(yh, label='True Values', color='black') for mod in models: mod.fit(self.Xtrn, self.ytrn) mod_name = mod.__class__.__name__ ypred = mod.predict(xh) mses = mean_squared_error(yh, ypred) rmse = np.sqrt(mses).mean() plt.plot(ypred, label = 'rmse: {0}, for model: {1}'.format(rmse, mod_name)) plt.legend(loc='best') plt.show(block=False) def SMOrc_findbest(self): ''' Output: pickled dictionary containing optimized model parameters for a specific channel. Runs gridsearch for optimal parameters for each model, for each channel. This is not ideal, but currently best option due to dramatic differences between individual channels. Further work involves creation/optimization of general model that will not be tailored for a specific channel. Fun Fact: Variable name comes from twitch.tv emote that depicts a brutish orc, representing the brute force method of optimization used. 
''' self.prep_arima() #Has to be done with dummie info so won't get error when instatiating in list with no input arguments models = [RandomForestRegressor(), GradientBoostingRegressor(), ARIMA(**self.arima_params)] channels = ['lirik', 'summit1g', 'imaqtpie', 'nl_kripp', 'destiny', 'admiral_bahroo'] self.cvscores = {} for channel in channels: self.cvscores[channel] = {} for model in models: print 'Running ',model.__class__.__name__, ' for channel: ', channel self.kappa_search(channel, model) self.cvscores[channel][model.__class__.__name__] = {} if model.__class__.__name__ != 'ARIMA': self.cvscores[channel][model.__class__.__name__]['params'] = self.gridsearch.best_params_ # self.mod = model(**self.gridsearch.best_params_) mod = self.gridsearch.best_estimator_ self.cvscores[channel][model.__class__.__name__]['scores'] = cross_val_score(estimator=mod, X=self.Xtrn, y=self.ytrn, scoring='mean_squared_error', cv=self.fkfoldi, n_jobs=-1) else: pass # self.prep_arima(channel) # mod = model(**self.arima_params) # cvscore[channel][model.__class__.__name__]['scores'] = cross_val_score(estimator=mod, X=self.xts, y=self.yts, scoring='mean_squared_error', cv=self.ffkoldi, n_jobs=-1) pickle.dump(self.cvscores, open('pickle_pile/cross_val_SMOrc.pkl', 'wb')) # for estimator in models: # self.cvscores[estimator] = {} # for channel in channels: # print 'Running ',estimator.__class__.__name__, ' for channel: ', channel # self.kappa_search(channel, estimator) # self.cvscores[estimator][channel] = {} # if estimator.__class__.__name__ != 'ARIMA': # self.cvscores[estimator][channel][params] = self.gridsearch.best_params_ # self.cvscores[estimator][channel][scores] = cross_val_score(estimator(**self.gridsearch.best_params_),self.Xtrn, self.ytrn, scoring='mean_squared_error', cv=self.fkfoldi) # else: # cvscore[estimator][channel][scores] = cross_val_score(estimator(**arima_params), scoring='mean_squared_error', cv=self.fkoldi) # pickle.dump(self.cvscores, 
open('pickle_pile/cross_val_SMOrc.pkl', wb)) # def DansGame(self, channel): # # self.cfilt(channel) # self._make_holdout_split(self.cdf) # self._fchain_kfold_indicies() # # self.rfr = RandomForestRegressor(**self.cvparams[channel][RandomForestRegressor]) # # pass def run_arima(self):#use current build ''' DEPRECATED: Primarily used for testing/debugging. Runs statsmodels ARIMA. ''' self.xts = self.X_train.set_index('date') self.yts = self.y_train.set_index('date') self.yts.astype('float', inplace=True) self.arimod = ARIMA(endog = self.yts, order = (2,1,2))#, exog=self.xts) self.aresults = self.arimod.fit() def prep_arima(self, channel='lirik'):#use current build ''' DEPRECATED: Required to 'prep' due to statsmodels' ARIMA not following the same flow as primarily used sklearn models. Abandoned in-lieu of R arima methods. ''' self.cfilt(channel) self._make_holdout_split(self.cdf) self._fchain_kfold_indicies() self.xts = self.X_train.set_index('date') self.yts = self.y_train.set_index('date') self.yts.astype('float', inplace=True) self.arima_params = {'endog': self.yts, 'order': (2,1,2)} # self.arimod = ARIMA(endog = self.yts, order = (2,1,2))#, exog=self.xts) # self.aresults = self.arimod.fit() # # def EleGiggle(self, tname, dname, ivars): # self.r = robjects.r(""" # tset = read.csv("{0}") # dset = read.csv("{1}") # y = dset["AVG CCV's"] # features = {2} # X = train_set[features] # X_test = test_set[features] # fit = auto.arima(y, xreg=X) # ypred = forecast(fit, xreg=X_test) # """.format(train_name, test_name, ivars)) # rp = robjects.r("""ypred['mean']""")[0] # ypred = [rp[i] for i in range(len(rp))] # return ypred def SeemsGood(self): ''' Input: Output: Saves figures comparing model performance for each channel listed below. 
''' channels = ['lirik', 'nl_kripp', 'imaqtpie', 'summit1g'] # channels = ['lirik'] tscores = {} for channel in channels: tscores[channel] = {} self.cfilt(channel) self._make_holdout_split(self.cdf) self._fchain_kfold_indicies() # self.prep_arima(channel) # self.arimod = ARIMA(**self.arima_params) self.prep_arima(channel) lassoCV_params = {'cv': self.fkfoldi, 'n_jobs':-1, 'alphas':np.logspace(-4,2,100)} ridgeCV_params = {'cv': self.fkfoldi, 'alphas':np.logspace(-4,2,100), 'scoring':'mean_squared_error'} lassy = LassoCV(**lassoCV_params) ridge = RidgeCV(**ridgeCV_params) lassy.fit(self.Xtrn,self.ytrn) ridge.fit(self.Xtrn, self.ytrn) x = {} yp = {} y = {} mses = {} # resA = [] for j in xrange(len(self.kfolds)): Xtrain_indices, ytrain_indices = self.train_kfoldi[j] Xtest_indices, ytest_indices = self.test_kfoldi[j] xtrain = self.Xtrn.iloc[Xtrain_indices] ytrain = self.ytrn.iloc[ytrain_indices] xtest = self.Xtrn.iloc[Xtest_indices] ytest = self.ytrn.iloc[ytest_indices] # models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params'])] models = [RandomForestRegressor(**self.cvparams[channel]['RandomForestRegressor']['params']), GradientBoostingRegressor(**self.cvparams[channel]['GradientBoostingRegressor']['params']),Ridge(ridge.alpha_)] #Lasso(lassy.alpha_)]#, ] # models = [ARIMA(endog=yits, order= (2,1,2))] #exog=xits)] # models = ['ARIMA'] for mod in models: mod_name = mod.__class__.__name__ print '******' print mod_name print '******' if j==0: x[mod_name], y[mod_name], yp[mod_name], mses[mod_name] = [], [], [], [] if mod_name != 'ARIMA': mod.fit(xtrain,ytrain) ypred = mod.predict(xtest) yp[mod_name].append(ypred) y[mod_name].append(ytest.values) x[mod_name].append(xtest) mses[mod_name].append(mean_squared_error(ytest.values,ypred)) # elif mod=='ARIMA': # yits = self.y_train.iloc[ytrain_indices] # yits.set_index('date',inplace=True) # yits.astype('float', inplace=True) # xits = self.X_train.iloc[Xtrain_indices] # 
xits.set_index('date',inplace=True) # xits.astype('float', inplace=True) # # ydts = self.y_train.iloc[ytest_indices] # ydts.set_index('date',inplace=True) # ydts.astype('float', inplace=True) # xdts = self.X_train.iloc[Xtest_indices] # xdts.set_index('date',inplace=True) # xdts.astype('float', inplace=True) # ivars = 'c({})'.format(u' '.join(list(xits.columns)).encode('utf-8')[1:-1]) # tfold = pd.concat([xits,yits], axis=1) # tname = 'pickle_pile/tfold{0}.csv'.format(j) # # tname = 'tfold{0}.csv'.format(j) # # tname = ''.join(tname).encode('utf-8') # tfold.to_csv(tname) # # # tname = 'pickle_pile/tfold.csv' # # dfold = pd.concat([xdts,ydts], axis=1) # dname = 'pickle_pile/dfold{0}.csv'.format(j) # # dname = 'pickle_pile/dfold.csv' # dfold = to_csv(dname) # try: # ypred = self.EleGiggle(tname, dname, ivars) # yp[mod_name].append(ypred) # y[mod_name].append(ytest.values) # x[mod_name].append(xtest) # mses[mod_name].append(mean_squared_error(ytest.values,ypred)) # except: # print 'Nope' # else: # ares = mod.fit() # ypred= ares.predict(start=ydts.index.min(), end=ydts.index.max()) # yp[mod_name].append(ypred) # y[mod_name].append(ytest.values) # x[mod_name].append(xtest) # mses[mod_name].append(mean_squared_error(ytest.values,ypred)) self.xym = {} fig = plt.figure(figsize=(16,10)) plt.tick_params(labelsize=18) for mod in models: mod_name = mod.__class__.__name__ self.xym[mod_name] = {} self.xym[mod_name]['y'] = list(itertools.chain.from_iterable(y[mod_name])) self.xym[mod_name]['x'] = list(itertools.chain.from_iterable(x[mod_name])) self.xym[mod_name]['yp'] = list(itertools.chain.from_iterable(yp[mod_name])) self.xym[mod_name]['rmse'] = np.average(np.sqrt(mses[mod_name])) if mod_name=='RandomForestRegressor': plt.plot(self.xym[mod_name]['yp'], color='blue', linewidth=2, label=mod_name+' rmse: {0}'.format(self.xym[mod_name]['rmse'])) elif mod_name=='GradientBoostingRegressor': plt.plot(self.xym[mod_name]['yp'], 'g--', linewidth=2, label=mod_name+' rmse: 
{0}'.format(self.xym[mod_name]['rmse'])) elif mod_name=='Ridge': plt.plot(self.xym[mod_name]['yp'], 'r--', linewidth=2, label=mod_name+' rmse: {0}'.format(self.xym[mod_name]['rmse'])) # plt.scatter(self.Xtrn.index[:len(self.xym[mod_name]['y'])], self.xym[mod_name]['y'], color='green', marker='o', label='true') plt.plot(self.xym[mod_name]['y'], color='magenta', label='true', linewidth=2) plt.legend(loc='best', prop={'size':14}) # plt.show() plt.savefig('figures/cv_{0}_rflmodel.png'.format(channel))
# Finish the figure started in the preceding cell.
plt.legend(loc='best')
plt.title('Log - expwighted_avg - moving_avg')
plt.show(block=False)

# In[16]:
# First difference of the log series. shift() leaves a missing value in the
# first slot, which is dropped after plotting.
ts_log_diff = ts_log - ts_log.shift()
plt.plot(ts_log_diff)
ts_log_diff.dropna(inplace=True)

# In[24]:
from statsmodels.tsa.arima_model import ARIMA

# Fit ARIMA(2, 1, 2) on the log series; disp=-1 is passed to fit() to keep it quiet.
model = ARIMA(ts_log, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)

# Back-transform: the fitted values are treated as per-step changes, so they
# are accumulated, added to the first log observation, and exponentiated.
predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
# Constant series holding the first log value on the full index; fill_value=0
# lets index entries missing from the cumulative sum keep that base value.
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(
    predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA = np.exp(predictions_ARIMA_log)

# Overlay the reconstruction on the original series; the title reports RMSE.
plt.plot(ts)
plt.plot(predictions_ARIMA)
plt.title('RMSE: %.4f' % np.sqrt(sum((predictions_ARIMA - ts)**2) / len(ts)))
from statsmodels.tsa.arima_model import ARIMA
from scipy.stats import gaussian_kde
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose


def norm(x):
    """Min-max scale *x*: subtract its minimum and divide by its range.

    A constant input has a zero range, so the division is by zero.
    """
    return (x-np.min(x))/(np.max(x)-np.min(x))


dataframe = pd.read_csv('Chaotic_TimeSeries_turkey_elec.csv')
dataframe.head()
plt.plot(dataframe)
# .iloc replaces the removed .ix indexer; the first column is selected by
# position throughout.
autocorrelation_plot(dataframe.iloc[:, 0])

### AVALIAR V3 LINHAS  (Portuguese: "evaluate V3 lines")
model00 = ARIMA(np.array(dataframe.iloc[:, 0]), dates=None, order=(2, 1, 0))
model11 = model00.fit(disp=1)
model11.summary()
model11.forecast()

resid9 = model11.resid
# Mean absolute residual relative to the series maximum (shown as cell output).
np.mean(abs(resid9))/max(np.array(dataframe.iloc[:, 0]))

# Drop NaNs from the residuals before the stationarity test.
x3 = resid9
x3 = x3[numpy.logical_not(numpy.isnan(x3))]
dftest13 = adfuller(x3, autolag='AIC')
dfoutput1 = pd.Series(dftest13[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
print('Dickey Fuller Test:\n',dfoutput1)

# Window settings over the residual series, consumed by later cells.
look_back=200
start=0
end=len(resid9)
lag=look_back
ax[1].set_title("First-order differences of DJIA during Jan 2016-Dec 2016") # plot signal plotds(first_order_diff, nlag=50) adf_result = adfuller(first_order_diff) print("ADF Statistic: %f" % adf_result[0]) print("p-value: %f" % adf_result[1]) # Optimize ARMA parameters aicVal = [] for d in range(1, 3): for ari in range(0, 3): for maj in range(0, 3): try: arima_obj = ARIMA(djia_df["Close"].tolist(), order=(ari, d, maj)) arima_obj_fit = arima_obj.fit() aicVal.append([ari, d, maj, arima_obj_fit.aic]) except ValueError: pass # Optimal ARIMA model arima_obj = ARIMA(djia_df["Close"].tolist(), order=(0, 2, 1)) arima_obj_fit = arima_obj.fit(disp=0) arima_obj_fit.summary() # Evaluate prediction pred = np.append([0, 0], arima_obj_fit.fittedvalues.tolist()) djia_df["ARIMA"] = pred diffval = np.append([0, 0], arima_obj_fit.resid + arima_obj_fit.fittedvalues) djia_df["diffval"] = diffval
pplt.autoscale(enable=True, axis='x', tight=None)
pplt.show()

# In[26]:
# Seasonal decomposition of the entropy values with a period of 24 samples.
decomposition = seasonal_decompose(calc_ent2.entropy.values, freq=24)
# decomposition.plot() returns the figure it draws on, so no figure is created
# beforehand (a pre-created one would be left behind empty).
fig = decomposition.plot()
fig.set_size_inches(15, 8)

# In[44]:
## The endogenous variable needs to be type Float or you get a cast error
model = ARIMA(calc_ent2['entropy'], (1, 0, 0))
model_fit = model.fit()  # fit is a Function
model_fitted = model_fit.fittedvalues  # fittedvalues is a Series
print(model_fit.summary())
print(model_fitted)

# In[29]:
from pprint import pprint
# get a variety of different attributes from the object (including functions)
#pprint (dir(model))
#pprint (dir(model_fit))

# In[30]:
print(model.endog_names)
from scipy.stats import gaussian_kde
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose


def norm(x):
    """Min-max scale *x*: subtract its minimum and divide by its range.

    A constant input has a zero range, so the division is by zero.
    """
    return (x-np.min(x))/(np.max(x)-np.min(x))


# Row slice applied to the CSV: the last ten rows are left out.
start=0
end=-10
dataframe = pd.read_csv('Apple_Data_300.csv')[start:end]
dataframe.head()
# .iloc replaces the removed .ix indexer; the fifth column is selected by
# position throughout.
autocorrelation_plot(dataframe.iloc[:, 4])

### AVALIAR V3 LINHAS  (Portuguese: "evaluate V3 lines")
model00 = ARIMA(np.array(dataframe.iloc[:, 4]), dates=None, order=(2, 1, 0))
model11 = model00.fit(disp=1)
model11.summary()
model11.forecast()

resid9 = model11.resid
# Mean absolute residual relative to the series maximum (shown as cell output).
np.mean(abs(resid9))/max(np.array(dataframe.iloc[:, 4]))

# Drop NaNs from the residuals before the stationarity test.
x3 = resid9
x3 = x3[numpy.logical_not(numpy.isnan(x3))]
dftest13 = adfuller(x3, autolag='AIC')
dfoutput1 = pd.Series(dftest13[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
print('Dickey Fuller Test:\n',dfoutput1)

# Window settings over the residual series, consumed by later cells; start and
# end are re-bound here and no longer describe the CSV slice.
look_back=200
start=0
end=len(resid9)
lag=look_back
# decide the structure (p,q) of the model ------------------------------------ fig = plt.figure(figsize=(12,8)) ax1 = fig.add_subplot(211) fig = sm.graphics.tsa.plot_acf(residual, lags=40, ax=ax1) ax2 = fig.add_subplot(212) fig = sm.graphics.tsa.plot_pacf(residual, lags=40, ax=ax2) plt.show() ## decide the parameter of the model ------------------------------------------ from statsmodels.tsa.arima_model import ARIMA model = ARIMA(residual, order=(3, 0, 2)) results_ARIMA = model.fit(disp=-1) ARIMA_predict=results_ARIMA.predict('1959-07-01','1969-12-01') ARIMA_all=pd.concat([results_ARIMA.fittedvalues,ARIMA_predict]) plt.plot(residual_1,color='k') plt.plot(residual) plt.plot(results_ARIMA.fittedvalues, color='red') plt.plot(ARIMA_all, color='red') plt.show()
def _report_adf(label, adf_result):
    """Print the statistic, p-value and critical values of one ADF test."""
    print('ADF Statistic: for %s %f' % (label, adf_result[0]))
    print('p-value: %f' % adf_result[1])
    print('Critical Values:')
    for key, value in adf_result[4].items():
        print('\t%s: %.3f' % (key, value))


def _gradient_frame(timestamps, gradients, values, prefix):
    """Return a frame indexed by time with '<prefix>_Gradient' and '<prefix>'."""
    frame = pd.DataFrame({prefix + '_Gradient': gradients},
                         index=pd.Index(timestamps, name='Index'))
    # Assigning a Series aligns it on the time index.
    frame[prefix] = values
    return frame


def feature_selecion():
    """Build price / volatility gradient features for one news window.

    Reads the GBP/USD minute file, keeps the rows between the hard-coded start
    and end dates, and then:

    * plots the close price and its gradient (image 1);
    * runs ADF tests on the price, the log returns and the ARIMA residuals;
    * fits an ARIMA(4, 1, 1) on the log returns;
    * fits four GARCH configurations and keeps the one with the lowest BIC;
    * plots price, conditional volatility and both gradients (image 2);
    * writes the joined feature table to static/anomalies/features.csv.

    Returns:
        The string "done".
    """
    start_date = '2016-06-01'
    end_date = '2016-07-01'
    data_file = "static/data/GBPUSD/DAT_MT_GBPUSD_M1_2016.csv"
    news = ["Brexit", "US presidential election 2012"]
    currency = ["GBP/USD", "EUR/USD"]
    example_number = 0
    pair = currency[example_number]
    event = news[example_number]

    #price
    data = read_csv(data_file)
    # Combine the date and time columns, parse them in one vectorised call and
    # shift every timestamp back by two hours.
    data['Time'] = to_datetime(data['Date'] + ' ' + data['Time']) - timedelta(hours=2)
    data.index = data.Time
    mask = (data.index > start_date) & (data.index <= end_date)
    data = data.loc[mask]
    series = data["Close"]
    np_array_dates = np.array(data.index)
    gradients = np.gradient(np.array(data['Close']))

    #price and the gradient
    fig = plt.figure()
    price_ax = fig.add_subplot(211)
    price_ax.plot(series)
    price_ax.set_title(pair + ' prices during ' + event + ' time period')
    price_ax.set_xlabel('Time')
    price_ax.set_ylabel('Price')
    gradient_ax = fig.add_subplot(212)
    gradient_ax.set_title('Gradients of the price series')
    gradient_ax.set_xlabel('Time')
    gradient_ax.set_ylabel('Gradient')
    gradient_ax.plot(np_array_dates, gradients)
    fig.savefig("static/anomalies/feature_lection_image1.png")
    plt.close(fig)

    price_list = series.values
    ADF_result_price = adfuller(price_list)
    _report_adf('series', ADF_result_price)  #p-value: 0.668171

    #create log return series
    series_log_ret = np.log(data.Close) - np.log(data.Close.shift(1))
    series_log_ret = series_log_ret.dropna()
    log_return_list = series_log_ret.values
    ADF_result_log_return = adfuller(log_return_list)
    #p-value: 0.000000 therefore, null hypothesis is rejected. the system is stationary
    _report_adf('series_log_ret', ADF_result_log_return)

    #testing for stationarity in series
    # Element 1 is the p-value; element 0 is the test statistic and is not
    # comparable with a 0.05 significance level.
    if ADF_result_price[1] < 0.05:
        input_series = price_list
    else:
        input_series = log_return_list
    # NOTE(review): input_series is not consumed below; the ARIMA model is
    # always fitted on the log returns. Confirm which input is intended.

    #Creating the ARIMA model
    arima_model = ARIMA(series_log_ret, order=(4, 1, 1))
    model_fit = arima_model.fit(disp=0)
    print(model_fit.summary())

    #checking for stationarity in the residual series
    # The residuals are tested as they are (not squared).
    residual_values = np.asarray(model_fit.resid).ravel()
    ADF_result_residual = adfuller(residual_values)
    #p-value: 0.000000 therefore, null hypothesis is rejected. the system is stationary
    _report_adf('residuals', ADF_result_residual)

    #different configurations for GARCH model
    configurations = [[2, 0, 0], [2, 0, 1], [1, 0, 0], [1, 0, 1]]

    #getting the most suitable configuration
    # The running best BIC lives outside the loop; resetting it per iteration
    # would make every candidate win and leave the last one selected.
    best_bic = np.inf
    opt_model = None
    for p_order, o_order, q_order in configurations:
        garch_model = arch_model(series_log_ret, p=p_order, o=o_order, q=q_order)
        candidate = garch_model.fit(update_freq=5)
        if candidate.bic < best_bic:
            best_bic = candidate.bic
            opt_model = candidate
    print(opt_model.summary())
    conditional_volatilit = opt_model.conditional_volatility
    np_array_CH = np.array(conditional_volatilit)
    np_array_CH_dates = np.array(conditional_volatilit.index)
    gradients_CH = np.gradient(np_array_CH)

    #https://plot.ly/matplotlib/subplots/ for four
    fig = plt.figure()
    panels = (
        (221, (series,), pair + ' prices during ' + event + ' time period', 'Price'),
        (222, (conditional_volatilit,), 'Conditional Volatility', 'Conditional Volatility'),
        (223, (np_array_dates, gradients), 'Gradients: ' + pair + ' prices during ' + event, 'Gradient'),
        (224, (np_array_CH_dates, gradients_CH), 'Gradients: Conditional Volatility', 'Gradient'),
    )
    for position, plot_args, title, ylabel in panels:
        axis = fig.add_subplot(position)
        axis.plot(*plot_args)
        axis.set_title(title)
        axis.set_xlabel('Time')
        axis.set_ylabel(ylabel)
    fig.savefig("static/anomalies/feature_lection_image2.png")
    plt.close(fig)

    df_CH = _gradient_frame(np_array_CH_dates, gradients_CH, conditional_volatilit, 'CH')
    df_price = _gradient_frame(np_array_dates, gradients, series, 'Price')
    # Join on the time index and keep only rows where every feature is present.
    features = pd.concat([df_price, df_CH], axis=1)
    features = features.dropna(axis=0)
    print(features)
    features.to_csv('static/anomalies/features.csv')
    return "done"
def arima_models(ts_log, p, d, q):
    """Fit an ARIMA(p, d, q) model to *ts_log* and return the fit results.

    Convergence output is turned off through ``disp = -1``.
    """
    return ARIMA(ts_log, order=(p, d, q)).fit(disp=-1)
ax2.set_xlabel('Lag')
ax2.set_ylabel('Partial Autocorrelation')
plt.show()

from statsmodels.tsa.arima_model import ARIMA
import warnings

# Every column except the first and the last is used as an observation.
cols = train.columns[1:-1]
for key in top_pages:
    page_row = top_pages[key]
    data = np.array(train.loc[page_row, cols], 'f')
    result = None
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        # Try the richer order first and fall back to the smaller one.
        for order in ([2, 1, 4], [2, 1, 2]):
            try:
                arima = ARIMA(data, order)
                result = arima.fit(disp=False)
            except Exception:
                continue
            break
    if result is None:
        # Both fits failed: report the page and skip it instead of calling
        # predict() on None.
        print(train.loc[page_row, 'Page'])
        print('\tARIMA failed')
        continue
    #print(result.params)
    pred = result.predict(2, 599, typ='levels')
    x = list(range(600))
    # NOTE(review): `i` is reset as in the original; it may be read by code
    # after this fragment.
    i = 0
    plt.plot(x[2:len(data)], data[2:], label='Data')
    plt.plot(x[2:], pred, label='ARIMA Model')
    plt.title(train.loc[page_row, 'Page'])
for key, value in result2[4].items(): print('\t%s: %.3f' % (key, value)) ## INDPRO first differenced log result3 = adfuller(d_ln_indpro_temp) print('ADF Statistic: %f' % result3[0]) print('p-value: %f' % result3[1]) print('Critical Values:') for key, value in result3[4].items(): print('\t%s: %.3f' % (key, value)) # ARIMA INDPRO ## fit model ARIMA(4,1,0), differencing done ## by ARIMA model = ARIMA(indpro, order=(3, 1, 0)) model_fit = model.fit(disp=0) print(model_fit.summary()) ## fit model ARIMA(4,0,0), differencing done by me ## beforehand. So this is essentially an ARMA(4, 0) ## model on the already differenced data d_indpro = pd.DataFrame(d_indpro_temp) model2 = ARIMA(d_indpro, order=(3, 0, 0)) model_fit2 = model2.fit(disp=0) print(model_fit2.summary()) ### model2 is equivalent to model ### hence, my differenced series is differenced ### in the same way as the ARIMA function differences residuals = pd.DataFrame(model_fit.resid)
X = dataset.transpose()  # transpose the dataset matrix
plot_acf(X, lags=24)
plot_pacf(X, lags=24)
pyplot.show()

# %%
# The first week (24 * 7 samples) is the initial training window; the rest is
# forecast `step` samples at a time, and each forecast window's true values are
# appended to the training data before the next fit.
size = 24 * 7
train, test = X[:size], X[size:len(X)]
forecast = numpy.zeros(len(test))
bound = numpy.zeros((len(test), 2))
step = 4
for t in range(0, len(test), step):
    print(t)
    # The final window is shorter than `step` when len(test) is not a multiple
    # of it; asking for exactly the remaining samples keeps the slice
    # assignments below the same shape on both sides.
    horizon = min(step, len(test) - t)
    model = ARIMA(train, order=(7, 0, 0))
    model_fit = model.fit()
    output = model_fit.forecast(horizon, alpha=.05)
    forecast[t:t + horizon] = output[0]
    # Third element of forecast()'s return value: the two-column confidence
    # interval at the requested alpha.
    bound[t:t + horizon, :] = output[2]
    train = numpy.append(train, test[t:t + horizon])
error = mean_absolute_error(test, forecast)
print('Test MAE: %.3f' % error)

# %% plot
#==============================================================================
pyplot.show()
timeline = numpy.arange(0, len(test))
baseline = numpy.zeros(len(test))
residual = test - forecast
def arima_with_data_transformation():
    """Fit an ARIMA(2, 1, 0) to the log of the module-level series ``ts``.

    Steps: stationarity check on the raw series, log transform, first
    difference and a second stationarity check, ACF/PACF plots side by side,
    model fit, and finally a reconstruction of the fitted series on the
    original scale, plotted against ``ts`` with the RMSE in the title.

    Only produces plots (and whatever ``test_stationarity`` outputs); nothing
    is returned.
    """
    # For Arima func, our data has to be stationary. So we check it.
    test_stationarity(ts)

    # Generate log series so we'll have stationary data
    log_d = np.log(ts)
    # Show the data after the transformation to log
    plt.plot(log_d, color='red')
    plt.show()

    # Generate shifted series (first difference); the leading NaN is dropped
    log_d_diff = log_d - log_d.shift()
    log_d_diff.dropna(inplace=True)
    # Test if now we have stationary data
    test_stationarity(log_d_diff)

    # Autocorrelation and partial autocorrelation of the differenced series
    lag_acf = acf(log_d_diff, nlags=20)
    lag_pacf = pacf(log_d_diff, nlags=20, method='ols')

    # Both panels go into ONE figure: showing it between the two subplot calls
    # would split them into two half-empty figures.
    confidence = 1.96 / np.sqrt(len(log_d_diff))
    panels = (
        (121, lag_acf, 'Autocorrelation Function'),
        (122, lag_pacf, 'Partial Autocorrelation Function'),
    )
    for position, values, title in panels:
        plt.subplot(position)
        plt.plot(values)
        for level in (0, -confidence, confidence):
            plt.axhline(y=level, linestyle='--', color='gray')
        plt.title(title)
    plt.tight_layout()
    plt.show()

    # Run Arima model
    model = ARIMA(log_d, order=(2, 1, 0))
    results_ARIMA = model.fit(disp=-1)
    plt.plot(log_d_diff, color='red')
    plt.plot(results_ARIMA.fittedvalues, color='yellow')
    plt.show()

    #scale it back to the original values
    # The fitted values are per-step changes of the log series: accumulate them
    # (without a sign flip, which would mirror the series) and add the first
    # log observation to get back to log levels.
    predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
    predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
    # .iloc[0] is the positional replacement for the removed .ix indexer.
    predictions_ARIMA_log = pd.Series(log_d.iloc[0], index=log_d.index)
    predictions_ARIMA_log = predictions_ARIMA_log.add(
        predictions_ARIMA_diff_cumsum, fill_value=0)
    predictions_ARIMA = np.exp(predictions_ARIMA_log)

    plt.plot(ts, color='yellow')
    plt.plot(predictions_ARIMA, color='green')
    plt.title('RMSE: %.4f' % np.sqrt(sum((predictions_ARIMA - ts)**2) / len(ts)))
    plt.show()
result = pd.DataFrame(columns=['artist_id', 'plays', 'Ds'])

# fit data
# Iterate over each artist once: `arts` holds many rows per artist, so looping
# over the raw column would refit the same model once per row.
for aid in arts['artist_id'].unique():
    # drop() returns a new frame, avoiding an in-place pop on a filtered slice.
    one = arts[arts.artist_id == aid].drop('artist_id', axis=1)
    ts = pd.Series(data=one['plays'], index=one.index)

    # log
    ts_log = np.log(ts)

    # difference against the value seven steps earlier; leading NaNs dropped
    ts_log_diff = ts_log - ts_log.shift(7)
    ts_log_diff.dropna(inplace=True)

    # arima
    model = ARIMA(ts_log_diff, order=(7, 0, 0))
    results_ARIMA = model.fit(maxiter=1000000)

    # fit
    # NOTE(review): the series was differenced at lag 7 but is rebuilt with a
    # plain cumulative sum from the first log value -- confirm this inversion
    # is what is wanted.
    predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True)
    predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
    # .iloc is the positional replacement for the removed .ix indexer.
    predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
    predictions_ARIMA_log = predictions_ARIMA_log.add(
        predictions_ARIMA_diff_cumsum, fill_value=0)
    predictions_ARIMA = np.exp(predictions_ARIMA_log)

    # predict
    # Out-of-sample window, rebuilt the same way from the last log observation.
    pred = results_ARIMA.predict(start='20150831', end='20151030')
    pred_ARIMA_diff_cumsum = pred.cumsum()
    pred_ARIMA_log = pd.Series(ts_log.iloc[-1], index=pred.index)
    pred_ARIMA_log = pred_ARIMA_log.add(pred_ARIMA_diff_cumsum, fill_value=0)
    pred_ARIMA = np.exp(pred_ARIMA_log)

    # plot
    fig, ax = plt.subplots(nrows=1, ncols=1)
# Python 2 notebook fragment (print statements). It ends inside the
# `elif show == 'ma':` branch, whose body is not part of this chunk.
mpl.rcParams['axes.unicode_minus'] = False

# Log of the passenger counts, cast to float first.
x = data['Passengers'].astype(np.float)
x = np.log(x)
print x.head(10)

show = 'prime'  # 'diff', 'ma', 'prime'
d = 1
diff = x - x.shift(periods=d)      # d-step difference of the log series
ma = x.rolling(window=12).mean()   # 12-sample rolling mean
xma = x - ma                       # log series minus its rolling mean

p = 2
q = 2
model = ARIMA(endog=x, order=(p, d, q))  # AR order p, differencing order d, MA order q
arima = model.fit(disp=-1)  # disp < 0: do not print the fitting progress
prediction = arima.fittedvalues
print type(prediction)
# Accumulate the fitted values and add the first log observation.
y = prediction.cumsum() + x[0]
mse = ((x - y)**2).mean()
rmse = np.sqrt(mse)

plt.figure(facecolor='w')
if show == 'diff':
    # Labels (runtime strings, left untouched): "original data" and
    # "order-d difference"; title: "passenger count curve - log".
    plt.plot(x, 'r-', lw=2, label=u'原始数据')
    plt.plot(diff, 'g-', lw=2, label=u'%d阶差分' % d)
    #plt.plot(prediction, 'r-', lw=2, label=u'预测数据')
    title = u'乘客人数变化曲线 - 取对数'
elif show == 'ma':
    #plt.plot(x, 'r-', lw=2, label=u'原始数据')
    #plt.plot(ma, 'g-', lw=2, label=u'滑动平均数据')
plot_pacf(xt, lags=50, ax=ax_pacf) plt.tight_layout() return None # plotting data plotds(Nifty_data['Close'], nlag=50) #plotting QQ plot and probability plot sm.qqplot(Nifty_data['Close'], line='s') # Optimize ARIMA parameters aicVal = [] for d in range(0, 3): for ari in range(0, 3): for maj in range(0, 3): try: arima_obj1 = ARIMA(train.tolist(), order=(ari, d, maj)) arima_obj1_fit = arima_obj1.fit() aicVal.append([ari, d, maj, arima_obj1_fit.aic]) except: pass print(aicVal) pred = np.append([0, 0], arima_obj1_fit.fittedvalues.tolist()) import sklearn sklearn.metrics.r2_score(arima_obj1_fit, pred)