def forecast_confirm_cases(cls, dataframe, jsondata):
    """Forecast seven days of cumulative confirmed cases and save the plot.

    The values in column ``jsondata["State"]`` of ``dataframe`` are summed
    into a running total. An AR(1) model with a linear time trend is fitted to
    the most recent (at most 15) totals and the next seven values are
    predicted. The forecast is drawn and written to
    ``GRAPH_FOLDER/forecasted_graph.png``.

    Args:
        dataframe: Table with a ``"date"`` column whose last entry is a string
            in ``%d-%b-%y`` format, plus the column named by
            ``jsondata["State"]``.
        jsondata: Mapping whose ``"State"`` entry selects the column to use.

    Returns:
        The file name of the saved graph, ``"forecasted_graph.png"``.
    """
    last_days, horizon = 15, 7

    # Running total of the selected column.
    running_total, cumulative = 0, []
    for value in dataframe[jsondata["State"]]:
        running_total = running_total + value
        cumulative.append(running_total)

    window = cumulative[-last_days:]
    # Forecast from the end of the data that was actually fitted; the window
    # is shorter than ``last_days`` when the table has fewer rows.
    start = len(window)
    end = start + horizon - 1

    model = AutoReg(window, lags=1, trend="t").fit()
    predicted_values = [int(val) for val in model.predict(start, end)]

    last_date = datetime.strptime(list(dataframe["date"])[-1], '%d-%b-%y')
    # The first generated date is the last observed day, so it is skipped.
    date = [stamp.strftime("%d-%b")
            for stamp in pd.date_range(last_date, periods=horizon + 1)[1:]]

    with plt.rc_context({'axes.edgecolor': 'darkkhaki', 'xtick.color': 'darkkhaki',
                         'ytick.color': 'darkkhaki'}):
        fig = model.plot_predict(start, end, alpha=0.05, figsize=FIGSIZE)
        try:
            plt.plot(range(horizon), predicted_values, "ro")
            axes = fig.add_subplot()
            plt.title("Confirm Case Forecasting", color="darkkhaki")
            axes.yaxis.tick_right()
            for index, value in enumerate(predicted_values):
                axes.text(index, value, f"{value}",
                          horizontalalignment='center', color="aqua", fontsize=12)
            plt.xticks(range(len(date)), date, size='small')
            plt.savefig(os.path.join(GRAPH_FOLDER, "forecasted_graph.png"), transparent=True)
        finally:
            # Release the figure so repeated calls do not accumulate open figures.
            plt.close(fig)
    return "forecasted_graph.png"
def _load_monthly_renewables(csv_path):
    """Read a CSV and return the monthly mean of its ``RENEWABLES_PCT`` column."""
    frame = pd.read_csv(csv_path)
    # pd.to_datetime instead of astype('datetime64'): pandas >= 2.0 rejects
    # casts to a unit-less datetime64 dtype.
    frame['TIMESTAMP'] = pd.to_datetime(frame['TIMESTAMP'])
    frame = frame.set_index('TIMESTAMP')
    # Select the column before averaging so non-numeric columns cannot break mean().
    return frame['RENEWABLES_PCT'].resample('1M').mean()


def buildARModel(train_data_path, test_data_path):
    """Fit an autoregressive model on monthly training data and score it.

    Both CSV files are expected to contain ``TIMESTAMP`` and
    ``RENEWABLES_PCT`` columns. Each is resampled to monthly means. The model
    uses ``len(test_series) - 1`` lags and predicts one value per test month,
    starting right after the training data. Both series are plotted on the
    current matplotlib axes (the figure is not shown here).

    Args:
        train_data_path: Path of the training CSV.
        test_data_path: Path of the test CSV.

    Returns:
        The root-mean-squared error of the predictions against the test series.
    """
    print("\nBuilding Auto-Regression Model ...")
    train_series = _load_monthly_renewables(train_data_path)
    test_series = _load_monthly_renewables(test_data_path)

    model_fit = AutoReg(train_series, lags=len(test_series) - 1).fit()
    print('Coefficients: %s' % model_fit.params)

    first = len(train_series)
    predictions = model_fit.predict(start=first,
                                    end=first + len(test_series) - 1,
                                    dynamic=False)
    rmse = sqrt(metrics.mean_squared_error(test_series, predictions))

    plt.plot(test_series)
    plt.plot(predictions, color='red')
    print('ARModel RMSE: %.2f' % rmse)
    return rmse
def transform(self, seriess, debug):
    """Fit an autoregressive model to the first input series.

    Lags and the optional seasonal period come from ``self.get_params()`` and
    are converted with ``lags_range_timedelta_to_period`` and
    ``timedelta_to_period`` using the series step. A seasonal component is
    included only when a period is configured.

    Args:
        seriess: Sequence of series objects; only the first one is used.
        debug: When truthy, the model summary text is returned as debug info.

    Returns:
        A ``(result, debug_info)`` tuple. ``result`` is the fitted values when
        the configured output is ``'predicted'`` and the residuals when it is
        ``'resid'``. ``debug_info`` is a dict, empty unless ``debug`` is set.

    Raises:
        ValueError: If the configured output is neither ``'predicted'`` nor
            ``'resid'``.
    """
    first_series = seriess[0]
    endog = first_series.pdseries
    lags, period, output = self.get_params()

    model_lags = lags_range_timedelta_to_period(lags, first_series.step())
    if period is None:
        model_period = None
    else:
        model_period = timedelta_to_period(period, first_series.step())

    fitted = AutoReg(endog,
                     lags=model_lags,
                     seasonal=model_period is not None,
                     period=model_period,
                     old_names=False).fit()

    debug_info = {"summary": str(fitted.summary())} if debug else {}

    if output == 'predicted':
        return (fitted.fittedvalues, debug_info)
    if output == 'resid':
        return (fitted.resid, debug_info)
    raise ValueError('Invalid output: ' + output)
def ar1model(data, lag=1):
    """Fit an autoregressive model to ``data`` and return the fitted results.

    With the default ``lag=1`` this is an AR(1) model. The model is built with
    ``old_names=True``.

    Args:
        data: The series to model.
        lag: Value passed to ``AutoReg`` as ``lags``.

    Returns:
        The object returned by ``AutoReg(...).fit()``.
    """
    return AutoReg(data, lags=lag, old_names=True).fit()
def AR(train_dt=train, test_dt=test, lag=1):
    """Fit an autoregressive model and predict over the train and test ranges.

    Args:
        train_dt: Training series; defaults to the module-level ``train``.
        test_dt: Test series; defaults to the module-level ``test``. Only its
            length is used.
        lag: Value passed to ``AutoReg`` as ``lags``.

    Returns:
        A tuple ``(predictions_train, predictions_test)``. The first covers
        positions ``1`` to ``len(train_dt)`` inclusive. The second covers the
        ``len(test_dt)`` positions starting at ``len(train_dt)``.
    """
    n_train = len(train_dt)
    fitted = AutoReg(train_dt, lags=lag).fit()
    in_sample = fitted.predict(start=1, end=n_train, dynamic=False)
    out_of_sample = fitted.predict(start=n_train,
                                   end=n_train + len(test_dt) - 1,
                                   dynamic=False)
    return in_sample, out_of_sample
def experiment_type(series, lag, n_repeats, folds):
    """Compare AutoReg trend types by average RMSE over fold combinations.

    ``series`` is split into ``folds`` consecutive chunks. For each trend in
    ``['c', 'ct', 't']``, each test chunk ``k >= 2`` and each start chunk
    ``j < k``, a model is fitted on chunks ``j..k-1`` and scored on chunk
    ``k``, using the external ``pred`` helper to produce the predictions.

    Args:
        series: The data to split.
        lag: Value passed to ``AutoReg`` as ``lags``; training sets that are
            not longer than this are skipped.
        n_repeats: Number of times the whole evaluation is repeated.
        folds: Number of chunks to split ``series`` into.

    Returns:
        A list with one average RMSE per trend, in the order
        ``['c', 'ct', 't']``.
    """
    # Keep the chunks in a list: np.array() on unequal-length chunks fails on
    # recent NumPy versions.
    data = [np.asarray(chunk) for chunk in np.array_split(series, folds)]
    trends = ['c', 'ct', 't']
    type_error = []
    for t in trends:
        errors = []
        for _ in range(n_repeats):
            for k in range(2, folds):
                test = data[k]
                for j in range(k):
                    train = np.concatenate(data[j:k]).ravel()
                    # The original compared against an undefined name ``i``;
                    # the lag order is the quantity the training set must exceed.
                    if len(train) <= lag:
                        print("Dataset too small for %f folds" % folds)
                        continue
                    model = AutoReg(train, lags=lag, trend=t)
                    try:
                        model_fit = model.fit()
                    except Exception:
                        # Skip this combination instead of scoring with a
                        # stale or undefined fit.
                        print("Couldnt fit model!")
                        continue
                    predictions = pred(train, test, lag, model_fit.params)
                    errors.append(math.sqrt(mean_squared_error(test, predictions)))
        type_error.append(np.average(errors))
    return type_error
def test_autoreg_info_criterion(lag):
    """Check AutoReg information criteria against stored reference results.

    AIC, HQIC and BIC are rescaled before comparison (see issue #324 for the
    corrections vs. R). A second fit using ``hold_back=16`` on the full sample
    must give the same criteria as the fit on the truncated sample.
    """
    endog = sm.datasets.sunspots.load(as_pandas=False).endog
    truncated = endog[16 - lag:]
    r = AutoReg(truncated, lags=lag, old_names=False).fit()

    # See issue #324 for the corrections vs. R
    k_ar = len(r.model.ar_lags)
    k_trend = 1
    log_sigma2 = np.log(r.sigma2)

    def corrected(criterion):
        # aic correction to match R
        scaled = (criterion - log_sigma2) * (1 + k_ar) / (1 + k_ar + k_trend)
        return scaled + log_sigma2

    res1 = np.array([corrected(r.aic), corrected(r.hqic), corrected(r.bic), r.fpe])
    res2 = results_ar.ARLagResults("const").ic.T
    assert_almost_equal(res1, res2[lag - 1, :], DECIMAL_6)

    r2 = AutoReg(endog, lags=lag, hold_back=16, old_names=False).fit()
    for name in ("aic", "bic", "hqic", "fpe"):
        assert_allclose(getattr(r, name), getattr(r2, name))
def predict_country_ar(country, ndays, dataset):
    """Project ``daily_cases_change_av`` for one country ``ndays`` ahead.

    An AR(1) model from statsmodels is refitted before every step: each
    one-step prediction is appended to the data, and the extended data is used
    to predict the following day.

    Args:
        country: Value matched against the ``country`` column of ``dataset``.
        ndays: Number of days to project.
        dataset: Table with ``country`` and ``daily_cases_change_av`` columns.

    Returns:
        The selected history with an extra ``prediction`` column. The
        projected rows are indexed by the days following the last index entry
        of the history.
    """
    selected = dataset[dataset['country'] == country]
    ts_data = pd.DataFrame(selected['daily_cases_change_av'])

    # Grow the data one predicted day at a time.
    batch = ts_data.values
    for __ in range(ndays):
        fitted = AutoReg(batch, lags=1).fit()
        next_value = fitted.predict(len(batch), len(batch))
        batch = np.append(batch, next_value)
    preds = batch[-ndays:]

    last_seen = ts_data.index[-1]
    add_dates = [last_seen + DateOffset(days=offset) for offset in range(ndays + 1)]
    # Offset 0 is the last observed day, so it is dropped.
    future_index = pd.Index(add_dates[1:])[-ndays:]
    df_predict = pd.DataFrame(preds, index=future_index, columns=['prediction'])
    return pd.concat([ts_data, df_predict], axis=1)
def test_dynamic_predictions_oos(ar2):
    """Check dynamic predictions that run past the end of the sample.

    Integer and index-label specifications of ``start``/``dynamic``/``end``
    must agree, and the result must match a hand-rolled AR(2) recursion that
    switches to its own forecasts after position 25.
    """
    res = AutoReg(ar2, 2, trend="c").fit()
    d25_end = res.predict(dynamic=25, end=61)
    s10_d15_end = res.predict(start=10, dynamic=15, end=61)

    spacing = ar2.index[-1] - ar2.index[-2]
    end = ar2.index[-1] + 12 * spacing
    sd_index_end = res.predict(start=ar2.index[10], dynamic=ar2.index[25], end=end)
    assert_allclose(s10_d15_end, sd_index_end)
    assert_allclose(d25_end[25:], sd_index_end[15:])

    # Rebuild the forecast by hand.
    const, coef1, coef2 = np.asarray(res.params)
    n_obs = ar2.shape[0]
    reference = [np.nan, np.nan]
    for i in range(2, d25_end.shape[0]):
        if i < n_obs:
            lag1 = ar2[i - 1]
            lag2 = ar2[i - 2]
        # Past the dynamic start the recursion feeds on its own output.
        if i > 25:
            lag1 = reference[i - 1]
        if i > 26:
            lag2 = reference[i - 2]
        reference.append(const + coef1 * lag1 + coef2 * lag2)

    expected = pd.Series(reference, index=d25_end.index)
    assert_allclose(expected, d25_end)
def forecast(handler, message):
    """Forecast next month's purchase value across currencies.

    A US manufacturer buys raw materials in multiple currencies. For each
    currency listed in ``Purchases.xlsx`` the price history is read from
    ``<currency>.xlsx``, AutoReg models with several lag orders are fitted and
    the one with the lowest AIC is kept. Progress and the final summary are
    sent through ``handler.write_message``.

    Args:
        handler: Object providing ``write_message``.
        message: Unused.
    """
    purchases = pd.read_excel('Purchases.xlsx')

    # For each of those currencies, find the best model to forecast prices
    best_model = {}
    for currency in purchases.currency:
        handler.write_message('Currency %s' % currency)
        history = pd.read_excel(f'{currency}.xlsx')
        history = history[history[currency] > 0]
        lowest_aic, chosen = inf, None
        for lags in (3, 5, 7, 10):
            handler.write_message('Lag %s' % lags)
            candidate = AutoReg(history[currency], lags=lags).fit()
            if candidate.aic < lowest_aic:
                lowest_aic, chosen = candidate.aic, candidate
        best_model[currency] = chosen

    # Estimate next month's price increase assuming the same volume as today
    forecasted_value = 0
    for _, row in purchases.iterrows():
        chosen = best_model[row.currency]
        prices = chosen.predict(chosen.model.nobs, chosen.model.nobs + 30)
        change = prices.iloc[-1] / prices.iloc[0]
        forecasted_value += row.value * change

    summary = 'Sales value will move from {:,.0f} to {:,.0f}'.format(
        purchases.value.sum(), forecasted_value)
    handler.write_message(summary)
def extrapolate_moments(mus, fac):
    """Extrapolate moments with an autoregressive model.

    The real part of all ``len(mus)`` moments is used to fit an AutoReg model
    with constant and linear trend. The model then predicts the additional
    moments needed to reach a total of ``int(fac * len(mus))``.

    Args:
        mus: Array of moments (may be complex; only the real part is fitted).
        fac: Factor by which the number of moments is extended.

    Returns:
        A complex array of length ``int(fac * len(mus))``: the input moments
        followed by the predicted ones. When that length does not exceed
        ``len(mus)`` there is nothing to predict and the leading input moments
        are returned as a complex copy.
    """
    L = len(mus)  # number of known moments, all used for training
    P = int(fac * L)  # total number of moments after extrapolation
    if P <= L:
        # Nothing to extrapolate; avoids an empty/invalid prediction range.
        return np.array(mus[:max(P, 0)], dtype=complex)

    train = mus[0:L].real  # train data
    # Lag order grows with the fourth root of the sample size.
    lags = round(12 * (len(train) / 100.)**(1 / 4.))
    model = AutoReg(train, lags=lags, trend="ct").fit(cov_type="HC1")
    pred = model.predict(start=L, end=P - 1)  # prediction

    # The builtin ``complex`` replaces ``np.complex``, removed in NumPy 1.24.
    mus2 = np.zeros(P, dtype=complex)
    mus2[0:L] = mus[0:L]  # initial data
    mus2[L:P] = pred[:]  # predicted data
    return mus2
def experiment_length(series, lag, n_repeats, folds):
    """Measure average RMSE as a function of training length.

    ``series`` is split into ``folds`` consecutive chunks. For each training
    length ``j`` (in chunks) and each test chunk ``k >= j``, a model with a
    constant trend is fitted on the ``j`` chunks preceding ``k`` and scored on
    chunk ``k``, using the external ``pred`` helper to produce the predictions.

    Args:
        series: The data to split.
        lag: Value passed to ``AutoReg`` as ``lags``.
        n_repeats: Number of times each evaluation is repeated.
        folds: Number of chunks to split ``series`` into.

    Returns:
        A list with one average RMSE per training length ``j = 1..folds-1``.
    """
    # Keep the chunks in a list: np.array() on unequal-length chunks fails on
    # recent NumPy versions.
    data = [np.asarray(chunk) for chunk in np.array_split(series, folds)]
    trends = ['c']
    length_error = []
    for j in range(1, folds):
        error = []
        for _ in range(n_repeats):
            for t in trends:
                for k in range(j, folds):
                    test = data[k]
                    # Concatenating a single chunk yields that chunk, so no
                    # special case is needed for j == 1.
                    train = np.concatenate(data[k - j:k]).ravel()
                    # fit autoregressive model
                    model = AutoReg(train, lags=lag, trend=t)
                    try:
                        model_fit = model.fit()
                    except Exception:
                        # Skip this combination instead of scoring with a
                        # stale or undefined fit.
                        print("Couldnt fit model!")
                        continue
                    predictions = pred(train, test, lag, model_fit.params)
                    print(len(train))
                    rmse = math.sqrt(mean_squared_error(test, predictions))
                    error.append(rmse)
        print("training length", len(train))
        length_error.append(np.average(error))
    return length_error
def fit(self, ar_order=1, ar_trend='n', pdeg=1, nlags=30):
    """Fit the polynomial trend and the AR model of the residuals.

    Sets ``ar_order``, ``ar_trend``, ``model_params``, ``trend``,
    ``residuals``, ``pacf``, ``alpha`` and ``sd`` on the instance.

    Args:
        ar_order: Lag order of the AR model; also the index into the partial
            autocorrelation function used for ``alpha``.
        ar_trend: Trend specification passed to ``AutoReg``.
        pdeg: Degree of the polynomial trend (1 = linear, 2 = quadratic, ...).
        nlags: Number of lags for the partial autocorrelation function.

    Returns:
        ``self``, to allow call chaining.

    Note:
        ``sd`` is taken from the fitted model's ``sigma2`` instead of being
        parsed out of the HTML summary. It is therefore no longer rounded to
        the three decimals shown in the summary table.
    """
    # TODO: is this appropriate, or should I only 'self parameters'
    # inside __init__?
    self.ar_order = ar_order
    self.ar_trend = ar_trend

    # np.polyfit returns polynomial coefficients with the highest power first
    self.model_params = np.polyfit(x=self.x, y=self.y, deg=pdeg).flatten()
    # calculate trend: linear for pdeg=1, quadratic for pdeg=2, etc...
    self.trend = np.polyval(p=self.model_params, x=self.x)
    self.residuals = self.y - self.trend

    # AR
    self.pacf = pacf(self.residuals, nlags=nlags)
    self.alpha = self.pacf[self.ar_order]

    # TODO: calculate sd based on the formula above to circumvent using the
    # AutoReg class at all (!)
    ar_fit = AutoReg(endog=self.residuals, lags=self.ar_order,
                     trend=self.ar_trend).fit(cov_type="HC0")  # robust SE
    # Standard deviation of the innovations, read straight from the results:
    # no HTML round-trip through pd.read_html and no HTML parser dependency.
    self.sd = float(np.sqrt(ar_fit.sigma2))

    # reshape to column vector of shape (n, 1)
    self.trend = np.reshape(self.trend, (-1, 1))
    return self
def test_autoreg_no_variables(ar2):
    """A model without lags or trend must still produce a text summary."""
    res = AutoReg(ar2[:10], None, trend="n").fit()
    summ_txt = res.summary().as_text()
    for expected in ("AutoReg(0)", "No Model Parameters"):
        assert expected in summ_txt
def train_AR(self):
    """Train the autoregression model on the ``"Deaths"`` column.

    The number of lags is one third of the number of data points (rounded
    down). The fitted results are stored in ``self._model``.

    Raises:
        ValueError: If ``self._data`` holds fewer than three points.
    """
    n_points = len(self._data)
    if n_points < 3:
        # ValueError is still caught by callers handling a plain Exception.
        raise ValueError('Please provide data with more points')
    self._model = AutoReg(self._data["Deaths"], lags=n_points // 3).fit()
def unitroot_test(series):
    """Plot ``series`` and print lag-selection and unit-root test results.

    Prints an AIC/BIC table (LaTeX row format) for AR models with 12 down to
    1 lags and no trend. The largest lag chosen by ``ar_select_order`` is then
    used for an ADF test. Phillips-Perron tau and rho tests with 3 lags are
    printed last.

    Args:
        series: The time series to analyse.
    """
    # Basic statistic
    plt.figure()
    plt.plot(series)
    plot_pacf(series)

    # AIC & BIC from lags 12 to 1
    print('$p$ & AIC & BIC \\\\')
    max_lags = 12
    for lags in range(max_lags, 0, -1):
        res = AutoReg(series, lags, 'n').fit()
        print(f'{lags} & {round(res.aic, 3)} & {round(res.bic, 3)} \\\\')

    # Best lags by `ar_select_order`
    sel = ar_select_order(series, max_lags, trend='n')
    lags = sel.ar_lags[-1]
    print(f'Lags selection: {sel.ar_lags}')

    # ADF test
    print(ADF(series, lags).summary())

    # PP test, q = 3
    pp_tau = PhillipsPerron(series, 3, test_type='tau')
    pp_rho = PhillipsPerron(series, 3, test_type='rho')
    for pp_result in (pp_tau, pp_rho):
        print(pp_result.summary())
def forecast(handler, message):
    """Forecast purchase values per currency and stream progress messages.

    A US manufacturer buys raw materials in multiple currencies. Purchases
    from ``Purchases.xlsx`` are processed in descending order of value. For
    each row the price history in ``<currency>.xlsx`` is modelled with AutoReg
    at several lag orders, the lowest-AIC fit is kept, and the forecast is the
    row value scaled by the predicted price change. After every row a dict
    with the row data, progress and running totals is sent through
    ``handler.write_message``.

    Args:
        handler: Object providing ``write_message``.
        message: Unused.
    """
    purchases = pd.read_excel('Purchases.xlsx')
    purchases.sort_values('value', ascending=False, inplace=True)
    purchases['forecast'] = purchases['value']

    # For each of those currencies, find the best model to forecast prices
    for done, (index, row) in enumerate(purchases.iterrows(), start=1):
        currency = row.currency
        history = pd.read_excel(f'{currency}.xlsx')
        history = history[history[currency] > 0]

        lowest_aic, chosen = inf, None
        for lags in (3, 5, 7, 10, 14):
            candidate = AutoReg(history[currency], lags=lags).fit()
            if candidate.aic < lowest_aic:
                lowest_aic, chosen = candidate.aic, candidate

        nobs = chosen.model.nobs
        prices = chosen.predict(nobs, nobs + 30)
        change = prices.iloc[-1] / prices.iloc[0]

        # Update both the row copy (sent below) and the table (used for totals).
        projected = row.value * change
        row.loc['forecast'] = projected
        purchases.loc[index, 'forecast'] = projected

        payload = row.to_dict()
        payload.update(
            progress=done / len(purchases),
            total_value=float(purchases.value.sum()),
            total_forecast=float(purchases.forecast.sum()),
        )
        handler.write_message(payload)
def test_parameterless_autoreg():
    """Every public result attribute must work on a model with no parameters.

    Methods that need arguments are skipped. ``test_serial_correlation`` is
    expected to emit a ``FutureWarning``. For all other callables, warnings
    are recorded but not asserted on.
    """
    import warnings

    data = gen_data(250, 0, False)
    mod = AutoReg(data.endog, 0, trend='n', seasonal=False, exog=None,
                  old_names=False)
    res = mod.fit()

    # TODO: these need arguments and are not exercised here
    skipped = {'predict', 'f_test', 't_test', 'initialize', 'load',
               'remove_data', 'save', 't_test_pairwise', 'wald_test',
               'wald_test_terms'}

    for name in dir(res):
        if name.startswith('_') or name in skipped:
            continue
        value = getattr(res, name)
        if not callable(value):
            assert isinstance(value, object)
            continue
        if name == "test_serial_correlation":
            with pytest.warns(FutureWarning):
                value()
        else:
            # pytest.warns(None) is rejected by current pytest; record
            # warnings without asserting on them, as it used to do.
            with warnings.catch_warnings(record=True):
                warnings.simplefilter("always")
                value()
def test_invalid_dynamic(ar2):
    """Invalid ``dynamic`` values must be rejected with a ValueError."""
    res = AutoReg(ar2, 2, trend="c").fit()
    invalid_calls = (
        {"dynamic": -1},
        {"start": ar2.index[10], "dynamic": ar2.index[5]},
    )
    for kwargs in invalid_calls:
        with pytest.raises(ValueError, match="Dynamic prediction cannot"):
            res.predict(**kwargs)
def test_dynamic_predictions(ar2):
    """Check in-sample dynamic predictions against a hand-rolled recursion.

    Integer and index-label specifications of ``start``/``dynamic`` must
    agree, and before the dynamic start the result must equal the ordinary
    in-sample prediction.
    """
    res = AutoReg(ar2, 2, trend="c").fit()
    d25 = res.predict(dynamic=25)
    s10_d15 = res.predict(start=10, dynamic=15)
    sd_index = res.predict(start=ar2.index[10], dynamic=ar2.index[25])

    # Rebuild the forecast by hand; past the dynamic start the recursion
    # feeds on its own output instead of the observed data.
    const, coef1, coef2 = np.asarray(res.params)
    reference = [np.nan, np.nan]
    for i in range(2, ar2.shape[0]):
        lag1 = reference[i - 1] if i > 25 else ar2[i - 1]
        lag2 = reference[i - 2] if i > 26 else ar2[i - 2]
        reference.append(const + coef1 * lag1 + coef2 * lag2)

    expected = pd.Series(reference, index=ar2.index)
    assert_allclose(expected, d25)
    assert_allclose(s10_d15, sd_index)
    assert_allclose(d25[25:], sd_index[15:])

    full = res.predict()
    assert_allclose(d25[:25], full[:25])
def autoregression(data=None, col_name=None, test_size=7, lags=29):
    """Fit an AutoReg model on one column and report its hold-out RMSE.

    Based on
    https://machinelearningmastery.com/autoregression-models-time-series-forecasting-python/

    The last ``test_size`` values of ``data[col_name]`` are held out. The
    model is fitted on the remaining values, excluding the very first one.
    Coefficients, each prediction with its expected value, and the test RMSE
    are printed. Expected and predicted values are then plotted and shown.

    Args:
        data: Table (or mapping) from which ``col_name`` is read.
        col_name: Name of the column holding the series.
        test_size: Number of trailing observations to hold out (must be
            positive). Defaults to the previously hard-coded 7.
        lags: Lag order of the model. Defaults to the previously hard-coded 29.
    """
    from math import sqrt

    from matplotlib import pyplot
    from sklearn.metrics import mean_squared_error
    from statsmodels.tsa.ar_model import AutoReg

    # split dataset
    X = data[col_name].values
    split = len(X) - test_size
    train, test = X[1:split], X[split:]

    # train autoregression
    model_fit = AutoReg(train, lags=lags).fit()
    print('Coefficients: %s' % model_fit.params)

    # make predictions
    predictions = model_fit.predict(start=len(train),
                                    end=len(train) + len(test) - 1,
                                    dynamic=False)
    for predicted, expected in zip(predictions, test):
        print('predicted=%f, expected=%f' % (predicted, expected))
    rmse = sqrt(mean_squared_error(test, predictions))
    print('Test RMSE: %.3f' % rmse)

    # plot results
    pyplot.plot(test)
    pyplot.plot(predictions, color='red')
    pyplot.show()
def test_predict_irregular_ar():
    """Check predictions of a model using the irregular lag set [1, 3].

    A series is simulated from an AR process with lags 1 and 3. A model with
    constant and time trend is fitted, and both dynamic and static predictions
    from position 900 are compared with values computed directly from the
    estimated parameters.
    """
    n_obs = 1001
    rs = np.random.RandomState(12345678)
    e = rs.standard_normal(n_obs)
    y = np.empty(n_obs)
    y[:3] = e[:3] * np.sqrt(1.0 / (1 - 0.9**2))
    for i in range(3, n_obs):
        y[i] = 10 + 0.9 * y[i - 1] - 0.5 * y[i - 3] + e[i]
    ys = pd.Series(y, index=pd.date_range("1-1-1950", periods=n_obs, freq="M"))

    res = AutoReg(ys, [1, 3], trend="ct", old_names=False).fit()
    c = res.params.iloc[0]
    t = res.params.iloc[1]
    ar = np.asarray(res.params.iloc[2:])

    # Dynamic prediction: lags come from earlier forecasts once available,
    # otherwise from the observed data just before position 900.
    pred = res.predict(900, 1100, True)
    direct = np.zeros(201)
    for i in range(201):
        lag1 = direct[i - 1] if i >= 1 else y[899 + i]
        lag3 = direct[i - 3] if i >= 3 else y[897 + i]
        direct[i] = c + t * (901 + i) + ar[0] * lag1 + ar[1] * lag3
    direct = pd.Series(direct,
                       index=pd.date_range(ys.index[900], periods=201, freq="M"))
    assert_series_equal(pred, direct)

    # Static prediction: lags always come from the observed data.
    pred = res.predict(900)
    direct = (c + t * np.arange(901, 901 + 101) + ar[0] * y[899:-1] +
              ar[1] * y[897:-3])
    idx = pd.date_range(ys.index[900], periods=101, freq="M")
    direct = pd.Series(direct, index=idx)
    assert_series_equal(pred, direct)
def test_autoreg_start(start):
    """The prediction must contain one value per position in [start, end]."""
    y_train = pd.Series(np.random.normal(size=20))
    fitted = AutoReg(y_train, lags=2, old_names=False).fit()
    end = start + 5
    pred = fitted.predict(start=start, end=end)
    expected_length = end - start + 1
    assert pred.shape[0] == expected_length
def test_predict_seasonal():
    """Check predictions of an AR(1) model with seasonal dummies.

    A series with a 12-period seasonal effect is simulated. Both dynamic and
    static predictions from position 900 are compared with values computed
    directly from the estimated parameters.
    """
    n_obs = 1001
    rs = np.random.RandomState(12345678)
    e = rs.standard_normal(n_obs)
    y = np.empty(n_obs)
    y[0] = e[0] * np.sqrt(1.0 / (1 - 0.9**2))
    effects = 10 * np.cos(np.arange(12) / 11 * 2 * np.pi)
    for i in range(1, n_obs):
        y[i] = 10 + 0.9 * y[i - 1] + e[i] + effects[i % 12]
    ys = pd.Series(y, index=pd.date_range("1-1-1950", periods=n_obs, freq="M"))

    res = AutoReg(ys, 1, seasonal=True, old_names=False).fit()
    c = res.params.iloc[0]
    # The first season has no dummy of its own, hence the leading zero.
    seasons = np.zeros(12)
    seasons[1:] = res.params.iloc[1:-1]
    ar = res.params.iloc[-1]

    # Dynamic prediction: each step feeds on the previous forecast.
    pred = res.predict(900, 1100, True)
    direct = np.zeros(201)
    previous = y[899]
    for i in range(201):
        direct[i] = previous * ar + c + seasons[(900 + i) % 12]
        previous = direct[i]
    direct = pd.Series(direct,
                       index=pd.date_range(ys.index[900], periods=201, freq="M"))
    assert_series_equal(pred, direct)

    # Static prediction: each step uses the observed previous value.
    pred = res.predict(900, dynamic=False)
    direct = y[899:-1] * ar + c + seasons[np.arange(900, 1001) % 12]
    direct = pd.Series(direct,
                       index=pd.date_range(ys.index[900], periods=101, freq="M"))
    assert_series_equal(pred, direct)
def test_parameterless_autoreg():
    """Every public result attribute must work on a model with no parameters.

    Methods that need arguments are skipped. ``test_serial_correlation`` is
    expected to emit a ``FutureWarning``. For all other callables, warnings
    are recorded but not asserted on.
    """
    import warnings

    data = gen_data(250, 0, False)
    mod = AutoReg(data.endog, 0, trend="n", seasonal=False, exog=None)
    res = mod.fit()

    # TODO: these need arguments and are not exercised here
    skipped = {
        "predict",
        "f_test",
        "t_test",
        "initialize",
        "load",
        "remove_data",
        "save",
        "t_test_pairwise",
        "wald_test",
        "wald_test_terms",
    }

    for name in dir(res):
        if name.startswith("_") or name in skipped:
            continue
        value = getattr(res, name)
        if not callable(value):
            assert isinstance(value, object)
            continue
        if name == "test_serial_correlation":
            with pytest.warns(FutureWarning):
                value()
        else:
            # pytest.warns(None) is rejected by current pytest; record
            # warnings without asserting on them, as it used to do.
            with warnings.catch_warnings(record=True):
                warnings.simplefilter("always")
                value()
def fit(self, x, horizon):
    """Fit an AR model of order ``self.d`` and forecast ``horizon`` steps.

    Args:
        x: The observed series.
        horizon: Number of steps to forecast past the end of ``x``.

    Returns:
        The predicted mean for the ``horizon`` positions following ``x``.
    """
    # NOTE(review): ``AR`` is presumably an alias of statsmodels' AutoReg — confirm.
    n_obs = len(x)
    results = AR(x, lags=self.d, old_names=False).fit()
    prediction = results.get_prediction(n_obs, n_obs + horizon - 1)
    return prediction.predicted_mean
def test_spec_errors():
    """Invalid ``lags`` specifications must raise ValueError on construction."""
    data = gen_data(250, 2, True)
    invalid_specs = (
        (-1, 'lags must be a positive scalar'),
        ([1, 1, 1], 'All values in lags must be pos'),
        ([1, -2, 3], 'All values in lags must be pos'),
    )
    for lags, message in invalid_specs:
        with pytest.raises(ValueError, match=message):
            AutoReg(data.endog, lags, old_names=False)
def test_autoreg_score():
    """Score and information of an AR(3) model must have the expected shapes."""
    endog = np.asarray(sm.datasets.sunspots.load_pandas().endog)
    ar = AutoReg(endog, 3, old_names=False)
    params = ar.fit().params

    score = ar.score(params)
    assert isinstance(score, np.ndarray)
    assert score.shape == (4, )

    information = ar.information(params)
    assert information.shape == (4, 4)
def test_spec_errors():
    """Invalid ``lags`` specifications must raise ValueError on construction."""
    data = gen_data(250, 2, True)
    invalid_specs = (
        (-1, "lags must be a positive scalar"),
        ([1, 1, 1], "All values in lags must be pos"),
        ([1, -2, 3], "All values in lags must be pos"),
    )
    for lags, message in invalid_specs:
        with pytest.raises(ValueError, match=message):
            AutoReg(data.endog, lags)
def ols_autoreg_result(request):
    """Fit matching AutoReg and OLS models for the requested configuration.

    ``request.param`` supplies ``(ar, seasonal, trend, exog, cov_type)``. The
    regressors are built by ``gen_ols_regressors`` and both models are fitted
    with the same covariance type.

    Returns:
        A tuple ``(ar_res, ols_res)`` of fitted results.
    """
    ar, seasonal, trend, exog_spec, cov_type = request.param
    y, x, ols_endog, ols_exog = gen_ols_regressors(ar, seasonal, trend, exog_spec)

    ar_res = AutoReg(y, ar, seasonal=seasonal, trend=trend,
                     exog=x).fit(cov_type=cov_type)
    ols_res = OLS(ols_endog, ols_exog).fit(cov_type=cov_type, use_t=False)
    return ar_res, ols_res