import math

import numpy as np
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa import ar_model


def grid_search_best_model_timeseries_ar(df, grid, cv):
    best_param = None
    best_score = np.inf
    tsp = TimeSeriesSplit(n_splits=cv)
    for param in grid.get('lags'):
        scores = []
        for train_ind, test_ind in tsp.split(df):
            train_data = df.iloc[train_ind]
            test_data = df.iloc[test_ind]
            try:
                estimator = ar_model.AutoReg(train_data, lags=param)
                res = estimator.fit()
                # out-of-sample predictions over the test window
                pred = estimator.predict(res.params, test_data.index[0], test_data.index[-1])
                y_pred = pred.values.reshape(-1)
                y_test = test_data.values.reshape(-1)
                score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
                scores.append(score)
            except Exception:
                # skip folds that are too short for the requested lag order
                continue
        if len(scores) > 0 and np.mean(scores) < best_score:
            best_score = np.mean(scores)
            best_param = param
    if best_param is not None:
        # refit on the full series with the winning lag order
        estimator = ar_model.AutoReg(df, lags=best_param)
        res = estimator.fit()
        print("best parameters: " + str(best_param))
        print("validation rmse: " + str(best_score))
        # in-sample predictions; start at the lag order so the prediction
        # vector lines up with the truncated targets below
        predictions = estimator.predict(res.params, start=best_param, end=df.shape[0] - 1)
        y_pred = predictions.values.reshape(-1)
        y_train = df.values.reshape(-1)[best_param:]
        train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
        print("train rmse: " + str(train_rmse))
        return estimator, res
    else:
        return None, None
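# A minimal usage sketch for the grid search above; the CSV path, index
# frequency, and lag grid are hypothetical stand-ins.
import pandas as pd

df = pd.read_csv('my_series.csv', index_col=0, parse_dates=True)  # univariate series
df.index.freq = 'MS'  # monthly-start frequency, assumed for this example

grid = {'lags': [1, 2, 3, 6, 12]}  # candidate lag orders to cross-validate
estimator, res = grid_search_best_model_timeseries_ar(df, grid, cv=5)
if res is not None:
    # forecast 12 steps past the end of the sample
    forecast = estimator.predict(res.params, start=df.shape[0], end=df.shape[0] + 11)
    print(forecast)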
# read our petrol data in
from math import sqrt

import numpy as np
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from statsmodels.tsa import ar_model


def difference(dataset):
    # first-order differencing to remove trend (helper this snippet assumes)
    return np.array([dataset[i] - dataset[i - 1] for i in range(1, len(dataset))])


def predict(coef, history):
    # manual AR step: intercept plus lag coefficients times recent history
    # (helper this snippet assumes)
    yhat = coef[0]
    for i in range(1, len(coef)):
        yhat += coef[i] * history[-i]
    return yhat


series = read_csv('petrol_prices.csv', header=0, index_col=0, parse_dates=True).squeeze('columns')
# split dataset
X = difference(series.values)
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:]
# train autoregression
model = ar_model.AutoReg(train, lags=6)
model_fit = model.fit()
coef = model_fit.params
# walk forward over time steps in test
history = [train[i] for i in range(len(train))]
predictions = list()
for t in range(len(test)):
    yhat = predict(coef, history)
    obs = test[t]
    predictions.append(yhat)
    history.append(obs)
rmse = sqrt(mean_squared_error(test, predictions))  # root mean squared error
print('Test RMSE: %.3f' % rmse)
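# As an alternative sketch, the same walk-forward evaluation can lean on
# statsmodels' own API (statsmodels >= 0.12) instead of the manual
# coefficient loop: forecast one step, then append the observed value
# without refitting.
res = ar_model.AutoReg(train, lags=6).fit()
preds = []
for t in range(len(test)):
    preds.append(res.forecast(1)[0])           # one-step-ahead forecast
    res = res.append([test[t]], refit=False)   # extend history, keep coefficients
print('Test RMSE: %.3f' % sqrt(mean_squared_error(test, preds)))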
import datetime
import re

import numpy as np
import pandas
from statsmodels.tsa import ar_model

# `prepped_dataf` is assumed to be prepared earlier, with log-transformed
# and log-differenced case columns
prepped_dataf['New Cases'] = prepped_dataf['Log Number Cases Delta']
prepped_dataf = prepped_dataf[prepped_dataf['New Cases'].notna()]
prepped_dataf = prepped_dataf.replace([np.inf, -np.inf], np.nan)
prepped_dataf = prepped_dataf[prepped_dataf['New Cases'].notna()]
logged_vals = prepped_dataf['Log Number Cases']
# drop every remaining 'Log ...' column so only the target series is left
for col in list(prepped_dataf.keys()):
    if re.match(r'Log.*', col):
        prepped_dataf = prepped_dataf.drop(col, axis=1)
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    print(prepped_dataf)
#prepped_dataf.drop(prepped_dataf.tail(3).index, inplace=True)  # drop last n rows
model = ar_model.AutoReg(prepped_dataf, lags=2).fit()
print(model.summary())
target_len = len(prepped_dataf['New Cases']) - 1
prediction = model.predict(start=205, end=205 + target_len)
predictions = list(prediction)
# reverse the log-difference: add each predicted delta back onto the matching
# logged level, then exponentiate to recover case counts
cases_log_diff_rev = []
for index, v in enumerate(predictions):
    v_0 = logged_vals.iloc[index] + v
    cases_log_diff_rev.append(v_0)
print(np.exp(cases_log_diff_rev))
start_date = prepped_dataf.index[0]
start_date = start_date + datetime.timedelta(days=1)
# (assumed completion of the truncated source line) one date per prediction
date_list = [start_date + datetime.timedelta(days=i) for i in range(len(predictions))]
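# The reversal loop above has a compact vectorized equivalent; a sketch
# assuming `logged_vals` aligns position-for-position with the predicted
# deltas.
deltas = np.asarray(predictions)
levels = logged_vals.to_numpy()[: len(deltas)]
recovered_cases = np.exp(levels + deltas)  # same values as np.exp(cases_log_diff_rev)
print(recovered_cases)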
import math

import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa import ar_model

# `energy`, `test_start_dt`, `energy_train` and `energy_validation` are
# assumed to be prepared earlier in the notebook
energy_test = energy[energy.index >= test_start_dt]

scaler = MinMaxScaler()
# MinMaxScaler expects 2D input, so select the column as a one-column frame
energy_train['load_scaled'] = scaler.fit_transform(energy_train[['load']])
energy_train.head(10)
# plot both distributions
sns.distplot(energy_train['load'])
sns.distplot(energy_train['load_scaled'])
energy_train1 = energy_train.copy()
energy_train1 = energy_train1.drop('load', axis=1)
energy_train1.index.freq = 'H'
# build model
estimator = ar_model.AutoReg(energy_train1, lags=5)
res = estimator.fit()
print(res.params)
# reuse the scaler fitted on the training data; refitting it on the
# validation set would leak information
energy_validation['load_scaled'] = scaler.transform(energy_validation[['load']])
energy_validation1 = energy_validation.copy()
energy_validation1 = energy_validation1.drop('load', axis=1)
energy_validation1.index.freq = 'H'
# validation error
pred = estimator.predict(res.params, energy_validation1.index[0], energy_validation1.index[-1])
print(pred)
y_pred = pred.values.reshape(-1)
y_test = energy_validation1.values.reshape(-1)
score = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
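# The score above is in scaled units; a small sketch of mapping the
# predictions back through the fitted scaler so the RMSE is in the
# original load units.
y_pred_orig = scaler.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1)
y_test_orig = scaler.inverse_transform(y_test.reshape(-1, 1)).reshape(-1)
score_orig = math.sqrt(metrics.mean_squared_error(y_test_orig, y_pred_orig))
print('validation RMSE (original units):', score_orig)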
import math
import os

import pandas as pd
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa import ar_model

path = 'F:/'
df = pd.read_csv(os.path.join(path, 'uk-deaths-from-bronchitis-emphys.csv'))
df.info()
df.columns = ['timestamp', 'y']
df.index = pd.to_datetime(df['timestamp'], format='%Y-%m').copy()
df.index.freq = 'MS'
df.drop('timestamp', axis=1, inplace=True)

# build model
estimator = ar_model.AutoReg(df, lags=5)
res = estimator.fit()
print(res.params)
print(res.model)
print(res.summary())

# use the model: in-sample predictions, starting at the lag order so the
# prediction vector lines up with the truncated targets below
predictions = estimator.predict(res.params, start=5, end=df.shape[0] - 1)
print(predictions)
y_pred = predictions.values.reshape(-1)
y_train = df.values.reshape(-1)[5:]
train_rmse = math.sqrt(metrics.mean_squared_error(y_train, y_pred))
print(train_rmse)

# evaluate model with expanding-window cross-validation
tsp = TimeSeriesSplit(n_splits=3)
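# The snippet stops after building the splitter; one way the evaluation
# could continue, a sketch that scores each expanding-window fold on
# out-of-sample predictions.
fold_scores = []
for train_ind, test_ind in tsp.split(df):
    train_fold, test_fold = df.iloc[train_ind], df.iloc[test_ind]
    fold_res = ar_model.AutoReg(train_fold, lags=5).fit()
    fold_pred = fold_res.predict(start=test_fold.index[0], end=test_fold.index[-1])
    fold_scores.append(math.sqrt(metrics.mean_squared_error(
        test_fold.values.reshape(-1), fold_pred.values.reshape(-1))))
print('cv rmse per fold:', fold_scores)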
from typing import Any

import numpy as np


def train(
    data: np.ndarray,
    used_model: str = "autoreg",
    p: int = 5,
    d: int = 1,
    q: int = 0,
    cov_type="nonrobust",
    method="cmle",
    trend="nc",
    solver="lbfgs",
    maxlag=13,
    # SARIMAX args
    seasonal=(0, 0, 0, 0),
) -> Any:
    """Autoregressive model from the statsmodels library. Univariate data only.

    Args:
        data (np.ndarray): Time series data.
        used_model (str, optional): Used model. Defaults to "autoreg".
        p (int, optional): AR order of the ARIMA model. Defaults to 5.
        d (int, optional): Differencing order of the ARIMA model. Defaults to 1.
        q (int, optional): MA order of the ARIMA model. Defaults to 0.
        cov_type: Parameter of the fit function of the particular model.
            Check statsmodels docs for more. Defaults to 'nonrobust'.
        method: Parameter of the model call or fit function of the particular
            model. Check statsmodels docs for more. Defaults to 'cmle'.
        trend: Parameter of the model call or fit function of the particular
            model. Check statsmodels docs for more. Defaults to 'nc'.
        solver: Parameter of the model call or fit function of the particular
            model. Check statsmodels docs for more. Defaults to 'lbfgs'.
        maxlag: Maximum lag considered by automatic order selection.
            Check statsmodels docs for more. Defaults to 13.
        seasonal: Seasonal order (P, D, Q, s) passed to SARIMAX.
            Check statsmodels docs for more. Defaults to (0, 0, 0, 0).

    Returns:
        statsmodels.model: Trained model.
    """
    import statsmodels.tsa.api as sm
    from statsmodels.tsa.statespace.sarimax import SARIMAX
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa import ar_model

    used_model = used_model.lower()

    if used_model == "ar":
        # legacy AR model; removed in recent statsmodels releases
        model = sm.AR(data)
        fitted_model = model.fit(method=method, trend=trend, solver=solver, disp=0)
    elif used_model == "arima":
        order = (p, d, q)
        model = ARIMA(data, order=order)
        fitted_model = model.fit()
    elif used_model == "sarimax":
        order = (p, d, q)
        model = SARIMAX(data, order=order, seasonal_order=seasonal)
        fitted_model = model.fit(method=method, trend=trend, solver=solver, disp=0)
    elif used_model == "autoreg":
        # let statsmodels pick the lag order, then refit with that configuration
        auto = ar_model.ar_select_order(data, maxlag=maxlag)
        model = ar_model.AutoReg(
            data,
            lags=auto.ar_lags,
            trend=auto.trend,
            seasonal=auto.seasonal,
            period=auto.period,
        )
        fitted_model = model.fit(cov_type=cov_type)
    else:
        raise ValueError(
            f"Used model has to be one of ['ar', 'arima', 'sarimax', 'autoreg']. You configured: {used_model}"
        )

    setattr(fitted_model, "my_name", used_model)
    setattr(fitted_model, "data_length", len(data))

    return fitted_model
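# A short usage sketch for `train` on synthetic data; the forecast call
# relies on the AutoReg results object, whose `predict` accepts
# out-of-sample start/end positions.
data = np.cumsum(np.random.randn(300))  # noisy random walk as a toy series

fitted = train(data, used_model="autoreg", maxlag=13)
print(fitted.my_name, fitted.data_length)

# forecast 10 steps past the end of the training data
forecast = fitted.predict(start=len(data), end=len(data) + 9)
print(forecast)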