def test_oob_sarimax():
    """Compare a seasonal ARIMA fit with a 15-sample hold-out to a plain fit.

    One model is fit on all of ``wineind`` with ``out_of_sample_size=15``;
    the other is fit on everything but the last 15 samples with no hold-out.
    Their OOB score, parameters and forecasts are then compared.
    """
    n_oob = 15
    order = (1, 1, 1)
    seasonal_order = (0, 1, 1, 12)
    exog = rs.rand(wineind.shape[0], 2)

    with_oob = ARIMA(order=order,
                     seasonal_order=seasonal_order,
                     out_of_sample_size=n_oob).fit(y=wineind, exogenous=exog)

    without_oob = ARIMA(order=order,
                        seasonal_order=seasonal_order,
                        out_of_sample_size=0,
                        suppress_warnings=True).fit(
        y=wineind[:-n_oob], exogenous=exog[:-n_oob, :])

    # The OOB score must match scoring the truncated model's forecasts
    # against the held-out tail.
    oob_score = with_oob.oob()
    metric = get_callable(without_oob.scoring, VALID_SCORING)
    tail_preds = without_oob.predict(n_periods=n_oob,
                                     exogenous=exog[-n_oob:, :])
    assert np.allclose(oob_score,
                       metric(wineind[-n_oob:], tail_preds),
                       rtol=1e-2)

    # Parameters are still the same
    assert np.allclose(with_oob.params(), without_oob.params(), rtol=1e-2)

    # After feeding the held-out samples to the truncated model, both
    # models should forecast the same values.
    future_exog = rs.rand(5, 2)
    without_oob.add_new_observations(wineind[-n_oob:], exog[-n_oob:, :])
    assert np.allclose(with_oob.predict(5, future_exog),
                       without_oob.predict(5, future_exog),
                       rtol=1e-2)

    # A confidence interval can be requested alongside the forecasts
    preds, conf = with_oob.predict(5, future_exog, return_conf_int=True)
    assert all(isinstance(a, np.ndarray) for a in (preds, conf))
def test_with_oob():
    """Exercise out-of-sample scoring and basic argument validation."""
    order = (2, 1, 2)

    # Fit while holding out the last 10 samples for scoring
    model = ARIMA(order=order,
                  suppress_warnings=True,
                  scoring='mse',
                  out_of_sample_size=10).fit(y=hr)
    oob_score = model.oob()
    assert not np.isnan(oob_score)

    # The stored hold-out predictions reproduce the reported score
    held_out_preds = model.oob_preds_
    assert held_out_preds.shape[0] == 10
    mse = val.get_scoring_metric('mse')
    assert mse(hr[-10:], held_out_preds) == oob_score

    # A negative out_of_sample_size still fits, and the OOB score is NaN
    model = ARIMA(order=order,
                  suppress_warnings=True,
                  out_of_sample_size=-1).fit(y=hr)
    assert np.isnan(model.oob())

    # n_periods must be an int ...
    with pytest.raises(TypeError):
        model.predict(n_periods="5")

    # ... and an int works
    _ = model.predict(n_periods=5)  # noqa: F841

    # Holding out more samples than exist must fail
    with pytest.raises(ValueError):
        ARIMA(order=order, out_of_sample_size=1000).fit(hr)
def test_basic_arima():
    """Fit an order-(0, 0, 0) model on ``y`` and pin its reference outputs."""
    model = ARIMA(order=(0, 0, 0), suppress_warnings=True)
    forecasts = model.fit_predict(y)  # fit/predict for coverage

    # Information criteria
    assert_almost_equal(model.aic(), 11.201308403566909, decimal=5)
    assert_almost_equal(model.aicc(), 11.74676, decimal=5)
    assert_almost_equal(model.bic(), 13.639060053303311, decimal=5)

    # Ten identical point forecasts are expected
    assert_array_almost_equal(forecasts, np.full(10, 0.44079876))

    # The confidence intervals are the same pair repeated for each period
    expected_intervals = np.tile([-0.10692387, 0.98852139], (10, 1))
    _, intervals = model.predict(n_periods=10,
                                 return_conf_int=True,
                                 alpha=0.05)
    assert_array_almost_equal(intervals, expected_intervals)
def test_more_elaborate():
    """Fit a non-trivial ARIMA, round-trip it through pickle, check exog rules.

    The pickle scratch file is removed in a ``finally`` clause so that a
    failing assertion does not leave ``some_temp_file.pkl`` behind in the
    working directory.
    """
    # show we can fit this with a non-zero order
    arima = ARIMA(order=(2, 1, 2), suppress_warnings=True).fit(y=hr)
    _try_get_attrs(arima)

    # can we fit this same arima with a made-up exogenous array?
    xreg = rs.rand(hr.shape[0], 4)
    arima = ARIMA(order=(2, 1, 2),
                  suppress_warnings=True).fit(y=hr, exogenous=xreg)
    _try_get_attrs(arima)

    # pickle this for the __get/setattr__ coverage.
    # since the only time this is tested is in parallel in auto.py,
    # this doesn't actually get any coverage proof...
    fl = 'some_temp_file.pkl'
    try:
        with open(fl, 'wb') as p:
            pickle.dump(arima, p)

        # show we can predict with this even though it's been pickled
        new_xreg = rs.rand(5, 4)
        _preds = arima.predict(n_periods=5, exogenous=new_xreg)

        # now unpickle
        with open(fl, 'rb') as p:
            other = pickle.load(p)

        # show we can still predict, compare
        _other_preds = other.predict(n_periods=5, exogenous=new_xreg)
        assert_array_almost_equal(_preds, _other_preds)
    finally:
        # always remove the pickle file, even when an assertion above failed
        if os.path.exists(fl):
            os.unlink(fl)

    # now show that since we fit the ARIMA with an exogenous array,
    # we need to provide one for predictions otherwise it breaks.
    with pytest.raises(ValueError):
        arima.predict(n_periods=5, exogenous=None)

    # show that if we DO provide an exogenous and it's the wrong dims, we
    # also break things down.
    with pytest.raises(ValueError):
        arima.predict(n_periods=5, exogenous=rs.rand(4, 4))
def test_with_seasonality1():
    """Fit a seasonal model on ``wineind`` and compare it to R's criteria."""
    fit = ARIMA(order=(1, 1, 1),
                seasonal_order=(0, 1, 1, 12),
                suppress_warnings=True).fit(y=wineind)
    _try_get_attrs(fit)

    # Approximate results from the R code: AIC ~3004, AICc ~3005, BIC ~3017.
    # Each criterion only has to land within 100 of the reference.
    r_reference = (
        (fit.aic, 3004),
        (fit.aicc, 3005),
        (fit.bic, 3017),
    )
    for criterion, r_value in r_reference:
        assert abs(criterion() - r_value) < 100

    # show we can predict in-sample
    fit.predict_in_sample()

    # test with SARIMAX confidence intervals
    fit.predict(n_periods=10, return_conf_int=True, alpha=0.05)
def test_oob_for_issue_29(self, d, cv, exog):
    """Fit on ``self.dta`` for one (d, cv, exog) combination and forecast.

    ``d`` is the differencing order, ``cv`` the out-of-sample size and
    ``exog`` an optional exogenous array.
    """
    model = ARIMA(order=(2, d, 0),
                  out_of_sample_size=cv).fit(self.dta, exogenous=exog)

    # A model fit with exogenous data needs n_periods rows of it to
    # forecast; otherwise nothing is passed.
    xr = None if exog is None else exog[:3, :]

    _, _ = model.predict(n_periods=3, return_conf_int=True, exogenous=xr)
def test_oob_sarimax():
    """Compare a seasonal fit with a 15-sample hold-out to a truncated fit.

    Both models use ``maxiter=5``. Their parameters are expected to differ
    until the truncated model is updated with the held-out samples.
    """
    n_oob = 15
    shared = dict(order=(1, 1, 1), seasonal_order=(0, 1, 1, 12), maxiter=5)
    exog = rs.rand(wineind.shape[0], 2)

    with_oob = ARIMA(out_of_sample_size=n_oob,
                     **shared).fit(y=wineind, exogenous=exog)

    without_oob = ARIMA(out_of_sample_size=0,
                        suppress_warnings=True,
                        **shared).fit(y=wineind[:-n_oob],
                                      exogenous=exog[:-n_oob, :])

    # The OOB score must match scoring the truncated model's forecasts
    # against the held-out tail.
    oob_score = with_oob.oob()
    metric = val.get_scoring_metric(without_oob.scoring)
    tail_preds = without_oob.predict(n_periods=n_oob,
                                     exogenous=exog[-n_oob:, :])
    assert np.allclose(oob_score,
                       metric(wineind[-n_oob:], tail_preds),
                       rtol=1e-2)

    # Parameters are no longer the same
    assert not np.allclose(with_oob.params(),
                           without_oob.params(),
                           rtol=1e-2)

    # After updating the truncated model with the held-out samples, the
    # forecasts should agree ...
    future_exog = rs.rand(5, 2)
    without_oob.update(wineind[-n_oob:], exog[-n_oob:, :])
    assert np.allclose(with_oob.predict(5, future_exog),
                       without_oob.predict(5, future_exog),
                       rtol=1e-2)

    # ... and so should the parameters
    assert np.allclose(with_oob.params(), without_oob.params())

    # A confidence interval can be requested alongside the forecasts
    preds, conf = with_oob.predict(5, future_exog, return_conf_int=True)
    assert all(isinstance(a, np.ndarray) for a in (preds, conf))
def test_oob_for_issue_29():
    """Fit and forecast on the sunspots data for every (d, cv, exog) combo.

    Any failure is logged with its combination. Failures whose message
    mentions "invertibility" are tolerated; everything else is re-raised.
    """
    dta = sm.datasets.sunspots.load_pandas().data
    dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    del dta["YEAR"]

    xreg = np.random.RandomState(1).rand(dta.shape[0], 3)

    # Try for cv on/off, various D levels, and various Xregs
    for d in (0, 1):
        for cv in (0, 3):
            for exog in (xreg, None):
                try:
                    model = ARIMA(order=(2, d, 0),
                                  out_of_sample_size=cv).fit(dta,
                                                             exogenous=exog)

                    # With exogenous data we must hand n_periods rows of
                    # it to predict; otherwise pass None.
                    xr = None if exog is None else exog[:3, :]

                    _, _ = model.predict(n_periods=3,
                                         return_conf_int=True,
                                         exogenous=xr)

                except Exception as ex:
                    # log the failing combo before deciding what to do
                    print("Failing combo: d=%i, cv=%i, exog=%r"
                          % (d, cv, exog))

                    # Statsmodels can be fragile with ARMA coefficient
                    # computation, e.g.:
                    #   ValueError: The computed initial MA coefficients are
                    #   not invertible. You should induce invertibility,
                    #   choose a different model order, or ...
                    # Only that failure is tolerated.
                    if "invertibility" not in str(ex):
                        raise
def test_basic_arma():
    """Fit an order-(0, 0, 0) model on ``y`` and pin its reference outputs."""
    model = ARIMA(order=(0, 0, 0), suppress_warnings=True)
    forecasts = model.fit_predict(y)  # fit/predict for coverage

    # No out-of-sample size was requested, so no OOB predictions exist
    assert model.oob_preds_ is None

    # AIC, equivalent in R
    assert_almost_equal(model.aic(), 11.201, decimal=3)

    # The intercept is parameter 0; equivalent in R
    intercept = model.params()[0]
    assert_almost_equal(intercept, 0.441, decimal=3)

    assert_almost_equal(model.aicc(), 11.74676, decimal=5)
    assert_almost_equal(model.bic(), 13.639060053303311, decimal=5)

    # Ten identical point forecasts are expected
    assert_array_almost_equal(forecasts, np.full(10, 0.44079876))

    # The confidence intervals are the same pair repeated for each period
    expected_intervals = np.tile([-0.10692387, 0.98852139], (10, 1))
    _, intervals = model.predict(n_periods=10,
                                 return_conf_int=True,
                                 alpha=0.05)
    assert_array_almost_equal(intervals, expected_intervals)
def test_with_oob():
    """Exercise out-of-sample scoring and basic argument validation."""
    order = (2, 1, 2)

    # Fit while holding out the last 10 samples; the OOB score is a number
    model = ARIMA(order=order,
                  suppress_warnings=True,
                  out_of_sample_size=10).fit(y=hr)
    assert not np.isnan(model.oob())

    # A negative out_of_sample_size still fits, and the OOB score is NaN
    model = ARIMA(order=order,
                  suppress_warnings=True,
                  out_of_sample_size=-1).fit(y=hr)
    assert np.isnan(model.oob())

    # n_periods must be an int ...
    assert_raises(TypeError, model.predict, n_periods="5")

    # ... and an int works
    _ = model.predict(n_periods=5)  # noqa: F841

    # Holding out more samples than exist must fail
    oversized = ARIMA(order=order, out_of_sample_size=1000)
    assert_raises(ValueError, oversized.fit, hr)
def ragged_fill_series(
    series,
    function=np.nanmean,
    backup_fill_method=np.nanmean,
    est_series=None,
    fitted_arma=None,
    arma_full_series=None,
):
    """Filling in the ragged ends of a series, adhering to the periodicity of the series. If there is only one observation and periodicity cannot be determined, series will be returned unchanged.

    parameters:
        :series: list/pandas Series: the series to fill the ragged edges of. Missings should be np.nans
        :function: the function to fill nas with (e.g. np.nanmean, etc.). Use "ARMA" for ARMA filling
        :backup_fill_method: function: which function to fill ragged edges with in case ARMA can't be estimated
        :est_series: list/pandas Series: optional, the series to calculate the fillna and/or ARMA function on. Should not have nas filled in yet by any method. E.g. a train set. If None, will be calculated based on the series itself.
        :fitted_arma: optional, fitted ARMA model if available to avoid reestimating every time in the `gen_ragged_X` function
        :arma_full_series: optional, for_full_arma_dataset output of `gen_dataset` function. Fitting the ARMA model on the full series history rather than just the series provided

    output:
        :return: tuple of (pandas Series with filled ragged edges, the fitted ARMA model that was passed in or estimated here; None if neither)
    """
    # All index work is done on the Series copy so that plain lists are
    # accepted too (a list has no `.index`).
    result = pd.Series(series).copy()
    if est_series is None:
        est_series = result.copy()

    observed = result[result.notna()]
    nonna_indices = list(observed.index)  # existing indices with values

    # if there is only one non-na observation, can't determine periodicity or
    # position in full series, don't fill anything
    if len(nonna_indices) > 1:
        # how often data comes (quarterly, monthly, etc.): the most common
        # gap between consecutive observed index labels
        observed_labels = pd.Series(observed.index)
        periodicity = int((observed_labels - observed_labels.shift()).mode()[0])

        last_nonna = nonna_indices[-1]
        # NOTE(review): the index label is used as a position here, which
        # assumes a 0-based integer index - confirm against callers.
        n_trailing = len(result) - last_nonna

        # indices to be filled in, including only the correct periodicity
        fill_indices = nonna_indices + [
            int(last_nonna + periodicity * i) for i in range(1, n_trailing)
        ]
        # cut down on the indices if went too long
        fill_indices = [x for x in fill_indices if x in result.index]

        if function == "ARMA":
            # estimate the model if not given
            if fitted_arma is None:
                fitted_arma = estimate_arma(est_series)
            # instantiate model with previously estimated parameters
            # (i.e. on train set)
            arma = ARIMA(order=fitted_arma.order)
            arma.set_params(**fitted_arma.get_params())

            if arma_full_series is not None:
                # refit the model on the full series up to this point
                y = list(arma_full_series[~pd.isna(arma_full_series)])
                present = list(observed)
                # limit the full history to the point where the observed
                # values of this series end (last matching window wins)
                end_index = 0
                for i in range(len(present), len(y) + 1):
                    if list(y[(i - len(present)):i]) == present:
                        end_index = i
                y = y[:end_index]
            else:
                # refit the model on just this series
                y = list(observed)
                present = y.copy()

            # can fail if not enough datapoints for order of ARMA process
            try:
                arma.fit(y, error_action="ignore")
                preds = arma.predict(n_periods=int(n_trailing))
                fills = list(present) + list(preds)
            except Exception:
                # fall back to the simple fill when the ARMA fit/forecast fails
                fills = list(observed) + [backup_fill_method(est_series)] * n_trailing
        else:
            fills = list(observed) + [function(est_series)] * n_trailing

        result[fill_indices] = fills[:len(fill_indices)]

    return result, fitted_arma
def test_oob_for_issue_28():
    """Check OOB scoring with an exogenous array against a truncated fit.

    One model is fit on all of ``hr`` with the last 10 samples held out;
    the other is fit on ``hr[:-10]`` with no hold-out and later receives the
    missing samples through ``add_new_observations``.
    """
    n_oob = 10
    order = (2, 1, 2)

    exog = rs.rand(hr.shape[0], 4)
    with_oob = ARIMA(order=order,
                     suppress_warnings=True,
                     out_of_sample_size=n_oob).fit(y=hr, exogenous=exog)

    oob_score = with_oob.oob()
    assert not np.isnan(oob_score)

    # Endog shapes: the stored data equals the original, and the model's
    # (differenced) array is one sample shorter.
    fitted = with_oob.arima_res_
    assert np.allclose(fitted.data.endog, hr, rtol=1e-2)
    assert fitted.model.endog.shape[0] == hr.shape[0] - 1

    # Same for exog
    assert np.allclose(fitted.data.exog, exog, rtol=1e-2)
    assert fitted.model.exog.shape[0] == exog.shape[0] - 1

    # An equivalent fit on the data minus the hold-out, without OOB scoring,
    # must reproduce the OOB score when scored on the held-out tail.
    without_oob = ARIMA(order=order,
                        suppress_warnings=True,
                        out_of_sample_size=0).fit(y=hr[:-n_oob],
                                                  exogenous=exog[:-n_oob, :])

    metric = get_callable(without_oob.scoring, VALID_SCORING)
    tail_preds = without_oob.predict(n_periods=n_oob,
                                     exogenous=exog[-n_oob:, :])
    assert np.allclose(oob_score, metric(hr[-n_oob:], tail_preds), rtol=1e-2)

    # Model parameters agree
    future_exog = rs.rand(5, 4)
    assert np.allclose(with_oob.params(), without_oob.params(), rtol=1e-2)

    # The forecasts differ before the truncated model sees the tail
    with_oob_forecasts = with_oob.predict(n_periods=5, exogenous=future_exog)
    no_oob_forecasts = without_oob.predict(n_periods=5,
                                           exogenous=future_exog)
    assert_raises(AssertionError, assert_array_almost_equal,
                  with_oob_forecasts, no_oob_forecasts)

    # Adding observations must fail without exogenous data ...
    tail = hr[-n_oob:]
    assert_raises(ValueError, without_oob.add_new_observations, tail, None)

    # ... with a mismatched number of exogenous rows ...
    assert_raises(ValueError, without_oob.add_new_observations,
                  tail, future_exog)

    # ... and with a different number of exogenous columns
    assert_raises(ValueError, without_oob.add_new_observations,
                  tail, future_exog[:, :2])

    # Actually add them now; the forecasts should then agree
    without_oob.add_new_observations(tail, exog[-n_oob:, :])
    assert np.allclose(with_oob_forecasts,
                       without_oob.predict(n_periods=5,
                                           exogenous=future_exog),
                       rtol=1e-2)
class ARIMAModel(BaseModel):
    """Wrapper that builds, fits, evaluates and checkpoints an ``ARIMA`` model."""

    def __init__(self):
        """
        Initialize Model
        """
        self.seasonal = True
        self.metric = 'mse'
        self.model = None
        self.model_init = False

    def _build(self, **config):
        """
        Build the model and initialize.

        :param config: hyperparameters for the model. Recognised keys are
            ``p``, ``d``, ``q``, ``seasonality_mode``, ``P``, ``D``, ``Q``,
            ``m`` and ``metric``.
        """
        p = config.get('p', 2)
        d = config.get('d', 0)
        q = config.get('q', 2)
        self.seasonal = config.get('seasonality_mode', True)
        P = config.get('P', 1)
        D = config.get('D', 0)
        Q = config.get('Q', 1)
        m = config.get('m', 7)
        self.metric = config.get('metric', self.metric)

        order = (p, d, q)
        # A non-seasonal model gets an all-zero seasonal order
        seasonal_order = (P, D, Q, m) if self.seasonal else (0, 0, 0, 0)

        self.model = ARIMA(order=order,
                           seasonal_order=seasonal_order,
                           suppress_warnings=True)

    def fit_eval(self, data, validation_data, **config):
        """
        Fit on the training data from scratch.

        On the first call the differencing terms are estimated from the data
        and override any ``d`` / ``D`` given in ``config``.

        :param data: A 1-D numpy array as the training data
        :param validation_data: A 1-D numpy array as the evaluation data
        :return: a dict mapping the metric name to its value on the
            validation data
        """
        if not self.model_init:
            # Estimating differencing term (d) and seasonal differencing term (D)
            kpss_diffs = ndiffs(data, alpha=0.05, test='kpss', max_d=6)
            adf_diffs = ndiffs(data, alpha=0.05, test='adf', max_d=6)
            d = max(adf_diffs, kpss_diffs)

            # Estimate D with the seasonal period the model will actually be
            # built with, rather than a hard-coded 7, and skip the estimate
            # when the configuration asks for a non-seasonal model.
            seasonal = config.get('seasonality_mode', True)
            m = config.get('m', 7)
            # NOTE(review): periods below 2 are skipped on the assumption
            # that nsdiffs rejects them - confirm against pmdarima.
            D = nsdiffs(data, m=m, max_D=12) if seasonal and m > 1 else 0
            config.update(d=d, D=D)

            self._build(**config)
            self.model_init = True

        self.model.fit(data)
        val_metric = self.evaluate(x=None,
                                   target=validation_data,
                                   metrics=[self.metric])[0].item()
        return {self.metric: val_metric}

    def predict(self, x=None, horizon=24, update=False, rolling=False):
        """
        Predict horizon time-points ahead of the training data used in fit_eval.

        :param x: ARIMA predicts the horizon steps forward from the training
            data, so x must be None as it is not used.
        :param horizon: the number of steps forward to predict
        :param update: whether to keep the model updates made during rolling
            prediction
        :param rolling: whether to use rolling prediction (one step at a
            time, updating the model with each forecast)
        :return: predicted result of length horizon; a list of floats when
            rolling, otherwise whatever the underlying model returns
        :raises ValueError: if x is not None
        :raises Exception: if update is requested without rolling, or if no
            model has been fitted or restored
        """
        if x is not None:
            raise ValueError("x should be None")
        if update and not rolling:
            raise RuntimeError(
                "We don't support updating model without rolling prediction currently"
            )
        if self.model is None:
            raise RuntimeError(
                "Needs to call fit_eval or restore first before calling predict"
            )

        if not rolling:
            return self.model.predict(n_periods=horizon)

        # When the caller does not want the updates kept, snapshot the model
        # in memory and put it back afterwards - also when a step fails.
        # This avoids a fixed "tmp.pkl" in the working directory, which
        # could collide between concurrent runs and was left behind on error.
        snapshot = None if update else pickle.dumps(self.model)
        forecasts = []
        try:
            for _ in range(horizon):
                fc = self.model.predict(n_periods=1).item()
                forecasts.append(fc)
                # Updates the existing model with a small number of MLE
                # steps for rolling prediction
                self.model.update(fc)
        finally:
            if snapshot is not None:
                self.model = pickle.loads(snapshot)
        return forecasts

    def evaluate(self, target, x=None, metrics=('mse',), rolling=False):
        """
        Evaluate the forecasts against target.

        ``len(target)`` steps are predicted ahead of the training data used
        in fit_eval and compared with target.

        :param target: target for evaluation.
        :param x: ARIMA predicts the horizon steps forward from the training
            data, so x must be None as it is not used.
        :param metrics: an iterable of metrics in string format
        :param rolling: whether to use rolling prediction
        :return: a list of metric evaluation results
        :raises ValueError: if x is not None or target is None
        :raises Exception: if no model has been fitted or restored
        """
        if x is not None:
            raise ValueError("We don't support input x currently")
        if target is None:
            raise ValueError("Input invalid target of None")
        if self.model is None:
            raise RuntimeError(
                "Needs to call fit_eval or restore first before calling evaluate"
            )

        forecasts = self.predict(horizon=len(target), rolling=rolling)
        return [Evaluator.evaluate(m, target, forecasts) for m in metrics]

    def save(self, checkpoint_file):
        """
        Pickle the underlying model to checkpoint_file.

        :param checkpoint_file: path of the file to write
        :raises Exception: if no model has been fitted or restored
        """
        if self.model is None:
            raise RuntimeError(
                "Needs to call fit_eval or restore first before calling save")
        with open(checkpoint_file, 'wb') as fout:
            pickle.dump(self.model, fout)

    def restore(self, checkpoint_file):
        """
        Load the underlying model from checkpoint_file.

        :param checkpoint_file: path of a file written by save
        """
        # NOTE: pickle.load executes arbitrary code from the file; only
        # restore checkpoints from a trusted source.
        with open(checkpoint_file, 'rb') as fin:
            self.model = pickle.load(fin)
        self.model_init = True
class Model:
    """ARIMA(X) forecasting pipeline: select data, normalize, fit, forecast.

    State is passed between the steps through instance attributes
    (``endo_obs``, ``exo_obs``, ``exo_prev``, ``dates_prev``, ``endo_obs2``,
    ``lambda_boxcox``, ``arima_model``, ``parameters``, ``predict``).
    """

    # Sentinel stored in ``lambda_boxcox`` when no Box-Cox transform is used
    NO_BOXCOX = -999

    def _boxcox_lambda(self):
        """Return ``lambda_boxcox`` as a float, whether scalar or 1-element array."""
        return float(np.ravel(self.lambda_boxcox)[0])

    def select_data(self):
        """Split the input frames into observed and forecast-period arrays."""
        # Merge columns into a single dataframe of observed values, based on date
        dataset = files.data_main.join(files.data_exo.set_index('date'),
                                       on='date').dropna()

        # Select part of the exogenous dataframe that corresponds to the
        # forecast, i.e. the rows dated after the last observation
        obs_end = dataset.tail(1)['date'].values[0]
        exo_prev = files.data_exo[(files.data_exo['date'] > obs_end)]

        # Select predict dates
        self.dates_prev = exo_prev['date']

        # Reshape everything into single-column 2-D arrays
        self.endo_obs = np.array(dataset['endo_value']).reshape(-1, 1)
        self.exo_obs = np.array(dataset['exo_value']).reshape(-1, 1)
        self.exo_prev = np.array(exo_prev['exo_value']).reshape(-1, 1)

    def normalize(self):
        """Box-Cox transform ``endo_obs`` into ``endo_obs2`` when possible.

        The raw data is kept, and ``lambda_boxcox`` is set to the -999
        sentinel, when the data contains non-positive values or when the
        fitted lambda falls outside [-1, 1]. Previously the non-positive
        case crashed on subscripting the integer sentinel and never set
        ``endo_obs2``.
        """
        self.endo_obs2 = self.endo_obs
        self.lambda_boxcox = self.NO_BOXCOX

        # Calculate lambda only if there are no zero/negative values
        if np.any(self.endo_obs <= 0):
            return

        # boxcox is fitted on the flattened data, because recent SciPy
        # releases reject non 1-D input; the column shape is restored below.
        transformed, lmbda = boxcox(np.ravel(self.endo_obs))

        # Limit lambda values
        if abs(float(np.ravel(lmbda)[0])) > 1:
            return

        self.endo_obs2 = transformed.reshape(-1, 1)
        # Kept as a 1-element array so ``lambda_boxcox[0]`` keeps working
        self.lambda_boxcox = np.atleast_1d(lmbda)

    def run_auto(self):
        """Fit with ``auto_arima`` and record the selected parameters.

        :return: the fitted model (also stored in ``arima_model``)
        """
        # The exogenous keyword was previously misspelled "exogeneous", so it
        # was not received as the exogenous array. It is spelled here the way
        # run_auto_arimax passes it to ARIMA.fit.
        # NOTE(review): start_d does not look like an auto_arima parameter
        # (d / max_d are) - confirm it is not silently forwarded elsewhere.
        self.arima_model = auto_arima(self.endo_obs2,
                                      start_p=0,
                                      start_d=0,
                                      start_q=0,
                                      max_p=3,
                                      max_d=1,
                                      max_q=3,
                                      start_P=0,
                                      start_Q=0,
                                      D=1,
                                      seasonal=False,
                                      m=1,
                                      exogenous=self.exo_obs,
                                      trace=True,
                                      error_action='ignore',
                                      suppress_warnings=True,
                                      stepwise=True)

        # Compile parameters to list
        self.parameters = [
            self.arima_model.order,
            self.arima_model.seasonal_order,
            self._boxcox_lambda(),
            self.arima_model.aic(),
        ]
        print(self.parameters)
        return self.arima_model

    def run_auto_arimax(self):
        """Grid-search (p, d, q) for the ARIMAX fit with the lowest AIC.

        The model with the lowest AIC is stored in ``arima_model``.
        Previously the last model that happened to fit was stored, which
        did not necessarily match the reported ``best_pdq``.

        :return: the selected fitted model
        """
        lower_aic = float(99999)
        best_pdq = [0, 0, 0]
        best_model = None
        last_model = None

        for pdq in itertools.product(range(0, 4), range(0, 2), range(0, 4)):
            try:
                candidate = ARIMA(order=pdq, suppress_warnings=True).fit(
                    y=self.endo_obs2, exogenous=self.exo_obs)
                candidate_aic = candidate.aic()
            except Exception:
                # orders that cannot be fitted are skipped
                continue
            last_model = candidate
            if candidate_aic < lower_aic:
                lower_aic = candidate_aic
                best_pdq = tuple(candidate.order)
                best_model = candidate

        # Fall back to the last fitted model if none beat the initial bound
        chosen = best_model if best_model is not None else last_model
        if chosen is not None:
            self.arima_model = chosen

        # Compile parameters to list
        self.parameters = [best_pdq, self._boxcox_lambda(), lower_aic]
        print(self.parameters)
        return self.arima_model

    def run_auto_sarimax(self):
        """Grid-search seasonal and non-seasonal orders with statsmodels SARIMAX.

        The fitted result with the lowest AIC is stored in ``arima_model``.
        ``aic`` is read as an attribute of the statsmodels result; it was
        previously called like a method, which raised inside the
        ``try``/``except`` so that no best model was ever recorded.

        :return: the selected fitted result
        """
        # NOTE(review): forecast() calls predict(n_periods=..., exogenous=...),
        # which looks like the pmdarima interface rather than the statsmodels
        # one - confirm this method's result is usable with forecast().
        lower_aic = float(99999)
        best_pdq = [0, 0, 0]
        best_spdq = [0, 0, 0]
        best_model = None
        last_model = None

        m = 1  # frequency
        orders = list(
            itertools.product(range(0, 4), range(0, 2), range(0, 4)))
        seasonal_orders = [(x[0], x[1], x[2], m) for x in orders]

        for pdq in orders:
            for spdq in seasonal_orders:
                try:
                    mod = sm.tsa.statespace.SARIMAX(
                        self.endo_obs2,
                        exog=self.exo_obs,
                        order=pdq,
                        seasonal_order=spdq,
                        enforce_stationarity=False,
                        enforce_invertibility=False)
                    candidate = mod.fit(disp=0)
                    candidate_aic = candidate.aic
                except Exception:
                    # combinations that cannot be fitted are skipped
                    continue
                last_model = candidate
                print('ARIMA{}x{}{} - AIC:{}'.format(
                    pdq, spdq, m, candidate_aic))
                if candidate_aic < lower_aic:
                    lower_aic = candidate_aic
                    best_pdq = tuple(pdq)
                    best_spdq = tuple(spdq)
                    best_model = candidate

        # Fall back to the last fitted model if none beat the initial bound
        chosen = best_model if best_model is not None else last_model
        if chosen is not None:
            self.arima_model = chosen

        # Compile parameters to list
        self.parameters = [
            best_pdq, best_spdq, self._boxcox_lambda(), lower_aic
        ]
        print(self.parameters)
        return self.arima_model

    def forecast(self):
        """Forecast one step per row of ``exo_prev``, with confidence bounds.

        The (forecasts, intervals) pair is stored in ``self.predict``.
        """
        # NOTE(review): alpha=0.7 is an unusually large alpha for a
        # confidence interval - confirm it is intended.
        self.predict = self.arima_model.predict(
            n_periods=self.exo_prev.shape[0],
            exogenous=self.exo_prev,
            return_conf_int=True,
            alpha=0.7)

    def renormalize(self):
        """Undo the Box-Cox transform on the forecasts and their bounds.

        Sets ``predict_mean``, ``predict_down`` and ``predict_up``.

        :return: a DataFrame with one ``endo_value`` column, indexed by the
            forecast dates
        """
        mean, bounds = self.predict[0], self.predict[1]
        lower, upper = bounds[:, 0], bounds[:, 1]

        if self._boxcox_lambda() == float(self.NO_BOXCOX):
            # no transform was applied, the forecasts are already in scale
            self.predict_mean = mean
            self.predict_down = lower
            self.predict_up = upper
        else:
            self.predict_mean = inv_boxcox(mean, self.lambda_boxcox)
            self.predict_down = inv_boxcox(lower, self.lambda_boxcox)
            self.predict_up = inv_boxcox(upper, self.lambda_boxcox)

        # Join predict dates with values into a dataframe
        df_final = pd.DataFrame(self.predict_mean, self.dates_prev)
        df_final.columns = ['endo_value']
        return df_final
# Plot Residuals and fitted values # plt.figure() # fitted_values = arima.predict_in_sample() # plt.plot(df.index[:train_len - 1], fitted_values, # color='C0', label="Fitted values") # plt.plot(pd.to_datetime(df.index), data, color='C1', label="Data") # plt.plot(df.index[:train_len - 1], arima.resid(), # color='C2', label="Residuals") # plt.gca().grid(which='both', axis='x', linestyle='--') # plt.title("Residuals and fitted values") # plt.legend() print("SSE: {}".format((arima.resid()**2).sum())) # Plot fitted values and forecasts predictions = arima.predict(n_periods=test.shape[0]) fitted_values = arima.predict_in_sample() plt.figure() plt.plot(df.index[train_len:], test, '--', color='C0', label="test set") plt.plot(df.index[train_len:], predictions, '--', color='C1', label="forecasted values") plt.plot(df.index[:train_len], train, color='C0', label="train set") plt.plot(df.index[:train_len - 1], fitted_values, color='C1', label="fitted values") plt.legend() plt.title("Fitted values and forecasts")
def predict_arima(df):
    """Fit a seasonal ARIMA on ``df`` and forecast the length of a 30% tail.

    The function:

    1. tries to compare ``df`` with the forecast persisted by the previous
       call (best effort; any failure only prints a message),
    2. saves ACF/PACF plots and persists ``df`` to ``arima.pickle``,
    3. estimates the differencing terms and derives p and q from the
       ACF/PACF values of ``df["memory_used"]``,
    4. fits the model, persists the forecast to ``forecast.pickle`` and
       plots it.

    :param df: data to model; the ``memory_used`` column is read
    :return: a DataFrame with a ``prediction`` column indexed like the test
        split, or None if fitting/forecasting failed
    """
    time_in = current_milli_time()

    def _mean_or_zero(values):
        # average of a possibly empty list, without dividing by zero
        return sum(values) / len(values) if values else 0.0

    # --- Errors of the previously persisted forecast (best effort) ---
    # NOTE: pickle.load executes arbitrary code from the file; these pickles
    # are expected to be written by this function only.
    try:
        with open("forecast.pickle", "rb") as forecast_in:
            future_forecast = pickle.load(forecast_in)
        # A call to ``forecast_in.append(df)`` used to follow here; file
        # objects have no ``append``, so it always raised and the error
        # computation below never ran.

        # NOTE(review): the forecast persisted further down is stored under
        # a "prediction" column, so the "memory_used" lookups on it may
        # raise KeyError (handled below) - confirm which column is intended.
        if len(df) < len(future_forecast):
            error = df["memory_used"] - future_forecast[:len(df)]["memory_used"]
        elif len(df) > len(future_forecast):
            error = df[0:len(future_forecast)]["memory_used"] - future_forecast["memory_used"]
        else:
            error = df["memory_used"] - future_forecast["memory_used"]

        # Overestimation & Underestimation errors
        overestimation = _mean_or_zero([x for x in error if x < 0])
        underestimation = _mean_or_zero([x for x in error if x >= 0])
        print("UNDERESTIMATION ERROR: " + str(underestimation))
        print("OVERESTIMATION ERROR: " + str(overestimation))
        print("Mean Absolute Error in Last iteration " + str(error))
    except Exception:
        # nothing to compare against yet (or the comparison failed)
        print("RMSE To be computed")

    # --- Diagnostic plots ---
    try:
        pm.plot_pacf(df, show=False).savefig('pacf.png')
        pm.plot_acf(df, show=False).savefig('acf.png')
    except Exception:
        print("Data points insufficient for ACF & PACF")

    # --- Persist the current data ---
    # NOTE(review): the loaded data is appended to but never used or written
    # back; the file always ends up holding just ``df``. The original
    # layout of this section was ambiguous - confirm the intent.
    try:
        with open("arima.pickle", "rb") as pickle_in:
            arima_data = pickle.load(pickle_in)
        arima_data.append(df)
    except Exception:
        with open("arima.pickle", "wb") as arima_data_out:
            pickle.dump([], arima_data_out)
    with open("arima.pickle", "wb") as arima_data_out:
        pickle.dump(df, arima_data_out)

    # --- Differencing tests ---
    nd = 1
    nsd = 1
    try:
        adf_test = ADFTest(alpha=0.05)
        # result unused; the call is kept since a failure here resets nd
        adf_test.is_stationary(df["memory_used"])
        nd = ndiffs(df, test='adf')
        logging.info(nd)
        nsd = nsdiffs(df, 12)
        logging.info(nsd)  # previously logged nd a second time
    except Exception:
        nd = 1
        print("Exception on tests")

    ch_test = CHTest(12)
    try:
        nsd = ch_test.estimate_seasonal_differencing_term(df)
    except Exception as e:
        print(e)
        logging.error(e)

    # --- ARIMA model: find p, q dynamically ---
    # p is the number of ACF values at or above the threshold, capped at 4
    acf_lags = acf(df["memory_used"])
    acf_lags_threshold = [x for x in acf_lags if x >= getThreshold()]
    p = min(len(acf_lags_threshold), 4)

    # q is the number of PACF values at or above the threshold, capped at 1
    pacf_lags = pacf(df["memory_used"])
    pacf_lags_threshold = [x for x in pacf_lags if x >= getThreshold()]
    q = min(len(pacf_lags_threshold), 1)

    d = nd
    # only the size/index of the test split is used below
    _, test = train_test_split(df, shuffle=False, test_size=0.3)

    # If data is seasonal set the values of P,D,Q in seasonal order
    stepwise_model = ARIMA(
        order=(p, d, q),
        seasonal_order=(0, nsd, 0, 12),
        suppress_warnings=True,
        scoring='mse'
    )
    x = str(p) + " " + str(nd) + " " + str(q)
    # the message used to print q in place of p
    print("Model with p=" + str(p) + " d=" + str(d) + " q=" + str(q))

    try:
        stepwise_model.fit(df)
        # Vary the periods as per the forecasting window
        #   n_periods= 30 = 5mins
        #   n_periods= 60 = 10mins
        #   n_periods= 90 = 15mins
        future_forecast = stepwise_model.predict(n_periods=len(test))
        future_forecast = pd.DataFrame(future_forecast,
                                       index=test.index,
                                       columns=["prediction"])
        res = pd.concat([df, future_forecast], axis=1)

        # Save Forecast in Pickle
        with open("forecast.pickle", "wb") as forecast_out:
            pickle.dump(future_forecast, forecast_out)

        trace1 = go.Scatter(x=res.index, y=res["prediction"],
                            name="Prediction", mode='lines')
        trace2 = go.Scatter(x=df.index, y=df["memory_used"],
                            name="DF data", mode='lines')
        data = [trace1, trace2]
        layout = go.Layout(
            title=x
        )
        fig = go.Figure(data=data, layout=layout)
        plot(fig, filename="prediction")

        print("Current values")
        print(df)
        print("Predicted Data Points")
        print(future_forecast)
        time_out = current_milli_time()
        print("TIME for RNN(ms):" + str(time_out - time_in))
        return future_forecast
    except Exception as e:
        time_out = current_milli_time()
        print("TIME for RNN(ms):" + str(time_out - time_in))
        print(e)
        return None
def model_plot(days):
    """Build the New York electricity consumption forecast figure.

    Daily consumption for 2016-2019 is modelled with a seasonal ARIMA and
    exogenous regressors (weekend flag, minimum temperature and Fourier
    terms). The model is fit on all but the last 100 days, then forecasts
    ``days`` steps five times, being updated with the actual values between
    forecasts.

    :param days: number of days per forecast step (converted with ``int``)
    :return: a plotly figure with the in-sample fit and the forecasts
    """
    horizon = int(days)
    pd.plotting.register_matplotlib_converters()

    # Consumption, resampled to daily sums on a timezone-naive index
    df = pd.read_csv('data/new_york.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')
    df = df.resample('D').sum()
    df = df.tz_localize(None)

    # Weather, indexed by date
    nyc_weather = pd.read_csv('data/weather/weatherNY.csv')
    nyc_weather['DATE'] = pd.to_datetime(nyc_weather['DATE'])
    nyc_weather = nyc_weather.set_index('DATE')
    nyc_weather = nyc_weather.drop(['NAME', 'STATION'], axis=1)
    nyc_weather = nyc_weather['2015-07-01':'2020-08-10']

    df = df[:'2020-08-10']

    # Restrict both sources to 2016-2019
    df1 = df["2016":"2019"]
    nyc_weather = nyc_weather["2016":"2019"]
    y = df1.Consumption

    # Exogenous regressors on a daily PeriodIndex
    exog = pd.DataFrame({'date': y.index})
    exog = exog.set_index(pd.PeriodIndex(exog['date'], freq='D'))
    exog['is_weekend'] = np.where(exog.index.dayofweek < 5, 0, 1)
    # add weather data
    exog['TMIN'] = nyc_weather['TMIN'].values
    # Fourier terms: two harmonics each for periods of 638 and 516 days
    day_of_year = exog.index.dayofyear
    fourier_terms = ((2, 638), (4, 638), (2, 516), (4, 516))
    for k, (multiple, period) in enumerate(fourier_terms, start=1):
        angle = multiple * np.pi * day_of_year / period
        exog['sin%d' % k] = np.sin(angle)
        exog['cos%d' % k] = np.cos(angle)
    exog = exog.drop(columns=['date'])

    # Everything but the last 100 days is used for training
    split = len(y) - 100
    y_to_train = y.iloc[:split]
    exog_to_train = exog.iloc[:split]

    real_values = []
    predictions = []
    dates = []
    steps = []

    # Initial fit and first forecast window
    # NOTE(review): the exogenous array is given to the constructor rather
    # than to fit() - confirm that is how this ARIMA class expects it.
    arima_exog_model = ARIMA(order=(3, 0, 1),
                             seasonal_order=(2, 0, 0, 7),
                             exogenous=exog_to_train,
                             error_action='ignore',
                             initialization='approximate_diffuse',
                             suppress_warnings=True).fit(y=y_to_train)
    preds = arima_exog_model.predict_in_sample(exog_to_train)

    first_actual = y.iloc[split:split + horizon]
    first_exog = exog.iloc[split:split + horizon]
    first_forecast = arima_exog_model.predict(n_periods=horizon,
                                              exogenous=first_exog)
    real_values.append(first_actual.values)
    predictions.append(first_forecast.tolist())
    dates.append(first_actual.index)
    steps.append(first_actual.index[-1])

    # Four more rounds: update with the window just forecast, then forecast
    # the next one
    offset = 0
    for _ in range(4):
        update_start = split + offset
        test_start = update_start + horizon
        test_stop = update_start + horizon * 2

        y_to_update = y.iloc[update_start:test_start]
        exog_to_update = exog.iloc[update_start:test_start]
        to_test = y.iloc[test_start:test_stop]
        exog_to_test = exog.iloc[test_start:test_stop]

        # update the model
        arima_exog_model.update(y_to_update, exogenous=exog_to_update)
        step_forecast = arima_exog_model.predict(n_periods=horizon,
                                                 exogenous=exog_to_test)

        dates.append(to_test.index)
        steps.append(to_test.index[-1])
        predictions.append(step_forecast.tolist())
        real_values.append(to_test.values)
        offset += horizon

    # Flatten the per-step results
    predict = [value for chunk in predictions for value in chunk]
    true = [value for chunk in real_values for value in chunk]
    dates = [stamp for chunk in dates for stamp in chunk]

    # for viz purposes only the last 200 training days are shown
    preds = preds[-200:]
    y_to_train2 = y_to_train[-200:].to_frame()

    fig = go.Figure()
    # Create and style traces
    fig.add_trace(go.Scatter(x=y_to_train2.index,
                             y=y_to_train2.Consumption,
                             name='True values',
                             line=dict(color='firebrick', width=4,
                                       dash='dot')))
    fig.add_trace(go.Scatter(x=y_to_train2.index,
                             y=preds[-200:],
                             name='In-sample Prediction',
                             line=dict(color='royalblue', width=4)))
    fig.add_trace(go.Scatter(x=dates,
                             y=predict,
                             name='Prediction',
                             line=dict(color='green', width=4)))
    fig.add_trace(go.Scatter(x=dates,
                             y=true,
                             name='True',
                             line=dict(color='firebrick', width=4,
                                       dash='dot')))
    fig.update_layout(title='Electricity Consumption in New York',
                      xaxis_title='Date',
                      yaxis_title='Consumption',
                      xaxis_showgrid=True,
                      yaxis_showgrid=True,
                      paper_bgcolor=app_colors['background'],
                      plot_bgcolor=app_colors['background'])
    return fig