Example 1
def test_oob_sarimax():
    xreg = rs.rand(wineind.shape[0], 2)
    fit = ARIMA(order=(1, 1, 1),
                seasonal_order=(0, 1, 1, 12),
                out_of_sample_size=15).fit(y=wineind, exogenous=xreg)

    fit_no_oob = ARIMA(order=(1, 1, 1),
                       seasonal_order=(0, 1, 1, 12),
                       out_of_sample_size=0,
                       suppress_warnings=True).fit(y=wineind[:-15],
                                                   exogenous=xreg[:-15, :])

    # now assert some of the same things here that we did in the former test
    oob = fit.oob()

    # compare scores:
    scoring = get_callable(fit_no_oob.scoring, VALID_SCORING)
    no_oob_preds = fit_no_oob.predict(n_periods=15, exogenous=xreg[-15:, :])
    assert np.allclose(oob, scoring(wineind[-15:], no_oob_preds), rtol=1e-2)

    # show params are still the same
    assert np.allclose(fit.params(), fit_no_oob.params(), rtol=1e-2)

    # show we can add the new samples and get the exact same forecasts
    xreg_test = rs.rand(5, 2)
    fit_no_oob.add_new_observations(wineind[-15:], xreg[-15:, :])
    assert np.allclose(fit.predict(5, xreg_test),
                       fit_no_oob.predict(5, xreg_test),
                       rtol=1e-2)

    # Show we can get a confidence interval out here
    preds, conf = fit.predict(5, xreg_test, return_conf_int=True)
    assert all(isinstance(a, np.ndarray) for a in (preds, conf))
Example 2
def test_with_oob():
    # show we can fit with CV (kinda)
    arima = ARIMA(order=(2, 1, 2),
                  suppress_warnings=True,
                  scoring='mse',
                  out_of_sample_size=10).fit(y=hr)

    oob = arima.oob()
    assert not np.isnan(oob)  # show this works

    # Assert the predictions give the expected MAE/MSE
    oob_preds = arima.oob_preds_
    assert oob_preds.shape[0] == 10
    scoring = val.get_scoring_metric('mse')
    assert scoring(hr[-10:], oob_preds) == oob

    # show we can fit if out_of_sample_size < 0 and oob will be nan
    arima = ARIMA(order=(2, 1, 2),
                  suppress_warnings=True,
                  out_of_sample_size=-1).fit(y=hr)
    assert np.isnan(arima.oob())

    # This will raise since n_periods is not an int
    with pytest.raises(TypeError):
        arima.predict(n_periods="5")

    # But that we CAN forecast with an int...
    _ = arima.predict(n_periods=5)  # noqa: F841

    # Show we fail if cv > n_samples
    with pytest.raises(ValueError):
        ARIMA(order=(2, 1, 2), out_of_sample_size=1000).fit(hr)
Example 3
def test_basic_arima():
    arima = ARIMA(order=(0, 0, 0), suppress_warnings=True)
    preds = arima.fit_predict(y)  # fit/predict for coverage

    # test some of the attrs
    assert_almost_equal(arima.aic(), 11.201308403566909, decimal=5)
    assert_almost_equal(arima.aicc(), 11.74676, decimal=5)
    assert_almost_equal(arima.bic(), 13.639060053303311, decimal=5)

    # expected predictions
    expected_preds = np.array([
        0.44079876, 0.44079876, 0.44079876, 0.44079876, 0.44079876, 0.44079876,
        0.44079876, 0.44079876, 0.44079876, 0.44079876
    ])

    # compare the generated predictions with the expected values
    assert_array_almost_equal(preds, expected_preds)

    # Make sure we can get confidence intervals
    expected_intervals = np.array([[-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139],
                                   [-0.10692387, 0.98852139]])

    _, intervals = arima.predict(n_periods=10,
                                 return_conf_int=True,
                                 alpha=0.05)
    assert_array_almost_equal(intervals, expected_intervals)
Example 4
def test_more_elaborate():
    # show we can fit this with a non-zero order
    arima = ARIMA(order=(2, 1, 2), suppress_warnings=True).fit(y=hr)
    _try_get_attrs(arima)

    # can we fit this same arima with a made-up exogenous array?
    xreg = rs.rand(hr.shape[0], 4)
    arima = ARIMA(order=(2, 1, 2), suppress_warnings=True).fit(y=hr,
                                                               exogenous=xreg)
    _try_get_attrs(arima)

    # pickle this for the __get/setattr__ coverage.
    # since the only time this is tested is in parallel in auto.py,
    # this doesn't actually get any coverage proof...
    fl = 'some_temp_file.pkl'
    with open(fl, 'wb') as p:
        pickle.dump(arima, p)

    # show we can predict with this even though it's been pickled
    new_xreg = rs.rand(5, 4)
    _preds = arima.predict(n_periods=5, exogenous=new_xreg)

    # now unpickle
    with open(fl, 'rb') as p:
        other = pickle.load(p)

    # show we can still predict, compare
    _other_preds = other.predict(n_periods=5, exogenous=new_xreg)
    assert_array_almost_equal(_preds, _other_preds)

    # now remove the pickle file
    os.unlink(fl)

    # now show that since we fit the ARIMA with an exogenous array,
    # we need to provide one for predictions otherwise it breaks.
    with pytest.raises(ValueError):
        arima.predict(n_periods=5, exogenous=None)

    # show that if we DO provide an exogenous array with the wrong dims, it
    # also raises.
    with pytest.raises(ValueError):
        arima.predict(n_periods=5, exogenous=rs.rand(4, 4))
Example 5
def test_with_seasonality1():
    fit = ARIMA(order=(1, 1, 1),
                seasonal_order=(0, 1, 1, 12),
                suppress_warnings=True).fit(y=wineind)
    _try_get_attrs(fit)

    # R code AIC result is ~3004
    assert abs(fit.aic() - 3004) < 100  # show equal within 100 or so

    # R code AICc result is ~3005
    assert abs(fit.aicc() - 3005) < 100  # show equal within 100 or so

    # R code BIC result is ~3017
    assert abs(fit.bic() - 3017) < 100  # show equal within 100 or so

    # show we can predict in-sample
    fit.predict_in_sample()

    # test with SARIMAX confidence intervals
    fit.predict(n_periods=10, return_conf_int=True, alpha=0.05)
Example 6
    def test_oob_for_issue_29(self, d, cv, exog):
        model = ARIMA(order=(2, d, 0),
                      out_of_sample_size=cv).fit(self.dta, exogenous=exog)

        # If exogenous is defined, we need to pass n_periods of
        # exogenous rows to the predict function. Otherwise we'll
        # just leave it at None
        if exog is not None:
            xr = exog[:3, :]
        else:
            xr = None

        _, _ = model.predict(n_periods=3, return_conf_int=True, exogenous=xr)
Example 7
def test_oob_sarimax():
    xreg = rs.rand(wineind.shape[0], 2)
    fit = ARIMA(order=(1, 1, 1),
                seasonal_order=(0, 1, 1, 12),
                maxiter=5,
                out_of_sample_size=15).fit(y=wineind, exogenous=xreg)

    fit_no_oob = ARIMA(order=(1, 1, 1),
                       seasonal_order=(0, 1, 1, 12),
                       out_of_sample_size=0,
                       maxiter=5,
                       suppress_warnings=True).fit(y=wineind[:-15],
                                                   exogenous=xreg[:-15, :])

    # now assert some of the same things here that we did in the former test
    oob = fit.oob()

    # compare scores:
    scoring = val.get_scoring_metric(fit_no_oob.scoring)
    no_oob_preds = fit_no_oob.predict(n_periods=15, exogenous=xreg[-15:, :])
    assert np.allclose(oob, scoring(wineind[-15:], no_oob_preds), rtol=1e-2)

    # show params are no longer the same
    assert not np.allclose(fit.params(), fit_no_oob.params(), rtol=1e-2)

    # show we can add the new samples and get the exact same forecasts
    xreg_test = rs.rand(5, 2)
    fit_no_oob.update(wineind[-15:], xreg[-15:, :])
    assert np.allclose(fit.predict(5, xreg_test),
                       fit_no_oob.predict(5, xreg_test),
                       rtol=1e-2)

    # And also the params should be close now after updating
    assert np.allclose(fit.params(), fit_no_oob.params())

    # Show we can get a confidence interval out here
    preds, conf = fit.predict(5, xreg_test, return_conf_int=True)
    assert all(isinstance(a, np.ndarray) for a in (preds, conf))
Example 8
def test_oob_for_issue_29():
    dta = sm.datasets.sunspots.load_pandas().data
    dta.index = pd.Index(sm.tsa.datetools.dates_from_range('1700', '2008'))
    del dta["YEAR"]

    xreg = np.random.RandomState(1).rand(dta.shape[0], 3)

    # Try for cv on/off, various D levels, and various Xregs
    for d in (0, 1):
        for cv in (0, 3):
            for exog in (xreg, None):

                # surround with try/except so we can log the failing combo
                try:
                    model = ARIMA(order=(2, d, 0),
                                  out_of_sample_size=cv).fit(dta,
                                                             exogenous=exog)

                    # If exogenous is defined, we need to pass n_periods of
                    # exogenous rows to the predict function. Otherwise we'll
                    # just leave it at None
                    if exog is not None:
                        xr = exog[:3, :]
                    else:
                        xr = None

                    _, _ = model.predict(n_periods=3,
                                         return_conf_int=True,
                                         exogenous=xr)

                except Exception as ex:
                    print("Failing combo: d=%i, cv=%i, exog=%r" %
                          (d, cv, exog))

                    # Statsmodels can be fragile with ARMA coefficient
                    # computation. If we encounter that, pass:
                    #   ValueError: The computed initial MA coefficients are
                    #       not invertible. You should induce invertibility,
                    #       choose a different model order, or ...
                    if "invertibility" in str(ex):
                        pass
                    else:
                        raise
Example 9
def test_basic_arma():
    arma = ARIMA(order=(0, 0, 0), suppress_warnings=True)
    preds = arma.fit_predict(y)  # fit/predict for coverage

    # No OOB, so assert none
    assert arma.oob_preds_ is None

    # test some of the attrs
    assert_almost_equal(arma.aic(), 11.201, decimal=3)  # equivalent in R

    # intercept is param 0
    intercept = arma.params()[0]
    assert_almost_equal(intercept, 0.441, decimal=3)  # equivalent in R
    assert_almost_equal(arma.aicc(), 11.74676, decimal=5)
    assert_almost_equal(arma.bic(), 13.639060053303311, decimal=5)

    # expected predictions
    expected_preds = np.array([0.44079876, 0.44079876, 0.44079876,
                               0.44079876, 0.44079876, 0.44079876,
                               0.44079876, 0.44079876, 0.44079876,
                               0.44079876])

    # compare the generated predictions with the expected values
    assert_array_almost_equal(preds, expected_preds)

    # Make sure we can get confidence intervals
    expected_intervals = np.array([
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139],
        [-0.10692387, 0.98852139]
    ])

    _, intervals = arma.predict(n_periods=10, return_conf_int=True,
                                alpha=0.05)
    assert_array_almost_equal(intervals, expected_intervals)
Example 10
def test_with_oob():
    # show we can fit with CV (kinda)
    arima = ARIMA(order=(2, 1, 2),
                  suppress_warnings=True,
                  out_of_sample_size=10).fit(y=hr)
    assert not np.isnan(arima.oob())  # show this works

    # show we can fit if out_of_sample_size < 0 and oob will be nan
    arima = ARIMA(order=(2, 1, 2), suppress_warnings=True,
                  out_of_sample_size=-1).fit(y=hr)
    assert np.isnan(arima.oob())

    # This will raise since n_periods is not an int
    assert_raises(TypeError, arima.predict, n_periods="5")

    # But that we CAN forecast with an int...
    _ = arima.predict(n_periods=5)  # noqa: F841

    # Show we fail if cv > n_samples
    assert_raises(ValueError,
                  ARIMA(order=(2, 1, 2), out_of_sample_size=1000).fit, hr)
Example 11
def ragged_fill_series(
    series,
    function=np.nanmean,
    backup_fill_method=np.nanmean,
    est_series=None,
    fitted_arma=None,
    arma_full_series=None,
):
    """Fill in the ragged end of a series, adhering to the periodicity of
    the series. If there is only one observation and the periodicity cannot
    be determined, the series is returned unchanged.

    parameters:
        :series: list/pandas Series: the series whose ragged edges to fill.
            Missing values should be np.nan
        :function: the function to fill NAs with (e.g. np.nanmean). Use
            "ARMA" for ARMA filling
        :backup_fill_method: function: fallback used to fill ragged edges in
            case an ARMA model can't be estimated
        :est_series: list/pandas Series: optional, the series to calculate
            the fill value and/or ARMA model on, e.g. a train set. Should not
            have NAs filled in yet by any method. If None, calculated on the
            series itself.
        :fitted_arma: optional, an already-fitted ARMA model, to avoid
            re-estimating it on every call in the `gen_ragged_X` function
        :arma_full_series: optional, the for_full_arma_dataset output of the
            `gen_dataset` function; fits the ARMA model on the full series
            history rather than just the series provided

    output:
        :return: tuple of (pandas Series with filled ragged edges, fitted
            ARMA model or None)
    """
    result = pd.Series(series).copy()
    if est_series is None:
        est_series = result.copy()

    # periodicity of the series, to see which to fill in
    nonna_bools = ~pd.isna(series)
    nonna_indices = list(
        nonna_bools.index[nonna_bools])  # existing indices with values
    # if there is only one non-na observation, we can't determine the
    # periodicity or the position in the full series, so don't fill anything
    if len(nonna_indices) > 1:
        periodicity = int(
            (pd.Series(result[~pd.isna(result)].index) -
             (pd.Series(result[~pd.isna(result)].index)).shift()
             ).mode()[0])  # how often data comes (quarterly, monthly, etc.)
        last_nonna = result.index[result.notna()][-1]
        fill_indices = nonna_indices + [
            int(nonna_indices[-1] + periodicity * i)
            for i in range(1, (len(series) - last_nonna))
        ]  # indices to be filled in, including only the correct periodicity
        fill_indices = [
            x for x in fill_indices if x in series.index
        ]  # drop any indices that run past the end of the series

        if function == "ARMA":
            # estimate the model if not given
            if fitted_arma is None:
                fitted_arma = estimate_arma(est_series)
            # instantiate model with previously estimated parameters (i.e. on train set)
            arma = ARIMA(order=fitted_arma.order)
            arma.set_params(**fitted_arma.get_params())

            # refit the model on the full series to this point
            if arma_full_series is not None:
                y = list(arma_full_series[~pd.isna(arma_full_series)])
                present = list(result[~pd.isna(result)])
                # limit the series to the point where actuals are
                end_index = 0
                for i in range(len(present), len(y) + 1):
                    if list(y[(i - len(present)):i]) == list(present):
                        end_index = i
                y = y[:end_index]
            # refit model on just this series
            else:
                y = list(result[~pd.isna(result)])  # refit the model on data
                present = y.copy()
            # can fail if not enough datapoints for order of ARMA process
            try:
                arma.fit(y, error_action="ignore")
                preds = arma.predict(n_periods=int(len(series) - last_nonna))
                fills = list(present) + list(preds)
                fills = fills[:len(fill_indices)]
            except Exception:
                fills = list(result[~pd.isna(result)]) + [
                    backup_fill_method(est_series)
                ] * (len(series) - last_nonna)
                fills = fills[:len(fill_indices)]
            result[fill_indices] = fills
        else:
            fills = list(result[~pd.isna(result)]) + [function(est_series)] * (
                len(series) - last_nonna)
            fills = fills[:len(fill_indices)]
            result[fill_indices] = fills

    return result, fitted_arma
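
A minimal usage sketch for the function above. The series here is hypothetical, and the simple-fill path is used so that `estimate_arma` and the ARMA branch are not exercised:

import numpy as np
import pandas as pd

# hypothetical quarterly series: values every 3rd position, with a ragged (missing) end
series = pd.Series([1.0, np.nan, np.nan, 2.0, np.nan, np.nan, np.nan, np.nan, np.nan])

# fill the ragged end at the quarterly positions with the series mean
filled, arma = ragged_fill_series(series, function=np.nanmean)
print(filled)  # positions 0 and 3 keep their values; position 6 is filled with 1.5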
Example 12
def test_oob_for_issue_28():
    # Continuation of above: can we do one with an exogenous array, too?
    xreg = rs.rand(hr.shape[0], 4)
    arima = ARIMA(order=(2, 1, 2),
                  suppress_warnings=True,
                  out_of_sample_size=10).fit(y=hr, exogenous=xreg)

    oob = arima.oob()
    assert not np.isnan(oob)

    # Assert that the endog shapes match. First is equal to the original,
    # and the second is the differenced array, with original shape - d.
    assert np.allclose(arima.arima_res_.data.endog, hr, rtol=1e-2)
    assert arima.arima_res_.model.endog.shape[0] == hr.shape[0] - 1

    # Now assert the same for exog
    assert np.allclose(arima.arima_res_.data.exog, xreg, rtol=1e-2)
    assert arima.arima_res_.model.exog.shape[0] == xreg.shape[0] - 1

    # Compare the OOB score to an equivalent fit on data - 10 obs, but
    # without any OOB scoring, and we'll show that the OOB scoring in the
    # first IS in fact only applied to the first (train - n_out_of_bag)
    # samples
    arima_no_oob = ARIMA(order=(2, 1, 2),
                         suppress_warnings=True,
                         out_of_sample_size=0).fit(y=hr[:-10],
                                                   exogenous=xreg[:-10, :])

    scoring = get_callable(arima_no_oob.scoring, VALID_SCORING)
    preds = arima_no_oob.predict(n_periods=10, exogenous=xreg[-10:, :])
    assert np.allclose(oob, scoring(hr[-10:], preds), rtol=1e-2)

    # Show that the model parameters are exactly the same
    xreg_test = rs.rand(5, 4)
    assert np.allclose(arima.params(), arima_no_oob.params(), rtol=1e-2)

    # Now assert on the forecast differences.
    with_oob_forecasts = arima.predict(n_periods=5, exogenous=xreg_test)
    no_oob_forecasts = arima_no_oob.predict(n_periods=5, exogenous=xreg_test)

    assert_raises(AssertionError, assert_array_almost_equal,
                  with_oob_forecasts, no_oob_forecasts)

    # But after we update the no_oob model with the latest data, we should
    # be producing the same exact forecasts

    # First, show we'll fail if we try to add observations with no exogenous
    assert_raises(ValueError, arima_no_oob.add_new_observations, hr[-10:],
                  None)

    # Also show we'll fail if we try to add mis-matched shapes of data
    assert_raises(ValueError, arima_no_oob.add_new_observations, hr[-10:],
                  xreg_test)

    # Show we fail if we try to add observations with a different dim exog
    assert_raises(ValueError, arima_no_oob.add_new_observations, hr[-10:],
                  xreg_test[:, :2])

    # Actually add them now, and compare the forecasts (should be the same)
    arima_no_oob.add_new_observations(hr[-10:], xreg[-10:, :])
    assert np.allclose(with_oob_forecasts,
                       arima_no_oob.predict(n_periods=5, exogenous=xreg_test),
                       rtol=1e-2)
Example 13
class ARIMAModel(BaseModel):
    def __init__(self):
        """
        Initialize Model
        """
        self.seasonal = True
        self.metric = 'mse'
        self.model = None
        self.model_init = False

    def _build(self, **config):
        """
        build the models and initialize.
        :param config: hyperparameters for the model
        """
        p = config.get('p', 2)
        d = config.get('d', 0)
        q = config.get('q', 2)
        self.seasonal = config.get('seasonality_mode', True)
        P = config.get('P', 1)
        D = config.get('D', 0)
        Q = config.get('Q', 1)
        m = config.get('m', 7)
        self.metric = config.get('metric', self.metric)

        order = (p, d, q)
        if not self.seasonal:
            seasonal_order = (0, 0, 0, 0)
        else:
            seasonal_order = (P, D, Q, m)

        self.model = ARIMA(order=order,
                           seasonal_order=seasonal_order,
                           suppress_warnings=True)

    def fit_eval(self, data, validation_data, **config):
        """
        Fit on the training data from scratch.
        :param data: A 1-D numpy array as the training data
        :param validation_data: A 1-D numpy array as the evaluation data
        :return: the evaluation metric value
        """

        if not self.model_init:
            # Estimating differencing term (d) and seasonal differencing term (D)
            kpss_diffs = ndiffs(data, alpha=0.05, test='kpss', max_d=6)
            adf_diffs = ndiffs(data, alpha=0.05, test='adf', max_d=6)
            d = max(adf_diffs, kpss_diffs)
            D = 0 if not self.seasonal else nsdiffs(data, m=7, max_D=12)
            config.update(d=d, D=D)

            self._build(**config)
            self.model_init = True

        self.model.fit(data)
        val_metric = self.evaluate(x=None,
                                   target=validation_data,
                                   metrics=[self.metric])[0].item()
        return {self.metric: val_metric}

    def predict(self, x=None, horizon=24, update=False, rolling=False):
        """
        Predict horizon time-points ahead of the training data used in fit_eval.
        :param x: ARIMA forecasts the horizon steps forward from the training data,
            so x should be None as it is not used.
        :param horizon: the number of steps forward to predict
        :param update: whether to update the original model
        :param rolling: whether to use rolling prediction
        :return: predicted result of length horizon
        """
        if x is not None:
            raise ValueError("x should be None")
        if update and not rolling:
            raise Exception(
                "We don't support updating model without rolling prediction currently"
            )
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling predict"
            )

        if not update and not rolling:
            forecasts = self.model.predict(n_periods=horizon)
        elif rolling:
            if not update:
                self.save("tmp.pkl")

            forecasts = []
            for step in range(horizon):
                fc = self.model.predict(n_periods=1).item()
                forecasts.append(fc)

                # Updates the existing model with a small number of MLE steps for rolling prediction
                self.model.update(fc)

            if not update:
                self.restore("tmp.pkl")
                os.remove("tmp.pkl")

        return forecasts

    def evaluate(self, target, x=None, metrics=['mse'], rolling=False):
        """
        Evaluate on the prediction results and the target. We forecast horizon
        time-points ahead of the training data from fit_eval before evaluating,
        where the horizon length equals the length of target.
        :param target: target for evaluation.
        :param x: ARIMA forecasts the horizon steps forward from the training data,
            so x should be None as it is not used.
        :param metrics: a list of metrics in string format
        :param rolling: whether to use rolling prediction
        :return: a list of metric evaluation results
        """
        if x is not None:
            raise ValueError("We don't support input x currently")
        if target is None:
            raise ValueError("Input invalid target of None")
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling evaluate"
            )

        forecasts = self.predict(horizon=len(target), rolling=rolling)

        return [Evaluator.evaluate(m, target, forecasts) for m in metrics]

    def save(self, checkpoint_file):
        if self.model is None:
            raise Exception(
                "Needs to call fit_eval or restore first before calling save")
        with open(checkpoint_file, 'wb') as fout:
            pickle.dump(self.model, fout)

    def restore(self, checkpoint_file):
        with open(checkpoint_file, 'rb') as fin:
            self.model = pickle.load(fin)
        self.model_init = True
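
A minimal usage sketch for the class above, assuming the module's own imports (`ndiffs`, `nsdiffs`, `Evaluator`, pmdarima's `ARIMA`) are available; the synthetic data and config values are illustrative only:

import numpy as np

# synthetic 1-D series with weekly seasonality plus noise
rng = np.random.RandomState(42)
data = 10 + np.sin(np.arange(200) * 2 * np.pi / 7) + rng.rand(200)
train, valid = data[:180], data[180:]

model = ARIMAModel()
scores = model.fit_eval(train, valid, p=2, q=2, m=7, metric='mse')  # builds and fits on first call
preds = model.predict(horizon=len(valid))   # forecast len(valid) steps past the training data
model.save("arima_ckpt.pkl")                # persist the fitted pmdarima model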
Example 14
class Model:
    def select_data(self):
        # Merge columns into a single dataframe of observed values, based on date
        dataset = files.data_main.join(files.data_exo.set_index('date'),
                                       on='date').dropna()
        # Select part of the precipitation dataframe that corresponds to the forecast
        obs_end = dataset.tail(1)['date'].values[0]
        exo_prev = files.data_exo[(files.data_exo['date'] > obs_end)]
        # Select predict dates
        self.dates_prev = exo_prev['date']
        # Reshape
        endo_obs = np.array(dataset['endo_value'])
        self.endo_obs = endo_obs.reshape(-1, 1)
        exo_obs = np.array(dataset['exo_value'])
        self.exo_obs = exo_obs.reshape(-1, 1)
        exo_prev = np.array(exo_prev['exo_value'])
        self.exo_prev = exo_prev.reshape(-1, 1)

    def normalize(self):
        # Estimate the Box-Cox lambda only if the series has no zero/negative values
        n_zeros = len(self.endo_obs[self.endo_obs <= 0])
        if n_zeros == 0:
            self.endo_obs2, self.lambda_boxcox = boxcox(self.endo_obs.ravel())
        else:
            self.endo_obs2 = self.endo_obs
            self.lambda_boxcox = -999
        # Limit lambda values: fall back to the untransformed series if lambda is extreme
        if abs(self.lambda_boxcox) > 1:
            self.endo_obs2 = self.endo_obs
            self.lambda_boxcox = -999
        #print(self.endo_obs2, self.lambda_boxcox)

    def run_auto(self):
        self.arima_model = auto_arima(self.endo_obs2,
                                      start_p=0,
                                      start_d=0,
                                      start_q=0,
                                      max_p=3,
                                      max_d=1,
                                      max_q=3,
                                      start_P=0,
                                      start_Q=0,
                                      D=1,
                                      seasonal=False,
                                      m=1,
                                      exogenous=self.exo_obs,
                                      trace=True,
                                      error_action='ignore',
                                      suppress_warnings=True,
                                      stepwise=True)
        #print(model.arima_model.summary())
        # Compile parameters to list
        self.parameters = [
            self.arima_model.order, self.arima_model.seasonal_order,
            self.lambda_boxcox,
            self.arima_model.aic()
        ]
        print(self.parameters)
        return (self.arima_model)

    def run_auto_arimax(self):
        lower_aic = float("inf")
        best_pdq = [0, 0, 0]
        param = list(itertools.product(range(0, 4), range(0, 2), range(0, 4)))
        for pdq in param:
            #print(pdq)
            try:
                self.arima_model = ARIMA(order=pdq,
                                         suppress_warnings=True).fit(
                                             y=self.endo_obs2,
                                             exogenous=self.exo_obs)
                if self.arima_model.aic() < lower_aic:
                    lower_aic = self.arima_model.aic()
                    best_pdq = tuple(self.arima_model.order)
            except Exception:
                continue
        #print(model.arima_model.summary())
        # Compile parameters to list
        self.parameters = [best_pdq, self.lambda_boxcox, lower_aic]
        print(self.parameters)
        return (self.arima_model)

    def run_auto_sarimax(self):
        lower_aic = float("inf")
        best_pdq = [0, 0, 0]
        best_spdq = [0, 0, 0]
        param = list(itertools.product(range(0, 4), range(0, 2), range(0, 4)))
        m = 1  # frequency
        param_seasonal = [(x[0], x[1], x[2], m) for x in list(
            itertools.product(range(0, 4), range(0, 2), range(0, 4)))]
        for pdq in param:
            for spdq in param_seasonal:
                try:
                    mod = sm.tsa.statespace.SARIMAX(
                        self.endo_obs2,
                        exog=self.exo_obs,
                        order=pdq,
                        seasonal_order=spdq,
                        enforce_stationarity=False,
                        enforce_invertibility=False)
                    self.arima_model = mod.fit(disp=0)
                    print('ARIMA{}x{}{} - AIC:{}'.format(
                        pdq, spdq, m, self.arima_model.aic))
                    if self.arima_model.aic < lower_aic:
                        lower_aic = self.arima_model.aic
                        best_pdq = tuple(pdq)
                        best_spdq = tuple(spdq)
                except Exception:
                    continue
        #print(model.arima_model.summary())
        # Compile parameters to list
        self.parameters = [
            best_pdq, best_spdq, self.lambda_boxcox, lower_aic
        ]
        print(self.parameters)
        return (self.arima_model)

    def forecast(self):
        #self.predict = self.arima_model.predict(n_periods=self.exo_prev.shape[0], exogenous=self.exo_prev)
        self.predict = self.arima_model.predict(
            n_periods=self.exo_prev.shape[0],
            exogenous=self.exo_prev,
            return_conf_int=True,
            alpha=0.7)

    def renormalize(self):
        if self.lambda_boxcox == float(-999):
            self.predict_mean = self.predict[0]
            self.predict_down = self.predict[1][:, 0]
            self.predict_up = self.predict[1][:, 1]
        else:
            self.predict_mean = inv_boxcox(self.predict[0], self.lambda_boxcox)
            self.predict_down = inv_boxcox(self.predict[1][:, 0],
                                           self.lambda_boxcox)
            self.predict_up = inv_boxcox(self.predict[1][:, 1],
                                         self.lambda_boxcox)
        # Join predict dates with values into a dataframe
        df_final = pd.DataFrame(self.predict_mean, self.dates_prev)
        df_final.columns = ['endo_value']
        return (df_final)
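
The class above is driven by an external `files` module holding `data_main` and `data_exo` dataframes; a hypothetical end-to-end call sequence (assuming those frames exist) might look like:

model = Model()
model.select_data()      # merge endogenous/exogenous observations by date
model.normalize()        # Box-Cox transform when the series is strictly positive
model.run_auto()         # or model.run_auto_arimax() / model.run_auto_sarimax()
model.forecast()         # predict over the exogenous forecast horizon
df_forecast = model.renormalize()  # invert Box-Cox and attach the forecast dates
print(df_forecast.head())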
Example 15
# Plot Residuals and fitted values
# plt.figure()
# fitted_values = arima.predict_in_sample()
# plt.plot(df.index[:train_len - 1], fitted_values,
#          color='C0', label="Fitted values")
# plt.plot(pd.to_datetime(df.index), data, color='C1', label="Data")
# plt.plot(df.index[:train_len - 1], arima.resid(),
#          color='C2', label="Residuals")
# plt.gca().grid(which='both', axis='x', linestyle='--')
# plt.title("Residuals and fitted values")
# plt.legend()

print("SSE: {}".format((arima.resid()**2).sum()))

# Plot fitted values and forecasts
predictions = arima.predict(n_periods=test.shape[0])
fitted_values = arima.predict_in_sample()
plt.figure()
plt.plot(df.index[train_len:], test, '--', color='C0', label="test set")
plt.plot(df.index[train_len:],
         predictions,
         '--',
         color='C1',
         label="forecasted values")
plt.plot(df.index[:train_len], train, color='C0', label="train set")
plt.plot(df.index[:train_len - 1],
         fitted_values,
         color='C1',
         label="fitted values")
plt.legend()
plt.title("Fitted values and forecasts")
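
The plotting snippet above assumes `df`, `data`, `train`, `test`, `train_len`, and a fitted `arima` already exist; a minimal setup sketch (hypothetical CSV path and split ratio) could be:

import pandas as pd
import pmdarima as pm

df = pd.read_csv("series.csv", index_col=0, parse_dates=True)  # hypothetical input file
data = df.iloc[:, 0].values
train_len = int(len(data) * 0.8)
train, test = data[:train_len], data[train_len:]

arima = pm.ARIMA(order=(1, 1, 1), suppress_warnings=True).fit(train)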
Example 16
def predict_arima(df):

    time_in = current_milli_time()
    try:
        # load the forecast produced on the previous iteration, if any
        with open("forecast.pickle", "rb") as forecast_in:
            future_forecast = pickle.load(forecast_in)
        error = []
        """
        Calculate errors
        """
        if len(df) < len(future_forecast):
            error = df["memory_used"] - future_forecast[:len(df)]["prediction"]
        elif len(df) > len(future_forecast):
            error = df[0:len(future_forecast)]["memory_used"] - future_forecast["prediction"]
        else:
            error = df["memory_used"] - future_forecast["prediction"]
        overestimation = [x for x in error if x < 0]
        overestimation = sum(overestimation) / len(overestimation) if overestimation else 0.0
        underestimation = [x for x in error if x >= 0]
        underestimation = sum(underestimation) / len(underestimation) if underestimation else 0.0
        print("UNDERESTIMATION ERROR: " + str(underestimation))
        print("OVERESTIMATION ERROR: " + str(overestimation))
        print("Mean Absolute Error in last iteration: " + str(error.abs().mean()))
        # overestimation & underestimation errors reported above

    except Exception as e:
        print("RMSE To be computed")
        # Do Nothing
  
    try:
        pm.plot_pacf(df,show=False).savefig('pacf.png')
        pm.plot_acf(df,show=False).savefig('acf.png')
    except:
        print("Data points insufficient for ACF & PACF")


    try:
        with open("arima.pickle", "rb") as pickle_in:
            arima_data = pickle.load(pickle_in)
        arima_data.append(df)
        # df = arima_data
    except Exception:
        with open("arima.pickle", "wb") as arima_data_out:
            pickle.dump([], arima_data_out)
    with open("arima.pickle", "wb") as arima_data_out:
        pickle.dump(df, arima_data_out)
    
    '''
    tests 
    '''
    nd = 1
    nsd = 1
    try:
        adf_test = ADFTest(alpha=0.05)
        p_val, should_diff = adf_test.is_stationary(df["memory_used"])

        nd = ndiffs(df, test='adf')
        logging.info(nd)
        nsd = nsdiffs(df, 12)
        logging.info(nsd)
    except Exception:
        nd = 1
        print("Exception on stationarity/differencing tests")

    ch_test=CHTest(12)
    
    try:
        nsd=ch_test.estimate_seasonal_differencing_term(df)
    except Exception as e:
        print(e)
        logging.error(e)
    

    '''
        ARIMA MODEL
    '''

    '''
        Find p, q dynamically from the number of significant ACF/PACF lags
    '''
    acf_lags = acf(df["memory_used"])
    acf_lags_threshold = [x for x in acf_lags if x >= getThreshold()]
    p = len(acf_lags_threshold) if len(acf_lags_threshold) <= 4 else 4  # cap p at 4

    pacf_lags = pacf(df["memory_used"])
    pacf_lags_threshold = [x for x in pacf_lags if x >= getThreshold()]
    q = len(pacf_lags_threshold) if len(pacf_lags_threshold) <= 1 else 1  # cap q at 1
    d = nd

    train, test = train_test_split(df, shuffle=False, test_size=0.3)

    # If the data is seasonal, set the values of P, D, Q in seasonal_order
    stepwise_model = ARIMA(
        order=(p, d, q),
        seasonal_order=(0, nsd, 0, 12),
        suppress_warnings=True,
        scoring='mse'
    )
    x = str(p) + " " + str(d) + " " + str(q)
    print("Model with p=" + str(p) + " d=" + str(d) + " q=" + str(q))

    try:

        stepwise_model.fit(df)
        """ 
          Vary the periods as per the forecasting window 
          n_periods= 30 = 5mins
          n_periods= 60 = 10mins
          n_periods= 90 = 15mins
        """
        future_forecast = stepwise_model.predict(n_periods=len(test))
        future_forecast = pd.DataFrame(future_forecast,index=test.index,columns=["prediction"])

        res=pd.concat([df,future_forecast],axis=1)

        '''
            Save Forecast in Pickle 
        '''
        forecast_out = open("forecast.pickle","wb")
        pickle.dump(future_forecast,forecast_out)
        forecast_out.close()
        
        trace1 = go.Scatter(x=res.index, y=res["prediction"],name="Prediction", mode='lines')
        trace2 = go.Scatter(x=df.index, y=df["memory_used"],name="DF data", mode='lines')
        data=[trace1,trace2]
        layout = go.Layout(
            title=x
        )
        fig = go.Figure(data=data, layout=layout)
        plot(fig, filename="prediction")
        print("Current values")
        print(df)
        print("Predicted Data Points")
        print(future_forecast)
        time_out = current_milli_time()
        print("TIME for ARIMA (ms): " + str(time_out - time_in))
        return future_forecast
    except Exception as e:
        time_out = current_milli_time()
        print("TIME for ARIMA (ms): " + str(time_out - time_in))
        print(e)
        return None
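
The function above depends on helpers that are not shown (`current_milli_time`, `getThreshold`); a minimal sketch of what they might look like, with an assumed significance cutoff:

import time

def current_milli_time():
    # wall-clock time in milliseconds, used to time the forecasting step
    return int(round(time.time() * 1000))

def getThreshold():
    # significance cutoff applied to ACF/PACF values when choosing p and q;
    # 0.5 is a placeholder, the original value is not shown in the snippet
    return 0.5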
Example 17
def model_plot(days):
    days = int(days)
    pd.plotting.register_matplotlib_converters()

    df = pd.read_csv('data/new_york.csv')
    df['Date'] = pd.to_datetime(df['Date'])

    #converting data to daily usage.
    df.index = df.Date
    df = df.drop('Date', axis=1)
    # resample the dataframe every 1 day (D) and sum over each day
    df = df.resample('D').sum()
    df = df.tz_localize(None)

    nyc_weather = pd.read_csv('data/weather/weatherNY.csv')
    nyc_weather['DATE'] = pd.to_datetime(nyc_weather['DATE'])
    nyc_weather = nyc_weather.set_index('DATE')
    nyc_weather.drop(['NAME','STATION'],axis=1,inplace=True)
    nyc_weather = nyc_weather['2015-07-01':'2020-08-10']

    df = df[:'2020-08-10']

    # 1-day increments with exogenous features; Fourier terms use periods of 638 (June to June) and 516 (August to August)
    day = days
    real_values = []
    predictions = []

    df1 = df["2016":"2019"]
    nyc_weather = nyc_weather["2016":"2019"]

    y = df1.Consumption

    exog = pd.DataFrame({'date': y.index})
    exog = exog.set_index(pd.PeriodIndex(exog['date'], freq='D'))
    exog['is_weekend'] = np.where(exog.index.dayofweek < 5,0,1)

    #add weather data
    exog['TMIN'] = nyc_weather['TMIN'].values
    exog['sin1'] = np.sin(2 * np.pi * exog.index.dayofyear / 638)
    exog['cos1'] = np.cos(2 * np.pi * exog.index.dayofyear / 638)
    exog['sin2'] = np.sin(4 * np.pi * exog.index.dayofyear /638)
    exog['cos2'] = np.cos(4 * np.pi * exog.index.dayofyear /638)
    exog['sin3'] = np.sin(2 * np.pi * exog.index.dayofyear / 516)
    exog['cos3'] = np.cos(2 * np.pi * exog.index.dayofyear / 516)
    exog['sin4'] = np.sin(4 * np.pi * exog.index.dayofyear /516)
    exog['cos4'] = np.cos(4 * np.pi * exog.index.dayofyear /516)



    exog = exog.drop(columns=['date'])

    num_to_update = 0
    y_to_train = y.iloc[:(len(y)-100)]    
    exog_to_train = exog.iloc[:(len(y)-100)]

    dates = []

    steps = []

    for i in range(5):

        #first iteration train the model
        if i == 0:
            arima_exog_model = ARIMA(order=(3, 0, 1), seasonal_order=(2, 0, 0, 7),exogenous=exog_to_train, error_action='ignore',
                                    initialization='approximate_diffuse', suppress_warnings=True).fit(y=y_to_train)  

            preds = arima_exog_model.predict_in_sample(exog_to_train)            
            #first prediction
            y_to_test = y.iloc[(len(y)-100):(len(y)-100+day)]
            y_exog_to_test = exog.iloc[(len(y)-100):(len(y)-100+day)]
            y_arima_exog_forecast = arima_exog_model.predict(n_periods=day, exogenous=y_exog_to_test)
            
            real_values.append(y_to_test.values)
            predictions.append(y_arima_exog_forecast.tolist())
            
            dates.append(y_to_test.index)
            steps.append(y_to_test.index[-1])
                                                    
            #y_arima_exog_forecast = arima_exog_model.predict(n_periods=2, exogenous=exog_to_test)
        else:
            y_to_update = y.iloc[(len(y)-100+num_to_update):(len(y)-100+num_to_update)+day]
            exog_to_update = exog.iloc[(len(y)-100+num_to_update):(len(y)-100+num_to_update)+day]

            #to test
            to_test = y.iloc[(len(y)-100+num_to_update)+day:(len(y)-100+num_to_update)+(day*2)]
            exog_to_test = exog.iloc[(len(y)-100+num_to_update)+day:(len(y)-100+num_to_update)+(day*2)]
            #update the model

            arima_exog_model.update(y_to_update,exogenous=exog_to_update)
            y_arima_exog_forecast = arima_exog_model.predict(n_periods=day, exogenous=exog_to_test)

            dates.append(to_test.index)
            steps.append(to_test.index[-1])

            predictions.append(y_arima_exog_forecast.tolist())    
            real_values.append(to_test.values)
            
            num_to_update += day


    predict = [item for sublist in predictions for item in sublist]
    true = [item for sublist in real_values for item in sublist]
    dates = [item for sublist in dates for item in sublist]

    #for viz purposes
    y_to_train2 = y_to_train[-200:]
    preds = preds[-200:]
    y_to_train2 = y_to_train2.to_frame()
    fig = go.Figure()
    # Create and style traces
    fig.add_trace(go.Scatter(x=y_to_train2.index, y=y_to_train2.Consumption, name='True values',
                            line=dict(color='firebrick', width=4,dash='dot')))

    fig.add_trace(go.Scatter(x=y_to_train2.index, y=preds[-200:], name='In-sample Prediction',
                            line=dict(color='royalblue', width=4)))

    fig.add_trace(go.Scatter(x=dates, y=predict, name='Prediction',
                            line=dict(color='green', width=4)))

    fig.add_trace(go.Scatter(x=dates, y=true, name='True',
                            line=dict(color='firebrick', width=4,dash='dot')))

    fig.update_layout(title='Electricity Consumption in New York',
                    xaxis_title='Date',
                    yaxis_title='Consumption',
                    xaxis_showgrid=True,
                    yaxis_showgrid=True,
                    #autosize=False,
                    #width=500,
                    #height=500,
                    paper_bgcolor=app_colors['background'], 
                    plot_bgcolor=app_colors['background'])


    return fig
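
`model_plot` also references an `app_colors` dict that is not defined in the snippet; a hypothetical definition consistent with how it is used:

# hypothetical color map assumed by the fig.update_layout call above
app_colors = {'background': '#111111'}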