def test_AutoARIMA_class():
    """Fit ``AutoARIMA`` on a prefix of ``wineind`` and then update it.

    After ``fit`` the endogenous data held by the underlying statsmodels
    result must equal the training slice; after ``update`` with the
    remaining observations it must equal the full series.
    """
    split_at = 125
    head, tail = wineind[:split_at], wineind[split_at:]

    estimator = AutoARIMA(maxiter=5)
    estimator.fit(head)
    assert_array_almost_equal(
        head, estimator.model_.arima_res_.data.endog)

    # update
    estimator.update(tail, maxiter=2)
    assert_array_almost_equal(
        wineind, estimator.model_.arima_res_.data.endog)
def test_pipeline_behavior():
    """Exercise length, fit, predict and update on a three-stage pipeline.

    The pipeline chains a Fourier featurizer, a Box-Cox endog transformer
    and a small non-seasonal ``AutoARIMA``. It relies on the module-level
    ``train``, ``test`` and ``wineind`` arrays.
    """
    stages = [
        ("fourier", FourierFeaturizer(m=12)),
        ("boxcox", BoxCoxEndogTransformer()),
        ("arima", AutoARIMA(seasonal=False, stepwise=True,
                            suppress_warnings=True, d=1, max_p=2, max_q=0,
                            start_q=0, start_p=1, maxiter=3,
                            error_action='ignore')),
    ]
    pipeline = Pipeline(stages)

    # Quick assertions on indexing
    assert len(pipeline) == 3

    pipeline.fit(train)
    forecast = pipeline.predict(5)
    assert forecast.shape[0] == 5
    assert pipeline._final_estimator.model_.fit_with_exog_

    # Manually passing a conflicting n_periods to the fourier stage must be
    # rejected with a ValueError that names the offending kwarg
    with pytest.raises(ValueError) as exc_info:
        pipeline.predict(3, fourier__n_periods=10)
    assert "'n_periods'" in pytest_error_str(exc_info)

    # Assert that we can update the model
    pipeline.update(test, maxiter=5)

    # And that the fourier transformer was updated properly...
    fourier_stage = pipeline.steps_[0][1]
    assert fourier_stage.n_ == wineind.shape[0]
class AutoArima:
    """Forecast every series of a batched window independently with AutoARIMA.

    Parameters
    ----------
    args : object
        Configuration object; ``args.seq_len_x`` and ``args.out_seq_len``
        are read and stored, and ``args`` itself is kept on the instance.
    """

    def __init__(self, args):
        self.model = AutoARIMA()
        self.seq_len_x = args.seq_len_x
        self.out_seq_len = args.out_seq_len
        self.args = args

    def predict(self, x):
        """Fit-and-forecast each univariate series contained in ``x``.

        Parameters
        ----------
        x : array-like, shape (batch, in_seq_len, n)
            Input windows; axis 1 is time, axis 2 indexes the series.

        Returns
        -------
        numpy.ndarray, shape (batch, out_seq_len, n)
            ``out[b, :, s]`` is the forecast produced from ``x[b, :, s]``.
        """
        x = np.asarray(x)
        b, seq_x, n = x.shape

        # Time must be the LAST axis before flattening; reshaping the
        # (b, seq, n) array directly would interleave time steps and series
        # in every row whenever n > 1.
        series = np.transpose(x, (0, 2, 1)).reshape(-1, seq_x)  # (b * n, seq)

        forecasts = [
            self.model.fit_predict(row, n_periods=self.out_seq_len)
            for row in series
        ]
        if not forecasts:
            # Nothing to forecast (empty batch or no series).
            return np.empty((b, self.out_seq_len, n))

        # Rows are ordered (batch, series); restore that layout first and
        # only then move the forecast horizon in front of the series axis.
        xhat = np.stack(forecasts, axis=0)  # (b * n, out_len)
        xhat = xhat.reshape(b, n, self.out_seq_len)
        return np.transpose(xhat, (0, 2, 1))  # (b, out_len, n)
def test_arima_setup(params, X):
    """Verify that estimator parameters reach the wrapped ``AutoARIMA``.

    Builds the estimator once through its constructor and once through
    ``set_params`` (both must yield the same ``__dict__``), fits it, and then
    compares every attribute of the fitted inner model with those of an
    ``AutoARIMA`` constructed directly from ``params``.
    """
    coverage = 0.99
    init_kwargs = dict(
        score_func=mean_squared_error,
        coverage=coverage,
        null_model_params=None,
        **params)
    model = AutoArimaEstimator(**init_kwargs)

    # set_params must be able to replicate the init
    model2 = AutoArimaEstimator()
    model2.set_params(**init_kwargs)
    assert model2.__dict__ == model.__dict__

    model.fit(X)
    wrapped = model.model.__dict__
    reference = AutoARIMA(**params).__dict__

    # Checked in this order, one attribute at a time
    compared_attributes = (
        "start_p", "d", "start_q",
        "max_p", "max_d", "max_q",
        "start_P", "D", "start_Q",
        "max_P", "max_D", "max_Q",
        "max_order", "m", "seasonal", "stationary",
        "information_criterion", "alpha", "test", "seasonal_test",
        "stepwise", "n_jobs", "start_params", "trend", "method", "maxiter",
        "offset_test_args", "seasonal_test_args",
        "suppress_warnings", "error_action", "trace",
        "random", "random_state", "n_fits", "out_of_sample_size",
        "scoring", "scoring_args", "with_intercept", "kwargs",
    )
    for attribute in compared_attributes:
        assert wrapped[attribute] == reference[attribute]
] ) def test_bad_last_stage(self, stages): # Will fail since the last stage is not an estimator with pytest.raises(TypeError) as ve: Pipeline(stages) assert "Last step of Pipeline should be" in pytest_error_str(ve) @pytest.mark.parametrize( 'pipe,kwargs,expected', [ pytest.param( Pipeline([ ("boxcox", BoxCoxEndogTransformer()), ("arima", AutoARIMA()) ]), {}, {"boxcox": {}, "arima": {}} ), pytest.param( Pipeline([ ("boxcox", BoxCoxEndogTransformer()), ("arima", AutoARIMA()) ]), {"boxcox__lmdba1": 0.001}, {"boxcox": {"lmdba1": 0.001}, "arima": {}} ), ] )
# Two transformers [("stage1", BoxCoxEndogTransformer()), ("stage2", FourierFeaturizer(m=12))] ]) def test_bad_last_stage(self, stages): # Will fail since the last stage is not an estimator with pytest.raises(TypeError) as ve: Pipeline(stages) assert "Last step of Pipeline should be" in pytest_error_str(ve) @pytest.mark.parametrize('pipe,kwargs,expected', [ pytest.param( Pipeline([("boxcox", BoxCoxEndogTransformer()), ("arima", AutoARIMA())]), {}, { "boxcox": {}, "arima": {} }), pytest.param( Pipeline([("boxcox", BoxCoxEndogTransformer()), ("arima", AutoARIMA())]), {"boxcox__lmdba1": 0.001}, { "boxcox": { "lmdba1": 0.001 }, "arima": {} }), ]) def test_get_kwargs(pipe, kwargs, expected): # Test we get the kwargs we expect kw = pipe._get_kwargs(**kwargs)
# -*- coding: utf-8 -*-

from sklearn.base import clone
from pmdarima.arima import ARIMA, AutoARIMA
from pmdarima.pipeline import Pipeline
from pmdarima.datasets import load_wineind
from pmdarima.preprocessing import FourierFeaturizer

import pytest

# Series every parametrized estimator is fitted on
y = load_wineind()


@pytest.mark.parametrize(
    'est', [
        ARIMA(order=(2, 1, 1), seasonal_order=(0, 0, 0, 1)),
        AutoARIMA(seasonal=False, maxiter=3),
        Pipeline([
            ("fourier", FourierFeaturizer(m=12)),
            ("arima", AutoARIMA(seasonal=False, stepwise=True,
                                suppress_warnings=True, d=1, max_p=2,
                                max_q=0, start_q=0, start_p=1, maxiter=3,
                                error_action='ignore'))
        ])
    ]
)
def test_clonable(est):
    """A fitted estimator can be passed to ``sklearn.base.clone``.

    The clone must be a distinct object of the same class as the original.
    """
    # fit it, then clone it
    est.fit(y)
    duplicate = clone(est)

    assert duplicate is not est
    assert isinstance(duplicate, type(est))
def __init__(self, args):
    """Create the underlying ``AutoARIMA`` and keep the configuration.

    ``args`` must expose ``seq_len_x`` and ``out_seq_len``; both are copied
    onto the instance and ``args`` itself is retained as ``self.args``.
    """
    self.model = AutoARIMA()
    self.seq_len_x, self.out_seq_len = args.seq_len_x, args.out_seq_len
    self.args = args
class AutoArimaEstimator(BaseForecastEstimator):
    """Wrapper for ``pmdarima.arima.AutoARIMA``.

    It currently does not handle the regressor issue when there is
    a gap between train and predict periods.

    Parameters
    ----------
    score_func : callable
        see ``BaseForecastEstimator``.
    coverage : float between [0.0, 1.0]
        see ``BaseForecastEstimator``. ``predict`` also passes
        ``1 - coverage`` as ``alpha`` to ``predict_in_sample``.
    null_model_params : dict with arguments to define DummyRegressor null model, optional, default=None
        see ``BaseForecastEstimator``.
    regressor_cols: `list` [`str`], optional, default None
        A list of regressor columns used during training and prediction.
        If None, no regressor columns are used.
    freq : `str`, optional, default None
        Unit handed to ``np.timedelta64`` in ``predict`` to convert
        timestamp differences into integer positions. If None, ``predict``
        infers it from the training timestamps with ``pd.infer_freq`` and
        stores the result back on this attribute.
    return_conf_int : `bool`, optional, default True
        Forwarded to ``predict_in_sample`` by ``predict``.
    dynamic : `bool`, optional, default False
        Forwarded to ``predict_in_sample`` by ``predict``.

    See ``AutoArima`` documentation for rest of the parameter descriptions:

        * https://alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.AutoARIMA.html#pmdarima.arima.AutoARIMA

    Attributes
    ----------
    model : ``AutoArima`` object
        Auto arima model object. None until ``fit`` is called.
    fit_df : `pandas.DataFrame` or None
        The training data used to fit the model (sorted by the time column).
    forecast : `pandas.DataFrame`
        The data frame returned by the most recent non-cached ``predict``
        call. None until then.
    """
    def __init__(
            self,
            # Null model parameters
            score_func: callable = mean_squared_error,
            coverage: float = 0.90,
            null_model_params: Optional[Dict] = None,
            # Additional parameters
            regressor_cols: Optional[List[str]] = None,
            freq: Optional[str] = None,
            # pmdarima fit parameters
            start_p: Optional[int] = 2,
            d: Optional[int] = None,
            start_q: Optional[int] = 2,
            max_p: Optional[int] = 5,
            max_d: Optional[int] = 2,
            max_q: Optional[int] = 5,
            start_P: Optional[int] = 1,
            D: Optional[int] = None,
            start_Q: Optional[int] = 1,
            max_P: Optional[int] = 2,
            max_D: Optional[int] = 1,
            max_Q: Optional[int] = 2,
            max_order: Optional[int] = 5,
            m: Optional[int] = 1,
            seasonal: Optional[bool] = True,
            stationary: Optional[bool] = False,
            information_criterion: Optional[str] = 'aic',
            alpha: Optional[float] = 0.05,
            test: Optional[str] = 'kpss',
            seasonal_test: Optional[str] = 'ocsb',
            stepwise: Optional[bool] = True,
            n_jobs: Optional[int] = 1,
            start_params: Optional[Dict] = None,
            trend: Optional[str] = None,
            method: Optional[str] = 'lbfgs',
            maxiter: Optional[int] = 50,
            offset_test_args: Optional[Dict] = None,
            seasonal_test_args: Optional[Dict] = None,
            suppress_warnings: Optional[bool] = True,
            error_action: Optional[str] = 'trace',
            trace: Optional[Union[int, bool]] = False,
            random: Optional[bool] = False,
            random_state: Optional[Union[int, callable]] = None,
            n_fits: Optional[int] = 10,
            out_of_sample_size: Optional[int] = 0,
            scoring: Optional[str] = 'mse',
            scoring_args: Optional[Dict] = None,
            with_intercept: Optional[Union[bool, str]] = "auto",
            # pmdarima predict parameters
            return_conf_int: Optional[bool] = True,
            dynamic: Optional[bool] = False):
        # Every subclass of BaseForecastEstimator must call super().__init__
        super().__init__(
            score_func=score_func,
            coverage=coverage,
            null_model_params=null_model_params)

        # Constructor arguments are stored verbatim under the same names;
        # ``fit`` and ``predict`` read them back from these attributes.
        self.regressor_cols = regressor_cols
        self.freq = freq
        self.start_p = start_p
        self.d = d
        self.start_q = start_q
        self.max_p = max_p
        self.max_d = max_d
        self.max_q = max_q
        self.start_P = start_P
        self.D = D
        self.start_Q = start_Q
        self.max_P = max_P
        self.max_D = max_D
        self.max_Q = max_Q
        self.max_order = max_order
        self.m = m
        self.seasonal = seasonal
        self.stationary = stationary
        self.information_criterion = information_criterion
        self.alpha = alpha
        self.test = test
        self.seasonal_test = seasonal_test
        self.stepwise = stepwise
        self.n_jobs = n_jobs
        self.start_params = start_params
        self.trend = trend
        self.method = method
        self.maxiter = maxiter
        self.offset_test_args = offset_test_args
        self.seasonal_test_args = seasonal_test_args
        self.suppress_warnings = suppress_warnings
        self.error_action = error_action
        self.trace = trace
        self.random = random
        self.random_state = random_state
        self.n_fits = n_fits
        self.out_of_sample_size = out_of_sample_size
        self.scoring = scoring
        self.scoring_args = scoring_args
        self.with_intercept = with_intercept
        self.return_conf_int = return_conf_int
        # NOTE(review): ``coverage`` is also passed to super().__init__ above;
        # presumably the base class stores it too — confirm before removing.
        self.coverage = coverage
        self.dynamic = dynamic

        # set by the fit method
        self.model = None
        self.fit_df = None
        # set by the predict method
        self.forecast = None

    def fit(self, X, y=None, time_col=TIME_COL, value_col=VALUE_COL, **fit_params):
        """Fits ``ARIMA`` forecast model.

        Parameters
        ----------
        X : `pandas.DataFrame`
            Input timeseries, with timestamp column,
            value column, and any additional regressors.
            The value column is the response, included in
            X to allow transformation by `sklearn.pipeline.Pipeline`
        y : ignored
            The original timeseries values, ignored.
            (The y for fitting is included in ``X``.)
        time_col : `str`
            Time column name in ``X``
        value_col : `str`
            Value column name in ``X``
        fit_params : `dict`
            additional parameters for null model

        Returns
        -------
        self : self
            Fitted model is stored in ``self.model``.
        """
        # Work on a copy sorted by time; the sorted frame is what gets stored
        X = X.sort_values(by=time_col)
        # fits null model
        super().fit(X, y=y, time_col=time_col, value_col=value_col, **fit_params)

        self.fit_df = X
        # fits AutoArima model
        # NOTE(review): ``return_conf_int``, ``dynamic`` and ``regressor_cols``
        # are listed in __init__ as predict/additional parameters rather than
        # pmdarima fit parameters — confirm AutoARIMA is meant to receive them.
        self.model = AutoARIMA(
            start_p=self.start_p,
            d=self.d,
            start_q=self.start_q,
            max_p=self.max_p,
            max_d=self.max_d,
            max_q=self.max_q,
            start_P=self.start_P,
            D=self.D,
            start_Q=self.start_Q,
            max_P=self.max_P,
            max_D=self.max_D,
            max_Q=self.max_Q,
            max_order=self.max_order,
            m=self.m,
            seasonal=self.seasonal,
            stationary=self.stationary,
            information_criterion=self.information_criterion,
            alpha=self.alpha,
            test=self.test,
            seasonal_test=self.seasonal_test,
            stepwise=self.stepwise,
            n_jobs=self.n_jobs,
            start_params=self.start_params,
            trend=self.trend,
            method=self.method,
            maxiter=self.maxiter,
            offset_test_args=self.offset_test_args,
            seasonal_test_args=self.seasonal_test_args,
            suppress_warnings=self.suppress_warnings,
            error_action=self.error_action,
            trace=self.trace,
            random=self.random,
            random_state=self.random_state,
            n_fits=self.n_fits,
            out_of_sample_size=self.out_of_sample_size,
            scoring=self.scoring,
            scoring_args=self.scoring_args,
            with_intercept=self.with_intercept,
            return_conf_int=self.return_conf_int,
            dynamic=self.dynamic,
            regressor_cols=self.regressor_cols
        )

        # fits auto-arima
        if self.regressor_cols is None:
            reg_df = None
        else:
            reg_df = X[self.regressor_cols]
        # The response is passed as a one-column frame, regressors (if any) as X
        self.model.fit(y=X[[value_col]], X=reg_df)

        return self

    def predict(self, X, y=None):
        """Creates forecast for the dates specified in ``X``.
        Currently does not support the regressor case where there is a gap
        between train and predict periods.

        Parameters
        ----------
        X: `pandas.DataFrame`
            Input timeseries with timestamp column and any additional regressors.
            Timestamps are the dates for prediction.
            Value column, if provided in ``X``, is ignored.
        y: ignored.

        Returns
        -------
        predictions: `pandas.DataFrame`
            Forecasted values for the dates in ``X``. Columns:

                - ``TIME_COL``: dates
                - ``PREDICTED_COL``: predictions
                - ``PREDICTED_LOWER_COL``: lower bound of predictions
                - ``PREDICTED_UPPER_COL``: upper bound of predictions
        """
        X = X.sort_values(by=self.time_col_)
        # Returns the cached result if applicable
        cached_predictions = super().predict(X=X)
        if cached_predictions is not None:
            return cached_predictions

        # Currently does not support the regressor case where
        # there is gap between train and predict periods
        if self.regressor_cols is None:
            fut_reg_df = None
        else:
            # Keep only rows strictly after the last training timestamp
            fut_df = X[X[self.time_col_] > self.fit_df[self.time_col_].iloc[-1]]
            fut_reg_df = fut_df[self.regressor_cols]  # Auto-arima only accepts regressor values beyond `fit_df`

        # NOTE: the two assignments below overwrite the ``freq`` attribute
        # that was set from the constructor argument.
        if self.freq is None:
            self.freq = pd.infer_freq(self.fit_df[self.time_col_])
        if self.freq == "H":
            self.freq = self.freq.lower()  # np.timedelta recognizes lower case letters

        chosen_d = self.model.model_.order[1]  # This is the value of the d chosen by auto-arima
        # Positions are counted in ``freq`` units from the first training timestamp
        forecast_start = int((X[self.time_col_].iloc[0] - self.fit_df[self.time_col_].iloc[0])/np.timedelta64(1, self.freq))
        if forecast_start < chosen_d:
            append_length = chosen_d - forecast_start  # Number of NaNs to append to `pred_df`
            forecast_start = chosen_d  # Auto-arima can not predict below the chosen d
        else:
            append_length = 0
        forecast_end = int((X[self.time_col_].iloc[-1] - self.fit_df[self.time_col_].iloc[0])/np.timedelta64(1, self.freq))

        predictions = self.model.predict_in_sample(
            X=fut_reg_df,
            start=forecast_start,
            end=forecast_end,
            dynamic=self.dynamic,
            return_conf_int=self.return_conf_int,
            alpha=(1-self.coverage)
        )

        # NOTE(review): the indexing below treats ``predictions`` as a
        # (point forecasts, 2-column interval array) pair, which presumably
        # requires ``return_conf_int`` to be True — confirm the False case.
        if append_length > 0:
            # Pad the front with NaNs for the positions that were skipped
            pred_df = pd.DataFrame({
                TIME_COL: X[self.time_col_],
                PREDICTED_COL: np.append(np.repeat(np.nan, append_length), predictions[0]),
                PREDICTED_LOWER_COL: np.append(np.repeat(np.nan, append_length), predictions[1][:, 0]),
                PREDICTED_UPPER_COL: np.append(np.repeat(np.nan, append_length), predictions[1][:, 1])
            })
        else:
            pred_df = pd.DataFrame({
                TIME_COL: X[self.time_col_],
                PREDICTED_COL: predictions[0],
                PREDICTED_LOWER_COL: predictions[1][:, 0],
                PREDICTED_UPPER_COL: predictions[1][:, 1]
            })
        self.forecast = pred_df

        # Caches the predictions
        self.cached_predictions_ = pred_df

        return pred_df

    def summary(self):
        """Calls the base-class summary, then returns the AutoARIMA summary."""
        BaseForecastEstimator.summary(self)
        # AutoArima summary
        return self.model.summary()
def fit(self, X, y=None, time_col=TIME_COL, value_col=VALUE_COL, **fit_params):
    """Fit the null model and an ``AutoARIMA`` model on ``X``.

    Parameters
    ----------
    X : `pandas.DataFrame`
        Input timeseries holding the timestamp column, the value column
        (the response) and any regressor columns. It is sorted by
        ``time_col`` before use.
    y : ignored
        Present for API compatibility; the response is taken from ``X``.
    time_col : `str`
        Name of the time column in ``X``.
    value_col : `str`
        Name of the value column in ``X``.
    fit_params : `dict`
        Extra parameters forwarded to the base-class ``fit`` (null model).

    Returns
    -------
    self : self
        The fitted model is available as ``self.model``.
    """
    X = X.sort_values(by=time_col)
    # fits null model
    super().fit(X, y=y, time_col=time_col, value_col=value_col, **fit_params)
    self.fit_df = X

    # Attributes forwarded to AutoARIMA under the same keyword name,
    # in the order they are passed
    arima_arg_names = (
        "start_p", "d", "start_q",
        "max_p", "max_d", "max_q",
        "start_P", "D", "start_Q",
        "max_P", "max_D", "max_Q",
        "max_order", "m", "seasonal", "stationary",
        "information_criterion", "alpha", "test", "seasonal_test",
        "stepwise", "n_jobs", "start_params", "trend", "method", "maxiter",
        "offset_test_args", "seasonal_test_args",
        "suppress_warnings", "error_action", "trace",
        "random", "random_state", "n_fits", "out_of_sample_size",
        "scoring", "scoring_args", "with_intercept",
        "return_conf_int", "dynamic", "regressor_cols",
    )
    # fits AutoArima model
    self.model = AutoARIMA(
        **{name: getattr(self, name) for name in arima_arg_names})

    # fits auto-arima; regressors are optional
    reg_df = None if self.regressor_cols is None else X[self.regressor_cols]
    self.model.fit(y=X[[value_col]], X=reg_df)
    return self