def main():
    """Run the interactive auto-ARIMA workflow from the command line.

    Steps performed:

    1. Parse the command line arguments via ``get_input_args()``.
    2. Read the csv file ``args.dt_path`` (columns ``'Date'`` and
       ``args.value_column``) into a data frame indexed by ``'Date'``.
    3. Build an ``AutoARIMAForecaster`` and optionally transform the series.
    4. Ask the user, step by step, whether to run ``ts_fit``,
       ``ts_diagnose``, ``ts_test`` and ``ts_forecast``.

    The process is terminated via ``sys.exit("STOP")`` when the file is not a
    csv file, cannot be read, or does not contain the expected columns.
    """
    args = get_input_args()
    main_logger = Logger("auto_arima app")

    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable this validation.
    if args.dt_path.split('.')[-1] != 'csv':
        main_logger.error("Here only csv supported!")
        sys.exit("STOP")

    try:
        ts_df = pd.read_csv(args.dt_path,
                            index_col='Date',
                            delimiter=',',
                            usecols=['Date', args.value_column],
                            parse_dates=True)
    except IOError:
        main_logger.exception("File could not be read!")
        sys.exit("STOP")
    except (NameError, KeyError, ValueError):
        # pandas reports missing ``usecols``/``index_col`` columns (and
        # parser problems) as ValueError, so it has to be handled here too.
        main_logger.exception(
            "Incompatible file format! Expected columns 'Date' and "
            + str(args.value_column))
        sys.exit("STOP")
    else:
        main_logger.info("Data of shape {0} read in.".format(str(ts_df.shape)))

    # initiate
    tsf_obj = AutoARIMAForecaster(ts_df=ts_df,
                                  time_format=args.time_format,
                                  freq=args.freq,
                                  n_test=args.n_test,
                                  n_val=args.n_val)

    if args.transform != '':
        tsf_obj.ts_transform(args.transform)

    # Each step is only executed after an explicit 'y' from the user.
    steps = (
        ("ts_fit", lambda: tsf_obj.ts_fit(suppress=args.suppress)),
        ("ts_diagnose", lambda: tsf_obj.ts_diagnose()),
        ("ts_test", lambda: tsf_obj.ts_test()),
        ("ts_forecast", lambda: tsf_obj.ts_forecast(n_forecast=args.n_forecast)),
    )
    for step_name, run_step in steps:
        answer = input("Continue with {} y/n?".format(step_name))
        if answer.strip().lower() == 'y':
            run_step()
        else:
            main_logger.info("OK")
class EnsembleForecaster(LinearForecaster, SARIMAForecaster,
                         ExponentialSmoothingForecaster, AutoARIMAForecaster,
                         ProphetForecaster, DLMForecaster):
    """Univariate time series class inheriting from all existing forecasters
    and choosing the best ensemble.

    Each forecaster is supposed to be equipped with a set of hyper parameters.
    Grid search is used to choose the best model for each forecaster among the
    respective hyper parameters. Those best models are then combined (an
    ensemble is created) to achieve the forecast of the best quality: all
    combinations of best models are created, forecasted values for all these
    combinations are either averaged or a median is computed. The best
    combination is chosen as the best ensemble.
    Note, that it is necessary that test and validation data are generated.

    Attributes
    ----------
    _model_list: list
        Internal (immutable) list of possible models
    ensemble: list
        List of all forecasters to be used to create the best ensemble.
    _dict_models: dictionary
        Dictionary keeping all models
    dict_hyper_params: dictionary
        Dictionary of hyper parameters per forecaster
    show_plots: bool
        Whether to show plots when models are fitted/tested
    _best_models: dictionary
        Dictionary keeping best models for each forecaster type in the list
        'ensemble'. This best one is chosen after applying the grid search.
    best_ensemble: dictionary
        Dictionary keeping the results of ensemble
    _ensemble_logger: Logger
        The logger for logging

    Methods
    ----------
    assertions()
        Sanity checks, must be overridden
    ts_fit()
        Grid search on all forecasters in ensemble. Respective hyper
        parameters are used.
    ts_test()
        Test all forecasters on test data and computes rmse
    ts_validate()
        Validate all forecasters on validation data
    _build_ensemble()
        Builds the ensemble. All combinations of forecasters in ensemble are
        generated. For each combination the mean and median score over the
        forecast residuals is computed. The combination with the best
        (lowest) score is the best ensemble.
    """

    def __init__(self,
                 dict_hyper_params,
                 ensemble=('dlm', 'prophet'),
                 show_plots=True,
                 **kwds):
        """Initializes the object EnsembleForecaster

        Parameters
        ----------
        dict_hyper_params: dict
            Hyper parameters per forecaster key
        ensemble: iterable of str
            Names of the forecasters to combine; 'all' selects every
            forecaster. Names are lower-cased before use.
        show_plots: bool
            Whether to show plots during the grid search
        **kwds:
            Forwarded to the parent forecasters
        """
        self._model_list = [
            'arima', 'sarima', 'exponential smoothing', 'prophet', 'dlm',
            'linear'
        ]
        self.ensemble = [name.lower() for name in ensemble]
        self.dict_hyper_params = dict_hyper_params
        self.show_plots = show_plots
        self._dict_models = dict()
        self._best_models = dict()
        self.best_ensemble = dict()
        self._ensemble_logger = Logger("ensemble")

        try:
            super(EnsembleForecaster, self).__init__(**kwds)
        except (TypeError, AttributeError) as e:
            self._ensemble_logger.exception("Arguments missing...{}".format(e))

        self._id = 'Ensemble'

        # Individually requested forecasters. Note that 'exponential
        # smoothing' is stored under the key 'expsm'.
        if 'prophet' in self.ensemble:
            self._register_model('prophet', ProphetForecaster)
        if 'linear' in self.ensemble:
            self._register_model('linear', LinearForecaster)
        if 'arima' in self.ensemble:
            self._register_model('arima', ARIMAForecaster)
        if 'sarima' in self.ensemble:
            self._register_model('sarima', SARIMAForecaster)
        if 'exponential smoothing' in self.ensemble:
            self._register_model('expsm', ExponentialSmoothingForecaster)
        if 'dlm' in self.ensemble:
            self._register_model('dlm', DLMForecaster)
        if 'auto_arima' in self.ensemble:
            self._register_model('auto_arima', AutoARIMAForecaster)

        # 'all' (re-)registers every available forecaster.
        if 'all' in self.ensemble:
            self._register_model('prophet', ProphetForecaster)
            self._register_model('linear', LinearForecaster)
            self._register_model('arima', ARIMAForecaster)
            self._register_model('sarima', SARIMAForecaster)
            self._register_model('expsm', ExponentialSmoothingForecaster)
            self._register_model('dlm', DLMForecaster)
            self._register_model('auto_arima', AutoARIMAForecaster)

        self.assertions()

    def _register_model(self, key, forecaster_cls):
        """Stores a copy of this object, re-typed as forecaster_cls, under key."""
        clone = self.__copy__()
        clone.__class__ = forecaster_cls
        self._dict_models[key] = clone

    def _stop_if_no_models(self):
        """Stops the program if no best models are available yet."""
        if not self._best_models:
            self._ensemble_logger.warning(
                "No models have been fit. The forecaster will stop!")
            sys.exit("STOP")

    def _stop_if_no_ensemble(self, message):
        """Logs message and stops the program if no ensemble exists yet."""
        if not self.best_ensemble:
            self._ensemble_logger.error(message)
            sys.exit("STOP")

    def assertions(self):
        """Checks the hyper parameters and that test/validation data exist.

        Explicit checks are used instead of ``assert`` statements, because
        assertions are stripped when Python runs with -O. The program is
        stopped via ``sys.exit("STOP")`` if a check fails.
        """
        if not isinstance(self.dict_hyper_params, dict):
            self._ensemble_logger.error(
                "Assertion exception occurred, dict expected")
            sys.exit("STOP")

        # A (disabled) check that hyper parameters exist for every forecaster
        # in the ensemble used to live here.

        for model in self._dict_models.values():
            if not (model.n_test > 0 and model.n_val > 0):
                self._ensemble_logger.error(
                    "Assertion exception occurred, both test and validation "
                    "have to be generated! Please specify n_test and n_val!")
                sys.exit("STOP")

    def __copy__(self):
        """Copies the object"""
        result = super(EnsembleForecaster, self).__copy__()
        #
        result.ensemble = self.ensemble
        result.dict_hyper_params = self.dict_hyper_params
        # the model dictionaries are shared with the copy, not duplicated
        result._dict_models = self._dict_models
        result._best_models = self._best_models
        result._ensemble_logger = self._ensemble_logger
        result._model_list = self._model_list
        return result

    def ts_fit(self, suppress=False):
        """Grid search on all forecasters in ensemble to find the best model
        out of hyper parameters provided.

        Forecasters without an entry in dict_hyper_params are fitted once
        with their current parameters.

        Parameters:
        ----------
        suppress: bool
            Suppress or not some of the output messages
        """
        for k, model in self._dict_models.items():
            if k in self.dict_hyper_params:
                self._gs.set_forecaster(model)
                self._gs.set_hyper_params(self.dict_hyper_params[k])
                self._ensemble_logger.info(
                    "==========================================Starting grid search for the forecaster +++ {} +++ =================================="
                    .format(k))
                self._gs = self._gs.grid_search(suppress=suppress,
                                                show_plot=self.show_plots)
                #
                self._best_models[k] = self._gs.best_model
            else:
                model.ts_fit(suppress=suppress)
                if k not in self._best_models:
                    self._best_models[k] = dict()
                self._best_models[k]['forecaster'] = model
        return self

    def ts_test(self, show_plot=True):
        """Test all best models, then build and plot the best ensemble.

        Parameters:
        ----------
        show_plot: bool
            Whether to show or not the residual plots
        """
        for k, best in self._best_models.items():
            self._ensemble_logger.info(
                "==========================================Testing model +++ {} +++ ================================== "
                .format(k))
            if 'hyper_params' in best:
                best['forecaster'].set_params(p_dict=best['hyper_params'])
            best['forecaster'].ts_test(show_plot=show_plot)

        self._build_ensemble()
        self._plot_ensemble()

    def plot_residuals(self):
        """Plot the residuals of every best model"""
        self._stop_if_no_models()
        for best in self._best_models.values():
            best['forecaster'].plot_residuals()

    def ts_diagnose(self):
        """Diagnose every best model"""
        self._stop_if_no_models()
        for best in self._best_models.values():
            best['forecaster'].ts_diagnose()

    @staticmethod
    def _print_dict(d):
        """Formats the dictionary d as a multi-line, log-like string."""
        e_info = "".join(
            "....................... | ensemble | INFO : " + str(k) + " : "
            + str(v) + "\n" for k, v in d.items())
        return "Best ensemble: \n" + e_info

    @staticmethod
    def lambda_forecast(x):
        """Returns the forecasted values of the forecaster x as an array.

        For a ProphetForecaster the last column of the forecast frame is
        used, otherwise the forecast values themselves.
        """
        if isinstance(x, ProphetForecaster):
            return x.forecast.iloc[:, -1].values
        else:
            return x.forecast.values

    def _compute_ensemble(self, compute_rmse=False):
        """Re-computes 'ensemble_forecast' for best_ensemble

        Parameters:
        ----------
        compute_rmse: bool
            Whether to also re-compute best_ensemble['rmse'] from the
            aggregated forecast residuals ('mean'/'median' aggregation only)
        """
        aggregation = self.best_ensemble['aggregation']
        models = self.best_ensemble['models']

        if aggregation == 'none':
            self.best_ensemble['ensemble_forecast'] = pd.Series(
                self.lambda_forecast(models[0]),
                index=models[0].forecast.index)
        elif aggregation in ('mean', 'median'):
            aggregate = np.mean if aggregation == 'mean' else np.median
            forecasts = [self.lambda_forecast(m) for m in models]
            self.best_ensemble['ensemble_forecast'] = pd.Series(
                aggregate(forecasts, axis=0),
                index=models[0].forecast.index)
            if compute_rmse:
                residuals = aggregate(
                    [m.residuals_forecast for m in models], axis=0)
                # NOTE(review): sqrt(square(r)).mean() is the mean absolute
                # residual rather than a root mean square; kept unchanged so
                # scores stay comparable - confirm against measure_rmse().
                self.best_ensemble['rmse'] = np.sqrt(
                    np.square(residuals)).mean()

    def _build_ensemble(self):
        """Builds the best ensemble out of the best models.

        Every non-empty combination of best models is scored: single models
        by their own rmse, larger combinations by the score of the
        mean- and median-aggregated forecast residuals. The combination with
        the lowest score is stored in best_ensemble.
        """
        self._ensemble_logger.info(
            "==========================================Start building the best ensemble=========================================="
        )
        # float('inf') instead of np.float('Inf'): the np.float alias has
        # been removed from NumPy.
        rmse = float('inf')
        mod_list = list(self._best_models)

        for size in range(1, len(mod_list) + 1):
            for subset in itertools.combinations(mod_list, size):
                if len(subset) > 1:
                    ensemble_candidate = [
                        self._best_models[s]['forecaster'] for s in subset
                    ]
                    residuals = [
                        m.residuals_forecast for m in ensemble_candidate
                    ]
                    for aggregation, aggregate in (('mean', np.mean),
                                                   ('median', np.median)):
                        # NOTE(review): this is a mean absolute residual,
                        # see _compute_ensemble(); formula kept as is.
                        score = np.sqrt(
                            np.square(aggregate(residuals, axis=0))).mean()
                        if score < rmse:
                            rmse = score
                            self.best_ensemble['rmse'] = rmse
                            self.best_ensemble['set'] = subset
                            self.best_ensemble['models'] = ensemble_candidate
                            self.best_ensemble['aggregation'] = aggregation
                else:
                    ensemble_candidate = self._best_models[
                        subset[0]]['forecaster']
                    if ensemble_candidate.rmse < rmse:
                        rmse = ensemble_candidate.rmse
                        self.best_ensemble['rmse'] = rmse
                        self.best_ensemble['set'] = subset
                        self.best_ensemble['models'] = [ensemble_candidate]
                        self.best_ensemble['aggregation'] = 'none'

        # Nothing selected (no models, or no comparable score): stop with a
        # clear message instead of failing with a KeyError below.
        self._stop_if_no_ensemble(
            "No ensemble could be built! Forecaster will stop!")

        self._compute_ensemble()
        #
        self._ensemble_logger.info("The best ensemble found as:")
        print(self._print_dict(self.best_ensemble))

    def _plot_ensemble(self):
        """Plots the best ensemble"""
        self._stop_if_no_ensemble(
            "Ensemble does not exist yet! Forecaster will stop!")

        models = self.best_ensemble['models']

        plt.figure(figsize=(20, 7))
        #
        plt.plot(models[0].ts_df, color='b')
        # colours: base colours sorted by hsv, without those already in use
        colors = mcolors.BASE_COLORS
        by_hsv = sorted(
            (tuple(mcolors.rgb_to_hsv(mcolors.to_rgb(color))), colo_name)
            for colo_name, color in colors.items())
        reserved = ('w', 'b', 'g', 'darkgreen')
        colo_names = [name for hsv, name in by_hsv if name not in reserved]
        if len(models) <= len(colo_names):
            colo_names = sample(colo_names, len(models))
        else:
            # more models than free colours: sample() would raise, so the
            # available colours are re-used cyclically
            colo_names = [
                colo_names[i % len(colo_names)] for i in range(len(models))
            ]
        #
        for model, colo_name in zip(models, colo_names):
            plt.plot(pd.Series(self.lambda_forecast(model),
                               index=model.forecast.index),
                     color=colo_name,
                     linewidth=2.0,
                     label=type(model).__name__)
        plt.plot(self.best_ensemble['ensemble_forecast'],
                 color='darkgreen',
                 linewidth=2.0,
                 label='Ensemble')
        plt.axvline(x=min(self.best_ensemble['ensemble_forecast'].index),
                    color='grey',
                    linestyle='dashed')
        plt.legend()
        plt.title("Real (blue) and forecasted values")
        #
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()

    def ts_validate(self, suppress=True, show_plot=True):
        """Validate best ensemble.

        Every model of the best ensemble is re-fitted and tested in mode
        'test and validate'; afterwards the ensemble forecast and its score
        are re-computed, printed and plotted.
        """
        self._stop_if_no_ensemble(
            "Ensemble has not been built! Forecaster will stop!")

        for model in self.best_ensemble['models']:
            model._mode = 'test and validate'
            model.ts_fit(suppress=suppress)
            model.ts_test(show_plot=show_plot)

        self._compute_ensemble(compute_rmse=True)
        print(self._print_dict(self.best_ensemble))
        self._plot_ensemble()

    def ts_forecast(self, n_forecast, features_dict=None, suppress=False):
        """Forecast with every model of the best ensemble and combine them.

        Parameters:
        ----------
        n_forecast: int
            Number of future steps to forecast
        features_dict: dict
            Only forwarded to models of class DLMForecaster
        suppress: bool
            Suppress or not some of the output messages
        """
        self._stop_if_no_ensemble(
            "Ensemble has not been built! Forecaster will stop!")

        for model in self.best_ensemble['models']:
            if type(model).__name__ != 'DLMForecaster':
                model.ts_forecast(n_forecast=n_forecast, suppress=suppress)
            else:
                model.ts_forecast(n_forecast=n_forecast,
                                  features_dict=features_dict,
                                  suppress=suppress)

        self._compute_ensemble()
        self._plot_ensemble()
class LinearForecaster(UVariateTimeSeriesClass):
    """Univariate time series child class using LinearRegression for forecasting

    Attributes
    ----------
    _fit_intercept: bool
        Whether to fit the intercept yes/no
    _normalize: bool
        Whether to normalize time series data before fitting yes/no
    _copy_X: bool
        If True, X will be copied; else, it may be overwritten.
    _n_jobs: int or None
        The number of jobs to use for the computation. This will only provide
        speedup for n_targets > 1 and sufficient large problems. None means 1
        unless in a joblib.parallel_backend context. -1 means using all
        processors.

    Methods
    ----------
    assertions()
        Assertion tests, must be overridden
    set_params()
        Sets new parameter values
    get_params_dict()
        Gets parameter values as a dictionary
    ts_fit()
        Fits the LinearRegression model to time series
    ts_diagnose()
        Diagnoses the fitted model
    plot_residuals()
        Generates residual plots
    ts_test()
        Evaluates fitted model on the test data, if this one has been generated
    ts_forecast()
        Forecasts time series and plots the results
    plot_forecasts()
        Plots forecasted time-series
    """

    def __init__(self,
                 fit_intercept=True,
                 normalize=False,
                 copy_X=False,
                 n_jobs=None,
                 **kwds):
        """Initializes the object LinearForecaster"""
        self._lin_logger = Logger('linear')

        try:
            super(LinearForecaster, self).__init__(**kwds)
        except TypeError:
            self._lin_logger.exception("Arguments missing...")

        self._fit_intercept = fit_intercept
        self._normalize = normalize
        self._copy_X = copy_X
        self._n_jobs = n_jobs

        self.intercept = None
        self.slope = None

        self._id = 'Linear'

    def __copy__(self):
        """Copies the object"""
        result = super(LinearForecaster, self).__copy__()

        result._fit_intercept = self._fit_intercept
        result._normalize = self._normalize
        result._copy_X = self._copy_X
        result._n_jobs = self._n_jobs
        result.intercept = self.intercept
        result.slope = self.slope
        result._lin_logger = self._lin_logger
        return result

    def set_params(self, p_dict=None, **kwargs):
        """Sets new parameters

        Parameters are taken from p_dict if it is given, otherwise from the
        keyword arguments. Unknown keys are ignored.
        """
        params_dict = kwargs if p_dict is None else p_dict

        # parameter name -> attribute to set. 'time_format' (the constructor
        # keyword) is accepted next to the legacy spelling 'timeformat'.
        attr_by_key = {
            'ts_df': 'ts_df',
            'freq': 'freq',
            'n_test': 'n_test',
            'n_val': 'n_val',
            'timeformat': 'time_format',
            'time_format': 'time_format',
            'fit_intercept': '_fit_intercept',
            'normalize': '_normalize',
            'copy_X': '_copy_X',
            'n_jobs': '_n_jobs',
        }
        for k, v in params_dict.items():
            attr = attr_by_key.get(k)
            if attr is not None:
                setattr(self, attr, v)

        return self

    def get_params_dict(self):
        """Gets parameters as dictionary"""
        return {'fit_intercept': self._fit_intercept,
                'normalize': self._normalize,
                'copy_X': self._copy_X,
                'n_jobs': self._n_jobs
                }

    def ts_fit(self, suppress=False):
        """Fit LinearRegression to the time series data.

        The regressor is the position 0, 1, 2, ... of each observation in the
        training data. A failed fit is logged and the object is returned
        without updated fitted values.

        Parameters:
        ----------
        suppress: bool
            Suppress or not some of the output messages
        """
        self._prepare_fit()
        self.ts_split()

        ts_df = self._train_dt.copy()
        #
        x = np.arange(0, len(ts_df)).reshape(-1, 1)
        y = np.asarray(ts_df['y'])

        lr_kwargs = {'fit_intercept': self._fit_intercept,
                     'copy_X': self._copy_X,
                     'n_jobs': self._n_jobs}
        if self._normalize:
            # 'normalize' has been removed from recent scikit-learn
            # versions; it is only forwarded when actually requested so the
            # default configuration keeps working there.
            lr_kwargs['normalize'] = self._normalize

        # Fit
        self._lin_logger.info("Trying to fit the linear model....")
        # tic
        start = time()
        try:
            if not suppress:
                self._lin_logger.info("...via using parameters")
                print_attributes(self)

            self.model_fit = LinearRegression(**lr_kwargs).fit(x, y)
            # toc
            self._lin_logger.info("Time elapsed: {} sec.".format(time() - start))
        except Exception:
            self._lin_logger.exception("LinearRegression error...")
        else:
            #
            self._lin_logger.info("Model successfully fitted to the data!")
            if not suppress:
                self._lin_logger.info("R^2: {:f}".format(self.model_fit.score(x, y)))
            #
            self.intercept = self.model_fit.intercept_
            self.slope = self.model_fit.coef_

            # Fitted values
            self._lin_logger.info("Computing fitted values and residuals...")
            self.fittedvalues = pd.Series(self.model_fit.predict(x), index=ts_df.index)
            # Residuals
            super(LinearForecaster, self)._residuals()
            self._lin_logger.info("Done.")
        return self

    def ts_diagnose(self):
        """Diagnoses the model by plotting its residuals.

        Stops the program via sys.exit("STOP") if no model has been fitted.
        """
        # explicit check instead of ``assert`` (stripped under -O)
        if self.model_fit is None:
            self._lin_logger.error("Model has to be fitted first! Please call ts_fit(...)")
            sys.exit("STOP")

        self.plot_residuals()

    def plot_residuals(self):
        """Plot the residuals"""
        super(LinearForecaster, self)._plot_residuals(y=np.asarray(self._train_dt['y']),
                                                      yhat=np.asarray(self.fittedvalues),
                                                      _id="Linear")
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()

    def ts_test(self, show_plot=True):
        """Test the fitted model if test data available

        Parameters:
        ----------
        show_plot: bool
            Whether to plot the forecast over the test data
        """
        if super(LinearForecaster, self)._check_ts_test() < 0:
            return

        n_forecast = len(self._test_dt)

        self._lin_logger.info("Evaluating the fitted Linear model on the test data...")
        # the test positions continue right after the training positions
        n_train = len(self._train_dt)
        x_future = np.arange(n_train, n_train + n_forecast).reshape(-1, 1)
        self.forecast = pd.Series(self.model_fit.predict(x_future), index=self._test_dt.index)

        self.residuals_forecast = pd.Series(np.asarray(self._test_dt['y']) - np.asarray(self.forecast),
                                            index=self._test_dt.index)
        self.measure_rmse()
        self._lin_logger.info("RMSE on test data: {}".format(self.rmse))
        # plot
        if show_plot:
            self.plot_forecast()

        return self

    def ts_forecast(self, n_forecast, suppress=False):
        """Forecast time series over time frame in the future specified via n_forecast

        The model is re-fitted in mode 'forecast' before forecasting, and the
        result is plotted.
        """
        #
        n_forecast = super(LinearForecaster, self)._check_ts_forecast(n_forecast)
        #
        if not suppress:
            self._lin_logger.info("Fitting using all data....")
        self._mode = 'forecast'
        self.ts_fit(suppress=suppress)

        self._lin_logger.info("Forecasting next " + str(n_forecast) + str(self.freq))
        #
        n_train = len(self._train_dt)
        x_future = np.arange(n_train, n_train + n_forecast).reshape(-1, 1)
        future = self.model_fit.predict(x_future)
        idx_future = self._gen_idx_future(n_forecast=n_forecast)
        self.forecast = pd.Series(future, index=idx_future)

        # no observed values exist for the future, hence no residuals
        self.residuals_forecast = None
        self.plot_forecast()
        return self

    def plot_forecast(self):
        """Plot forecasted values"""
        super(LinearForecaster, self)._plot_forecast(y=np.asarray(self._train_dt['y']),
                                                     yhat=np.asarray(self.fittedvalues),
                                                     forecast=self.forecast, _id='Linear')
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()
class ProphetForecaster(UVariateTimeSeriesClass): """Univariate time series child class using Prophet for forecasting,ref. to https://facebook.github.io/prophet Attributes ---------- _prophet_interval_width: float The width of the uncertainty intervals (by default 80%), also ref. to https://facebook.github.io/prophet/docs/uncertainty_intervals.html _yearly_seasonality: bool Consider yearly seasonality yes/no _monthly_seasonality: bool Consider monthly seasonality yes/no _quarterly_seasonality: bool Consider quarterly seasonality yes/no _weekly_seasonality: Consider weekly seasonality yes/no _daily_seasonality: bool Consider daily seasonality yes/no _weekend_seasonality: bool# Consider week-end seasonality yes/no. ref. to https://facebook.github.io/prophet/docs/seasonality,_holiday_effects,_and_regressors.html#modeling-holidays-and-special-events _changepoint_prior_scale: float If the trend changes are being overfit (too much flexibility) or underfit (not enough flexibility), you can adjust the strength of the sparse prior using this argument. By default, this parameter is set to 0.05. Increasing it will make the trend more flexible. Decreasing it will make the trend less flexible. ref. to https://facebook.github.io/prophet/docs/trend_changepoints.html#automatic-changepoint-detection-in-prophet _changepoint_range: float By default changepoints are only inferred for the first 80% of the time series in order to have plenty of runway for projecting the trend forward and to avoid overfitting fluctuations at the end of the time series. This default works in many situations but not all, and can be changed using the changepoint_range argument. For example, m = Prophet(changepoint_range=0.9) will place potential changepoints in the first 90% of the time series. ref. to https://facebook.github.io/prophet/docs/trend_changepoints.html#automatic-changepoint-detection-in-prophet _add_change_points: bool Whether to add change points to the plots ref. 
to https://facebook.github.io/prophet/docs/trend_changepoints.html#automatic-changepoint-detection-in-prophet _diagnose: bool Whether to run cross validation yes/no _history: str Amount of historic data in days for cross validation, Corresponds to initial in https://facebook.github.io/prophet/docs/diagnostics.html _step: str Correspons to period in the linke above. Defines step in days to shift the historic data _horizon: str Forecasting horizon in days for each cross validation run _consider_holidays: bool Whether to consider holiodays yes/no ref. to https://facebook.github.io/prophet/docs/seasonality,_holiday_effects,_and_regressors.html#modeling-holidays-and-special-events _country: str The country for which holidays are to be considered _prophet_logger: Logger The logger for logging Methods ---------- assertions() Assertion tests, must be overrided set_params() Sets new parameter values get_params_dict() Gets parameter values as a dictionary ts_fit() Fits the auto_arima model to time series ts_diagnose() Diagnoses the fitted model. 
Cross validation is started plot_residuals() Generates residual plots ts_test() Evaluates fitted model on the test data, if this one has been generated ts_forecast() Forecasts time series and plots the results plot_forecasts() Plots forecasted time-series """ def __init__(self, prophet_interval_width=0.95, yearly_seasonality=False, monthly_seasonality=False, quarterly_seasonality=False, weekly_seasonality=False, daily_seasonality=False, weekend_seasonality=False, changepoint_prior_scale=0.001, changepoint_range=0.9, add_change_points=True, diagnose=False, history=None, step=None, horizon=None, consider_holidays=True, country='DE', **kwds): """Initializes the object ProphetForecaster""" self._prophet_logger = Logger('prophet') try: super(ProphetForecaster, self).__init__(**kwds) except TypeError: self._prophet_logger.exception("TypeError occurred, Arguments missing") self._model = None self._prophet_interval_width = prophet_interval_width self._yearly_seasonality = yearly_seasonality self._monthly_seasonality = monthly_seasonality self._quarterly_seasonality = quarterly_seasonality self._weekly_seasonality = weekly_seasonality self._daily_seasonality = daily_seasonality self._weekend_seasonality = weekend_seasonality self._changepoint_prior_scale = changepoint_prior_scale self._changepoint_range = changepoint_range self._add_change_points = add_change_points self._diagnose = diagnose self._history = history self._step = step self._horizon = horizon self._prophet_cv = None self._prophet_p = None self._consider_holidays = consider_holidays self._country = country self._id = 'Prophet' def __copy__(self): """Copies the object""" result = super(ProphetForecaster, self).__copy__() # result._model = self._model result._prophet_interval_width = self._prophet_interval_width result._yearly_seasonality = self._yearly_seasonality result._monthly_seasonality = self._monthly_seasonality result._quarterly_seasonality = self._quarterly_seasonality result._weekly_seasonality = 
self._weekly_seasonality result._daily_seasonality = self._daily_seasonality result._weekend_seasonality = self._weekend_seasonality result._changepoint_prior_scale = self._changepoint_prior_scale result._changepoint_range = self._changepoint_range result._add_change_points = self._add_change_points result._diagnose = self._diagnose result._history = self._history result._step = self._step result._horizon = self._horizon result._prophet_cv = self._prophet_cv result._prophet_p = self._prophet_p result._consider_holidays = self._consider_holidays result._country = self._country result._prophet_logger = self._prophet_logger return result def set_params(self, p_dict=None, **kwargs): """Sets new parameters""" params_dict = kwargs if p_dict is not None: params_dict = p_dict # for k, v in params_dict.items(): if k == 'ts_df': self.ts_df = v elif k == 'freq': self.freq = v elif k == 'n_test': self.n_test = v elif k == 'n_val': self.n_val = v elif k == 'timeformat': self.time_format = v elif k == "prophet_interval_width": self._prophet_interval_width = v elif k == "yearly_seasonality": self._yearly_seasonality = v elif k == "monthly_seasonality": self._monthly_seasonality = v elif k == "quarterly_seasonality": self._quarterly_seasonality = v elif k == "weekly_seasonality": self._weekly_seasonality = v elif k == "daily_seasonality": self._daily_seasonality = v elif k == "weekend_seasonality": self._weekend_seasonality = v elif k == "changepoint_prior_scale": self._changepoint_prior_scale = v elif k == "changepoint_range": self._changepoint_range = v elif k == "add_change_points": self._add_change_points = v elif k == "diagnose": self._diagnose = v elif k == "history": self._history = v elif k == "step": self._step = v elif k == "horizon": self._horizon = v elif k == "consider_holidays": self._consider_holidays = v elif k == "country": self._country = v return self def get_params_dict(self): """Gets parameters as a dictionary""" return {'prophet_interval_width': 
self._prophet_interval_width, 'yearly_seasonality': self._yearly_seasonality, 'monthly_seasonality': self._monthly_seasonality, 'quarterly_seasonality': self._quarterly_seasonality, 'weekly_seasonality': self._weekly_seasonality, 'daily_seasonality': self._daily_seasonality, 'weekend_seasonality': self._weekend_seasonality, 'changepoint_prior_scale': self._changepoint_prior_scale, 'changepoint_range': self._changepoint_range, 'add_change_points': self._add_change_points, 'diagnose': self._diagnose, 'history': self._history, 'step': self._step, 'horizon': self._horizon, 'consider_holidays': self._consider_holidays, 'country': self._country } @staticmethod def we_season(ds): """Lambda function to prepare weekend_seasonality for Prophet""" date = pd.to_datetime(ds) return date.weekday() >= 5 def ts_fit(self, suppress=False): """Fit Prophet to the time series data. Parameters: ---------- suppress: bool Suppress or not some of the output messages """ if self.hyper_params is not None: self._gs.set_forecaster(self) self._gs.set_hyper_params(self.hyper_params) # a very important command here to avoid endless loop self.hyper_params = None self._prophet_logger.info("***** Starting grid search *****") self._gs = self._gs.grid_search(suppress=suppress, show_plot=False) # self.best_model = self._gs.best_model self.__dict__.update(self.best_model['forecaster'].__dict__) self._prophet_logger.info("***** Finished grid search *****") else: self._prepare_fit() self._model = None self.ts_split() ts_df = self._train_dt.copy() ts_test_df = self._test_dt # sanity check if 'on_weekend' in ts_df.columns: ts_df.drop(['on_weekend', 'off_weekend'], inplace=True, axis=1) # ts_test_df.drop(['on_weekend', 'off_weekend'], inplace=True, axis=1) # Fit self._prophet_logger.info("Trying to fit the Prophet model....") try: if not suppress: self._prophet_logger.info("...via using parameters\n") print_attributes(self) # diagnose on? 
if self._diagnose: try: assert self._step is not None and self._horizon is not None except (KeyError, AssertionError): self._prophet_logger.warning("You want to diagnose the Prophet model. Please provide parameters " "'step' and 'horizon' within object initialization!") sys.exit("STOP") ts_df = ts_df.reset_index() ts_df.columns = self._ts_df_cols if ts_test_df is not None and not ts_test_df.empty: ts_test_df = ts_test_df.reset_index() ts_test_df.columns = self._ts_df_cols # weekly_s = self._weekly_seasonality if self._weekend_seasonality: # force to False weekly_s = False # if not self._consider_holidays: self._model = Prophet(interval_width=self._prophet_interval_width, yearly_seasonality=self._yearly_seasonality, weekly_seasonality=weekly_s, daily_seasonality=self._daily_seasonality, changepoint_range=self._changepoint_range, changepoint_prior_scale=self._changepoint_prior_scale) else: try: assert self._country in ['AT', 'DE', 'US'] except AssertionError: self._prophet_logger.exception("Assrtion exception occurred. 
Right now, Austria (AT), " "Germany(DE) and USA (US) supported.") sys.exit("STOP") else: holi = None if self._country == 'AT': holi = holidays.AT(state=None, years=list(np.unique(np.asarray(self.ts_df.index.year)))) elif self._country == 'DE': holi = holidays.DE(state=None, years=list(np.unique(np.asarray(self.ts_df.index.year)))) elif self._country == 'US': holi = holidays.US(state=None, years=list(np.unique(np.asarray(self.ts_df.index.year)))) # holi_dict = dict() for date, name in sorted(holi.items()): holi_dict[date] = name df_holi = pd.DataFrame.from_dict(data=holi_dict, orient='index').reset_index() df_holi.columns = ['ds', 'holiday'] df_holi['lower_window'] = 0 df_holi['upper_window'] = 0 self._model = Prophet(interval_width=self._prophet_interval_width, yearly_seasonality=self._yearly_seasonality, weekly_seasonality=weekly_s, daily_seasonality=self._daily_seasonality, changepoint_range=self._changepoint_range, changepoint_prior_scale=self._changepoint_prior_scale, holidays=df_holi) if self._monthly_seasonality: self._model.add_seasonality(name='monthly', period=30.5, fourier_order=20) if not suppress: self._prophet_logger.info("Added monthly seasonality.") if self._quarterly_seasonality: self._model.add_seasonality(name='quarterly', period=91.5, fourier_order=20) if not suppress: self._prophet_logger.info("Added quarterly seasonality.") if self._weekend_seasonality: ts_df['on_weekend'] = ts_df['ds'].apply(self.we_season) ts_df['off_weekend'] = ~ts_df['ds'].apply(self.we_season) self._train_dt = ts_df.copy() self._train_dt.set_index('ds', inplace=True) # if ts_test_df is not None and not ts_test_df.empty: ts_test_df['on_weekend'] = ts_test_df['ds'].apply(self.we_season) ts_test_df['off_weekend'] = ~ts_test_df['ds'].apply(self.we_season) self._test_dt = ts_test_df.copy() self._test_dt.set_index('ds', inplace=True) # and add self._model.add_seasonality(name='weekend_on_season', period=7, fourier_order=5, condition_name='on_weekend') 
self._model.add_seasonality(name='weekend_off_season', period=7, fourier_order=5, condition_name='off_weekend') if not suppress: self._prophet_logger.info("Added week-end seasonality.") # tic start = time() self.model_fit = self._model.fit(ts_df) # toc if not suppress: self._prophet_logger.info("Time elapsed: {} sec.".format(time() - start)) except (Exception, ValueError): self._prophet_logger.exception("Prophet error...") return -1 else: self._prophet_logger.info("Model successfully fitted to the data!") # Fitted values self._prophet_logger.info("Computing fitted values and residuals...") # in-sample predict try: self.fittedvalues = self._model.predict(ts_df.drop('y', axis=1)) except (Exception, ValueError): self._prophet_logger.exception("Prophet predict error...") # Residuals try: # use fittedvalues to fill in the model dictionary self.residuals = pd.Series(np.asarray(ts_df.y) - np.asarray(self.fittedvalues['yhat']), index=self._train_dt.index) except (KeyError, AttributeError): self._prophet_logger.exception("Model was not fitted or ts has other structure...") # self.lower_conf_int = pd.Series(np.asarray(self.fittedvalues['yhat_lower']), index=self._train_dt.index) self.upper_conf_int = pd.Series(np.asarray(self.fittedvalues['yhat_upper']), index=self._train_dt.index) self._prophet_logger.info("Done.") return self def ts_diagnose(self): """Diagnoses the fitted model""" try: assert self.model_fit is not None except AssertionError: self._prophet_logger.exception("Model has to be fitted first! Please call ts_fit(...)") sys.exit("STOP") self.plot_residuals() if self._diagnose: if input("Run cross validation y/n? 
Note, depending on parameters provided " "this can take some time...").strip().lower() == 'y': start = time() self._prophet_logger.info("Running cross validation using parameters provided....") if self._history is not None: try: self._prophet_cv = cross_validation(self.model_fit, initial=self._history, period=self._step, horizon=self._horizon) except Exception: self._prophet_logger.exception("Prophet cross validation error: check your " "parameters 'history', 'horizon', 'step'!") else: try: self._prophet_cv = cross_validation(self.model_fit, period=self._step, horizon=self._horizon) except Exception: self._prophet_logger.exception("Prophet cross validation error: " "check your parameters 'horizon', 'step'!") self._prophet_logger.info("Time elapsed: {}".format(time() - start)) simu_intervals = self._prophet_cv.groupby('cutoff')['ds'].agg( [('forecast_start', 'min'), ('forecast_till', 'max')]) self._prophet_logger.info("Following time windows and cutoffs have been set-up:\n") print(simu_intervals) # plot_cross_validation_metric(self._prophet_cv, metric='mape') # self._prophet_logger.info("Running performance metrics...") self._prophet_p = performance_metrics(self._prophet_cv) else: self._prophet_logger.info("OK") return def plot_residuals(self): """Plot the residuals""" fig, axes = super(ProphetForecaster, self)._plot_residuals( y=np.asarray(self._train_dt['y']), yhat=np.asarray(self.fittedvalues['yhat']), _id="Prophet") plt.gcf().autofmt_xdate() plt.grid(True) plt.show() def ts_test(self, show_plot=True): """Test the fitted model if test data available""" if super(ProphetForecaster, self)._check_ts_test() < 0: return self._prophet_logger.info("Evaluating the fitted Prophet model on the test data...") self.forecast = self._model.predict(self._test_dt.copy().reset_index().drop('y', axis=1)) # confidence intervals self.lower_conf_int = pd.concat([self.lower_conf_int, pd.Series(np.asarray(self.forecast['yhat_lower']), index=self._test_dt.index)], axis=0) 
self.upper_conf_int = pd.concat([self.upper_conf_int, pd.Series(np.asarray(self.forecast['yhat_upper']), index=self._test_dt.index)], axis=0) self.residuals_forecast = pd.Series(np.asarray(self._test_dt['y']) - np.asarray(self.forecast['yhat']), index=self._test_dt.index) self.measure_rmse() self._prophet_logger.info("RMSE on test data: {}".format(self.rmse)) # plot if show_plot: self.plot_forecast() def ts_forecast(self, n_forecast, suppress): """Forecast time series over time frame in the future specified via n_forecast""" # n_forecast = super(ProphetForecaster, self)._check_ts_forecast(n_forecast) # self._prophet_logger.info("Fitting using all data....") self._mode = 'forecast' self.ts_fit(suppress=suppress) self._prophet_logger.info("Forecasting next " + str(n_forecast) + str(self.ts_df.index.freq)) # future = self._model.make_future_dataframe(periods=n_forecast, freq=self.freq) if self._weekend_seasonality: future['on_weekend'] = future['ds'].apply(self.we_season) future['off_weekend'] = ~future['ds'].apply(self.we_season) self.forecast = self._model.predict(future) # confidence intervals self.lower_conf_int = pd.concat([self.lower_conf_int, pd.Series(np.asarray(self.forecast['yhat_lower']), index=future.ds)], axis=0) self.upper_conf_int = pd.concat([self.upper_conf_int, pd.Series(np.asarray(self.forecast['yhat_upper']), index=future.ds)], axis=0) self.residuals_forecast = None self.plot_forecast() def plot_forecast(self): """Plot forecasted values""" if self.residuals_forecast is not None: fig, axes = super(ProphetForecaster, self)._plot_forecast(y=np.asarray(self._train_dt['y']), yhat=np.asarray(self.fittedvalues['yhat']), forecast=pd.Series( np.asarray(self.forecast['yhat']), index=self.forecast['ds']), _id='Prophet') else: fig_forecast = self._model.plot(self.forecast) fig_components = self._model.plot_components(self.forecast) if self._add_change_points: a = add_changepoints_to_plot(fig_forecast.gca(), self._model, self.forecast) plt.gcf().autofmt_xdate() 
plt.grid(True) plt.show()
class UVariateTimeSeriesForecaster(LinearForecaster, AutoARIMAForecaster, SARIMAForecaster,
                                   ExponentialSmoothingForecaster, ProphetForecaster, DLMForecaster):
    """Univariate time series class inheriting from all existing forecasters and choosing
    the best forecaster.

    Attributes
    ----------
    forecasters: list
        Lower-cased names of the forecasters to be used. Recognised names are 'prophet',
        'linear', 'arima', 'sarima', 'auto_arima', 'exponential smoothing', 'dlm' and 'all'.
        The best one is chosen as the final model; the measure of goodness is the rmse of
        a model on test data, so test data must be generated (n_test > 0).
    _dict_models: dictionary
        Dictionary keeping all candidate models, keyed by 'prophet', 'linear', 'arima',
        'sarima', 'auto_arima', 'expsm' and 'dlm'
    best_model: Object
        The best model, None until ts_test() has selected one
    _uvtsf_logger: Logger
        The logger for logging

    Methods
    ----------
    assertions()
        Assertion tests, must be overridden
    ts_fit()
        Fits all forecasters to time series
    ts_diagnose()
        Diagnoses every fitted forecaster
    ts_test()
        Tests all fitted forecasters on test data and selects the best one
    _select_best()
        Helper function to select the best model
    plot_residuals()
        Plots residuals for the best model (or for all fitted models if none is selected)
    ts_validate()
        Validates the best model
    ts_forecast()
        Forecasts time series and plots the results using the best model
    plot_forecast()
        Plots forecasted time-series
    """

    def __init__(self, forecasters=('all',), **kwds):
        """Initializes the object UVariateTimeSeriesForecaster.

        Parameters
        ----------
        forecasters: iterable of str, or str
            Names of the forecasters to build (case-insensitive). A single string is
            treated as a one-element list. Defaults to all forecasters.
        **kwds
            Forwarded unchanged to the parent initializers.
        """
        if isinstance(forecasters, str):
            # A bare string would otherwise be iterated character by character.
            forecasters = [forecasters]
        self.forecasters = [name.lower() for name in forecasters]
        self._dict_models = dict()
        self.best_model = None
        self._uvtsf_logger = Logger("uvtsf")

        try:
            super(UVariateTimeSeriesForecaster, self).__init__(**kwds)
        except TypeError:
            self._uvtsf_logger.exception("Arguments missing...")

        self._id = 'ts_forecaster'

        # One candidate per requested forecaster: a copy of this object re-typed to
        # the concrete forecaster class.
        use_all = 'all' in self.forecasters
        known_names = {'all'}
        for name, key, forecaster_cls in self._uvtsf_registry():
            known_names.add(name)
            if use_all or name in self.forecasters:
                candidate = self.__copy__()
                candidate.__class__ = forecaster_cls
                self._dict_models[key] = candidate

        unknown = [name for name in self.forecasters if name not in known_names]
        if unknown:
            self._uvtsf_logger.warning("Unknown forecasters ignored: {}".format(unknown))

        self.assertions()

    @staticmethod
    def _uvtsf_registry():
        """Returns (requested name, model key, forecaster class) triples in selection order.

        Built at call time, not at class-definition time, so the forecaster classes only
        have to be resolvable once an instance is created.
        """
        return (
            ('prophet', 'prophet', ProphetForecaster),
            ('linear', 'linear', LinearForecaster),
            ('arima', 'arima', ARIMAForecaster),
            ('sarima', 'sarima', SARIMAForecaster),
            ('auto_arima', 'auto_arima', AutoARIMAForecaster),
            ('exponential smoothing', 'expsm', ExponentialSmoothingForecaster),
            ('dlm', 'dlm', DLMForecaster),
        )

    def _uvtsf_require(self, condition, message):
        """Logs `message` with traceback and stops the program if `condition` is false.

        Implemented with an explicit raise instead of `assert`, because assert
        statements are removed when Python runs with -O.
        """
        if condition:
            return
        try:
            raise AssertionError(message)
        except AssertionError:
            self._uvtsf_logger.exception(message)
        sys.exit("STOP")

    def __copy__(self):
        """Copies the object.

        Note that the copy shares `_dict_models` and `best_model` with the original
        (references are assigned, nothing is duplicated).
        """
        result = super(UVariateTimeSeriesForecaster, self).__copy__()

        result.forecasters = self.forecasters
        result._dict_models = self._dict_models
        result.best_model = self.best_model
        return result

    def assertions(self):
        """Checks that forecasters is a list and every candidate model has test data.

        Exits the program via sys.exit("STOP") if a check fails.
        """
        self._uvtsf_require(isinstance(self.forecasters, list),
                            "Assertion exception occurred, list expected for forecasters")

        for model in self._dict_models.values():
            self._uvtsf_require(model.n_test > 0,
                                "Assertion exception occurred, no test data was generated! "
                                "This forecaster requires the test data")

    def ts_fit(self, suppress=False):
        """Fit all forecasters to the time series data.

        Parameters:
        ----------
        suppress: bool
            Suppress or not some of the output messages

        Returns
        -------
        self
        """
        for model in self._dict_models.values():
            if model is not None:
                model.ts_fit(suppress=suppress)

        return self

    def ts_diagnose(self):
        """Diagnoses all candidate models that have been fitted"""
        for model in self._dict_models.values():
            if model.model_fit is not None:
                model.ts_diagnose()

    def ts_test(self, show_plot=True):
        """Test every fitted model on the test data and select the best one.

        Parameters:
        ----------
        show_plot: bool
            Forwarded to the ts_test() of each candidate model
        """
        for model in self._dict_models.values():
            if model.model_fit is not None:
                model.ts_test(show_plot=show_plot)

        self._select_best()

    def _select_best(self):
        """Helper function to select the best model among fitted forecasters.

        The fitted model with the strictly smallest rmse wins; on ties the model that
        was registered first is kept. Models without an rmse (not tested) are skipped.
        """
        best_rmse = float('Inf')

        for model in self._dict_models.values():
            if model.model_fit is None:
                continue
            model_rmse = getattr(model, 'rmse', None)
            if model_rmse is None:
                # Fitted but never tested: nothing to compare.
                continue
            if model_rmse < best_rmse:
                best_rmse = model_rmse
                self.best_model = model

        if self.best_model is not None:
            # __name__ yields the class name regardless of how deep the module path is.
            self._uvtsf_logger.info("The best model selected as: {}".format(
                type(self.best_model).__name__))
        else:
            self._uvtsf_logger.warning(
                "No model has been fitted! Please call ts_fit()...")

    # NOTE: the original source carried a disabled public select_best(suppress=False)
    # wrapper here (ts_fit followed by ts_test, returning self). It stays disabled;
    # call ts_fit() and ts_test() directly.

    def plot_residuals(self):
        """Residual plots for the best model, or for all fitted models if none is selected"""
        if self.best_model:
            self.best_model.plot_residuals()
        else:
            for model in self._dict_models.values():
                if model.model_fit is not None:
                    model.plot_residuals()

    def ts_validate(self, suppress=True, show_plot=True):
        """Validates the best model.

        Exits the program via sys.exit("STOP") if no model has been selected yet.
        """
        if self.best_model is not None:
            self.best_model.ts_validate(suppress=suppress, show_plot=show_plot)
        else:
            self._uvtsf_logger.warning(
                "No model has been selected yet! Run ts_test() first, or restart."
            )
            sys.exit("STOP")

    def ts_forecast(self, n_forecast, suppress=False):
        """Forecast n_forecast steps in the future using the best model.

        Returns
        -------
        self
        """
        if self.best_model is not None:
            self.best_model.ts_forecast(n_forecast=n_forecast, suppress=suppress)
        else:
            self._uvtsf_logger.warning(
                "No model has been selected! Please call ts_test()...")

        return self

    def plot_forecast(self):
        """Plots forecasted values of the best model"""
        if self.best_model is not None:
            self.best_model.plot_forecast()
        else:
            self._uvtsf_logger.warning(
                "No model has been selected! Please call ts_fit()...")
class DLMForecaster(UVariateTimeSeriesClass):
    """Univariate time series child class using DLM of pydlm for forecasting,
    ref. to https://pydlm.github.io/pydlm_user_guide.html

    Attributes
    ----------
    _dlm_trend: dictionary
        Keys 'degree', 'discount', 'name' and optionally 'w' (defaults to 1e7)
    _dlm_seasonality: dictionary
        Keys 'period', 'discount', 'name' and optionally 'w' (defaults to 1e7)
    _dlm_dynamic: dictionary
        Must contain the key 'features' holding a list of dictionaries, each with keys
        'features', 'discount', 'name' and optionally 'w' (defaults to 1e7).
        Each 'features' entry must be as long as the data.
    _dlm_auto_reg: dictionary
        Keys 'degree', 'discount', 'name' and optionally 'w' (defaults to 1e7)
    _dlm_long_season: dictionary
        Keys 'period', 'stay', 'name' and optionally 'w' (defaults to 1e7)
    _use_rolling_window: bool
        Use rolling window in forward filtering yes/no
    _window_size: int
        Window length used when the rolling window is enabled; must be > 0
    _dlm_interval_width: float
        Confidence level used for the confidence intervals
    _dlm_logger: Logger
        The logger for logging

    Methods
    ----------
    assertions()
        Assertion tests, must be overridden
    set_params()
        Sets new parameter values
    get_params_dict()
        Gets parameter values as a dictionary
    ts_split()
        Splits the data and the dynamic features
    ts_fit()
        Fits the DLM model to time series
    ts_diagnose()
        Diagnoses the fitted model
    plot_dlm()
        Plot pydlm native plots
    plot_residuals()
        Generates residual plots
    ts_test()
        Evaluates fitted model on the test data, if this one has been generated
    ts_forecast()
        Forecasts time series and plots the results
    plot_forecast()
        Plots forecasted time-series
    """

    def __init__(self,
                 dlm_trend=None,
                 dlm_seasonality=None,
                 dlm_dynamic=None,
                 dlm_auto_reg=None,
                 dlm_long_season=None,
                 use_rolling_window=False,
                 window_size=0,
                 dlm_interval_width=0.95,
                 **kwds):
        """Initializes the object DLMForecaster.

        If dlm_trend is None a constant trend component
        (degree 0, discount 0.99, name 'trend1', w 1e7) is used.
        Remaining keyword arguments are forwarded to the parent initializer.
        """
        if dlm_trend is None:
            dlm_trend = {
                'degree': 0,
                'discount': 0.99,
                'name': 'trend1',
                'w': 1e7
            }

        self._model = None
        self.mse = None
        self._dlm_trend = dlm_trend
        self._dlm_seasonality = dlm_seasonality
        self._dlm_dynamic = dlm_dynamic
        self._dlm_auto_reg = dlm_auto_reg
        self._dlm_long_season = dlm_long_season
        self._use_rolling_window = use_rolling_window
        self._window_size = window_size
        self._dlm_interval_width = dlm_interval_width

        self._dlm_logger = Logger('dlm')

        # Runs before the parent initializer, so data-dependent checks are skipped
        # here when the data is not attached yet (see assertions()).
        self.assertions()

        try:
            super(DLMForecaster, self).__init__(**kwds)
        except TypeError:
            self._dlm_logger.exception("TypeError occurred, Arguments missing")

        self._id = 'DLM'
        self._train_dlm_dynamic = None  # features
        self._test_dlm_dynamic = None  # featureDict
        self._val_dlm_dynamic = None  # featureDict

    def _dlm_require(self, condition, message, fatal=True):
        """Logs `message` with traceback if `condition` is false.

        Uses an explicit raise instead of `assert`, because assert statements are
        removed when Python runs with -O.

        Parameters
        ----------
        condition: object
            Truth value to check
        message: str
            Message logged on failure
        fatal: bool
            If True the program is stopped via sys.exit("STOP") on failure

        Returns
        -------
        bool
            True if the condition holds, False otherwise (only reachable if not fatal)
        """
        if condition:
            return True
        try:
            raise AssertionError(message)
        except AssertionError:
            self._dlm_logger.exception(message)
        if fatal:
            sys.exit("STOP")
        return False

    def _dlm_validate_component(self, params, required, param_name, label):
        """Validates one optional component dictionary and defaults its 'w' to 1e7.

        Parameters
        ----------
        params: dictionary or None
            The component parameters; None means the component is not used
        required: list
            Keys that must be present
        param_name: str
            Name of the constructor parameter, used in the type error message
        label: str
            Human readable component name, used in the missing-keys message
        """
        if params is None:
            return
        self._dlm_require(
            isinstance(params, dict),
            "Assertion exception occurred, dictionary expected for " + param_name)
        present = list(params.keys())
        found = [key for key in keys_f(keys=list(required)) if key in present]
        self._dlm_require(
            len(found) == len(required),
            "Not all expected parameters found for {}. {} are necessary!".format(
                label, list(required)))
        params.setdefault('w', 1e7)

    def assertions(self):
        """Validates all DLM parameters.

        Missing 'w' entries are filled with 1e7. A failed check is logged and stops
        the program via sys.exit("STOP"); only a non-positive window size is
        non-fatal and merely disables the rolling window.
        """
        self._dlm_validate_component(self._dlm_trend,
                                     ['degree', 'discount', 'name'],
                                     'dlm_trend', 'trend')
        self._dlm_validate_component(self._dlm_seasonality,
                                     ['period', 'discount', 'name'],
                                     'dlm_seasonality', 'seasonality')
        self._dlm_validate_component(self._dlm_auto_reg,
                                     ['degree', 'discount', 'name'],
                                     'dlm_auto_reg', 'auto_reg')
        self._dlm_validate_component(self._dlm_long_season,
                                     ['period', 'stay', 'name'],
                                     'dlm_long_season', 'long season')

        if self._dlm_dynamic is not None:
            self._dlm_require(
                isinstance(self._dlm_dynamic, dict),
                "Assertion exception occurred, dictionary expected for dlm_dynamic")
            self._dlm_require(
                'features' in list(self._dlm_dynamic.keys()),
                "Assertion exception occurred, 'features' must be provided!")
            features = self._dlm_dynamic['features']
            self._dlm_require(
                isinstance(features, list),
                "Assertion exception occurred, list expected for 'features'")

            required = ['features', 'discount', 'name']
            for feature in features:
                present = list(feature.keys())
                found = [key for key in keys_f(keys=list(required)) if key in present]
                self._dlm_require(
                    len(found) == len(required),
                    "Not all expected parameters found for dynamic features. "
                    "['features', 'discount', 'name'] are necessary!")

            # features must have same length with the data.
            # __init__ calls assertions() before the parent initializer, so the data
            # may not be attached yet; the length check then has to wait for a later
            # call (e.g. from set_params()).
            ts_df = getattr(self, 'ts_df', None)
            for feature in features:
                if ts_df is not None:
                    self._dlm_require(
                        len(feature['features']) == len(ts_df),
                        "Assertion exception occurred. All provided features must"
                        " be of same length as your data!")
                if 'w' not in list(feature.keys()):
                    feature['w'] = 1e7

        if self._use_rolling_window:
            window_ok = self._dlm_require(
                self._window_size > 0,
                "Assertion exception occurred, zero window_size. "
                "No rolling window will be used",
                fatal=False)
            if not window_ok:
                self._use_rolling_window = False

    def __copy__(self):
        """Copies the object (the DLM parameter dictionaries are shared, not duplicated)"""
        result = super(DLMForecaster, self).__copy__()

        result._dlm_trend = self._dlm_trend
        result._dlm_seasonality = self._dlm_seasonality
        result._dlm_dynamic = self._dlm_dynamic
        result._dlm_auto_reg = self._dlm_auto_reg
        result._dlm_long_season = self._dlm_long_season
        result._use_rolling_window = self._use_rolling_window
        result._window_size = self._window_size
        result._dlm_interval_width = self._dlm_interval_width
        result._dlm_logger = self._dlm_logger
        return result

    def set_params(self, p_dict=None, **kwargs):
        """Sets new parameters.

        Parameters
        ----------
        p_dict: dictionary
            Parameter names and values; if given, it takes precedence over **kwargs
        **kwargs
            Parameter names and values

        Every key returned by get_params_dict() is accepted, plus the legacy
        spellings 'timeformat', 'dlm_autoReg' and 'dlm_longSeason'.
        Unknown keys are ignored. The parameters are re-validated afterwards.

        Returns
        -------
        self
        """
        params_dict = kwargs
        if p_dict is not None:
            params_dict = p_dict

        attribute_by_key = {
            'ts_df': 'ts_df',
            'freq': 'freq',
            'n_test': 'n_test',
            'n_val': 'n_val',
            'timeformat': 'time_format',
            'time_format': 'time_format',
            'dlm_trend': '_dlm_trend',
            'dlm_seasonality': '_dlm_seasonality',
            'dlm_dynamic': '_dlm_dynamic',
            'dlm_autoReg': '_dlm_auto_reg',
            'dlm_auto_reg': '_dlm_auto_reg',
            'dlm_longSeason': '_dlm_long_season',
            'dlm_long_season': '_dlm_long_season',
            'use_rolling_window': '_use_rolling_window',
            'window_size': '_window_size',
            'dlm_interval_width': '_dlm_interval_width',
        }
        for key, value in params_dict.items():
            attribute = attribute_by_key.get(key)
            if attribute is not None:
                setattr(self, attribute, value)

        self.assertions()

        return self

    def get_params_dict(self):
        """Gets parameters as dictionary"""
        return {
            'dlm_trend': self._dlm_trend,
            'dlm_seasonality': self._dlm_seasonality,
            'dlm_dynamic': self._dlm_dynamic,
            'dlm_auto_reg': self._dlm_auto_reg,
            'dlm_long_season': self._dlm_long_season,
            'use_rolling_window': self._use_rolling_window,
            'window_size': self._window_size,
            'dlm_interval_width': self._dlm_interval_width
        }

    def ts_split(self):
        """DLM extension of the parent ts_split()

        DLM needs to extend the ts_split of its parent class.
        The reason lies in dynamic features: this list of lists must be split too.
        Nothing beyond the parent split happens if there are no dynamic features
        or the mode is 'forecast'.

        Returns
        -------
        self
        """
        # call super
        super(DLMForecaster, self).ts_split()

        if self._dlm_dynamic is None or self._mode == 'forecast':
            return self

        # split dynamic features
        test_feat_dict = dict()
        val_feat_dict = dict()

        # NOTE(review): this is the same object as self._dlm_dynamic, so the training
        # slices below overwrite the originally supplied features in place. The
        # 'test and validate' branch appears to build on that state - confirm before
        # replacing the alias with a copy.
        self._train_dlm_dynamic = self._dlm_dynamic

        # Iterate over the features themselves, not over the keys of the dictionary.
        for train_feature in self._train_dlm_dynamic['features']:
            feats = train_feature['features']
            n_feats = len(feats)

            if self._mode == 'test and validate':
                if self._test_dlm_dynamic is not None:
                    # NOTE(review): append() adds the whole test slice as one single
                    # element; extend() may have been intended - confirm.
                    train_feature['features'].append(
                        self._test_dlm_dynamic[train_feature['name']])
                    # NOTE(review): validation features are set to the test features
                    # here - confirm this is intended.
                    self._val_dlm_dynamic = self._test_dlm_dynamic
                else:
                    self._dlm_logger.error("Something is wrong, mode!")
            else:
                # NOTE(review): the slice bounds below are assumed to mirror the
                # parent's split of the data - verify against the parent ts_split().
                if self._mode == 'test' and self.n_val == 0:
                    train_feature['features'] = feats[:(n_feats - 1 - self.n_test)]
                    test_feat_dict[train_feature['name']] = feats[(n_feats - self.n_test):]
                elif self._mode == 'validate':
                    train_feature['features'] = feats[:(n_feats - 1 - self.n_val)]
                    val_feat_dict[train_feature['name']] = feats[(n_feats - self.n_val):]
                elif self._mode == 'test' and self.n_val > 0:
                    train_feature['features'] = feats[:(n_feats - 1 - self.n_test - self.n_val)]
                    test_feat_dict[train_feature['name']] = \
                        feats[(n_feats - self.n_test - self.n_val):(n_feats - self.n_val - 1)]
                    val_feat_dict[train_feature['name']] = feats[(n_feats - self.n_val):]

        # now set
        if len(test_feat_dict):
            self._test_dlm_dynamic = test_feat_dict
        if len(val_feat_dict):
            self._val_dlm_dynamic = val_feat_dict

        return self

    def ts_fit(self, suppress=False):
        """Fit DLM to the time series data.

        Parameters:
        ----------
        suppress: bool
            Suppress or not some of the output messages

        Returns
        -------
        self on success, -1 if building or fitting the model raised an exception
        """
        self._prepare_fit()
        self._model = None
        self.ts_split()

        ts_df = self._train_dt.copy()

        # Fit
        self._dlm_logger.info("Trying to fit the DLM model....")
        try:
            if not suppress:
                self._dlm_logger.info("...via using parameters\n")
                print_attributes(self)

            ts_df = ts_df.reset_index()
            ts_df.columns = self._ts_df_cols

            self._model = dlm(ts_df['y'])

            # trend
            if self._dlm_trend is not None:
                self._model = self._model + trend(
                    degree=self._dlm_trend['degree'],
                    discount=self._dlm_trend['discount'],
                    name=self._dlm_trend['name'],
                    w=self._dlm_trend['w'])
            # seasonality
            if self._dlm_seasonality is not None:
                self._model = self._model + seasonality(
                    period=self._dlm_seasonality['period'],
                    discount=self._dlm_seasonality['discount'],
                    name=self._dlm_seasonality['name'],
                    w=self._dlm_seasonality['w'])
            # dynamic
            if self._train_dlm_dynamic is not None:
                for feature in self._train_dlm_dynamic['features']:
                    self._model = self._model + dynamic(
                        features=feature['features'],
                        discount=feature['discount'],
                        name=feature['name'],
                        w=feature['w'])
            # auto_reg
            if self._dlm_auto_reg is not None:
                self._model = self._model + autoReg(
                    degree=self._dlm_auto_reg['degree'],
                    discount=self._dlm_auto_reg['discount'],
                    name=self._dlm_auto_reg['name'],
                    w=self._dlm_auto_reg['w'])
            # long_season
            if self._dlm_long_season is not None:
                ls = longSeason(period=self._dlm_long_season['period'],
                                stay=self._dlm_long_season['stay'],
                                data=ts_df,
                                name=self._dlm_long_season['name'],
                                w=self._dlm_long_season['w'])
                self._model = self._model + ls

            if not suppress:
                self._dlm_logger.info("The constructed DLM model components:")
                print(self._model.ls())

            # tic
            start = time()
            if self._use_rolling_window:
                self._model.fitForwardFilter(useRollingWindow=True,
                                             windowLength=self._window_size)
                self._model.fitBackwardSmoother()
            else:
                self._model.fit()
            self.model_fit = self._model
            # toc
            if not suppress:
                self._dlm_logger.info("Time elapsed: {} sec.".format(time() - start))
        except Exception as e:
            self._dlm_logger.exception("DLM error...{}".format(e))
            return -1

        self._dlm_logger.info("Model successfully fitted to the data!")
        self._dlm_logger.info("Computing fitted values and residuals...")

        # Residuals
        self.residuals = pd.Series(self.model_fit.getResidual(),
                                   index=self._train_dt.index)
        try:
            # pydlm returns (upper bound, lower bound); use the configured
            # confidence level so in-sample and forecast intervals agree.
            upper, lower = self.model_fit.getInterval(p=self._dlm_interval_width)
            self.lower_conf_int = pd.Series(lower, index=self._train_dt.index)
            self.upper_conf_int = pd.Series(upper, index=self._train_dt.index)
        except ValueError as e:
            self._dlm_logger.exception(
                "Something went wrong in getInterval...{}".format(e))

        self.mse = self.model_fit.getMSE()

        # Fitted values
        # this is not elegant, but found no other way
        # NOTE(review): if getResidual() returns observed minus fitted, the fitted
        # values would be y - residuals; pydlm's getMean() may be the direct way -
        # confirm against the pydlm documentation.
        self.fittedvalues = self._train_dt['y'] + self.residuals

        return self

    def ts_diagnose(self):
        """Diagnoses the fitted model"""
        self.plot_residuals()

    def plot_dlm(self):
        """Plot pydlm native plots"""
        self.model_fit.plot("DLM native")

    def plot_residuals(self):
        """Plot the residuals."""
        fig, axis = super(DLMForecaster, self)._plot_residuals(
            y=np.asarray(self._train_dt['y']),
            yhat=np.asarray(self.fittedvalues),
            _id="DLM")
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()

    def ts_test(self, show_plot=True):
        """Test the fitted model if test data available.

        Parameters:
        ----------
        show_plot: bool
            Whether to plot the forecast on the test data

        Returns
        -------
        self, or None if the parent's _check_ts_test() reports a problem
        """
        if super(DLMForecaster, self)._check_ts_test() < 0:
            return

        n_steps = len(self._test_dt)

        self._dlm_logger.info(
            "Evaluating the fitted DLM model on the test data...")

        predict_kwargs = dict(N=n_steps, date=self._model.n - 1)
        if self._test_dlm_dynamic is not None:
            predict_kwargs['featureDict'] = self._test_dlm_dynamic
        (predict_mean, predict_var) = self._model.predictN(**predict_kwargs)

        self.forecast = pd.Series(np.asarray(predict_mean),
                                  index=self._test_dt.index)

        # confidence intervals
        cl, cu = self.compute_ci(yhat=np.asarray(predict_mean),
                                 yhat_var=np.asarray(predict_var),
                                 ci_level=self._dlm_interval_width)
        cl = pd.Series(cl, index=self._test_dt.index)
        cu = pd.Series(cu, index=self._test_dt.index)

        self.lower_conf_int = pd.concat([self.lower_conf_int, cl], axis=0)
        self.upper_conf_int = pd.concat([self.upper_conf_int, cu], axis=0)

        self.residuals_forecast = pd.Series(np.asarray(self._test_dt['y']) -
                                            np.asarray(self.forecast),
                                            index=self._test_dt.index)
        self.measure_rmse()
        self._dlm_logger.info("RMSE on test data: {}".format(self.rmse))
        # plot
        if show_plot:
            self.plot_forecast()

        return self

    def ts_forecast(self, n_forecast, suppress=False, features_dict=None):
        """Forecast time series over time frame in the future specified via n_forecast.

        The model is re-fitted in 'forecast' mode before forecasting.

        Parameters:
        ----------
        n_forecast: int
            Number of steps to forecast
        suppress: bool
            Suppress or not some of the output messages
        features_dict: dictionary
            Passed to pydlm as featureDict if not empty

        Returns
        -------
        self
        """
        n_forecast = super(DLMForecaster, self)._check_ts_forecast(n_forecast)

        self._dlm_logger.info("Fitting using all data....")
        self._mode = 'forecast'
        self.ts_fit(suppress=suppress)

        self._dlm_logger.info("Forecasting next " + str(n_forecast) +
                              str(self.freq))

        try:
            predict_kwargs = dict(N=n_forecast, date=self._model.n - 1)
            if features_dict is not None and len(features_dict) != 0:
                predict_kwargs['featureDict'] = features_dict
            (predict_mean, predict_var) = self._model.predictN(**predict_kwargs)
        except (NameError, ValueError) as e:
            self._dlm_logger.exception("DLM PredictN error...{}".format(e))
            sys.exit("STOP")

        idx_future = self._gen_idx_future(n_forecast=n_forecast)
        self.forecast = pd.Series(np.asarray(predict_mean), index=idx_future)

        # confidence intervals
        cl, cu = self.compute_ci(yhat=np.asarray(predict_mean),
                                 yhat_var=np.asarray(predict_var),
                                 ci_level=self._dlm_interval_width)
        cl = pd.Series(cl, index=idx_future)
        cu = pd.Series(cu, index=idx_future)

        self.lower_conf_int = pd.concat([self.lower_conf_int, cl], axis=0)
        self.upper_conf_int = pd.concat([self.upper_conf_int, cu], axis=0)

        self.residuals_forecast = None
        self.plot_forecast(n_forecast=n_forecast, features_dict=features_dict)
        return self

    def plot_forecast(self, **kwargs):
        """Plot forecasted values.

        With forecast residuals available (after ts_test()) the parent's forecast plot
        is drawn; otherwise pydlm's plotPredictN() is used with the optional keyword
        arguments 'n_forecast' (default -1) and 'features_dict' (default empty).
        """
        if self.residuals_forecast is not None:
            fig, axis = super(DLMForecaster, self)._plot_forecast(
                y=np.asarray(self._train_dt['y']),
                yhat=np.asarray(self.fittedvalues),
                forecast=self.forecast,
                _id='DLM')
            plt.gcf().autofmt_xdate()
            plt.grid(True)
            plt.show()
        else:
            n_forecast = kwargs.get('n_forecast', -1)
            features_dict = kwargs.get('features_dict', dict())
            print(features_dict)

            try:
                plot_kwargs = dict(N=n_forecast, date=self._model.n - 1)
                if features_dict is not None and len(features_dict) != 0:
                    plot_kwargs['featureDict'] = features_dict
                self._model.plotPredictN(**plot_kwargs)
            except (NameError, ValueError) as e:
                self._dlm_logger.exception(
                    "DLM plotPredictN error...{}".format(e))
                sys.exit("STOP")
class ARIMAForecaster(UVariateTimeSeriesClass):
    """Univariate time series child class for forecasting using ARIMA.

    Attributes
    ----------
    _order: tuple
        a tuple of p, d, q
    _arima_trend: str
        A parameter for controlling a model of the deterministic trend as one of 'nc' or 'c'.
        'c' includes constant trend, 'nc' no constant for trend.
    _arima_logger: Logger
        the logger

    Methods
    ----------
    assertions()
        Assertion tests, must be overrided
    set_params()
        Sets new parameter values
    get_params_dict()
        Gets parameter values as a dictionary
    ts_fit()
        Fits the ARIMA model to time series
    ts_diagnose()
        Diagnoses the fitted model
    plot_residuals()
        Generates residual plots
    ts_test()
        Evaluates fitted model on the test data, if this one has been generated
    ts_forecast()
        Forecasts time series and plots the results
    plot_forecast()
        Plots forecasted time-series
    """

    def __init__(self, order=(1, 0, 1), **kwds):
        """Initializes the object ARIMAForecaster.

        Parameters
        ----------
        order: tuple
            The (p, d, q) order handed to the ARIMA model.
        **kwds
            Forwarded unchanged to the parent initializer.
        """
        self._arima_logger = Logger("ARIMA")

        self._order = order
        self._arima_trend = ''

        try:
            super(ARIMAForecaster, self).__init__(**kwds)
        except TypeError as e:
            self._arima_logger.exception("Arguments missing...{}".format(e))

        self._model = None
        ARIMAForecaster._init_trend(self)
        self._ar_coef = None
        self._ma_coef = None

        ARIMAForecaster.assertions(self)

        self._id = 'ARIMA'

    def _init_trend(self):
        """Translate the generic ``self._trend`` setting into the ARIMA trend code.

        Any value of ``self._trend`` not listed below leaves ``_arima_trend`` untouched.
        """
        if self._trend == 'constant':
            self._arima_trend = 'c'
        elif self._trend is None:
            self._arima_trend = 'nc'
        elif self._trend in ['linear', 'constant linear', 'additive', 'add', 'multiplicative', 'mul']:
            # These settings have no dedicated code here; a constant trend is assumed instead.
            self._arima_trend = 'c'

    def __copy__(self):
        """Copies the object"""
        result = super(ARIMAForecaster, self).__copy__()

        result._order = self._order
        result._arima_trend = self._arima_trend
        result._arima_logger = self._arima_logger
        return result

    def assertions(self):
        """Validate the ARIMA specific settings; terminates via ``sys.exit("STOP")`` on failure.

        Explicit checks are used instead of ``assert`` so that the validation
        is still performed when Python runs with ``-O``.
        """
        if not isinstance(self._order, tuple):
            self._arima_logger.error("Assertion exception occurred, tuple expected")
            sys.exit("STOP")

        # A 'trend' entry among the hyper parameters takes the place of a fixed trend code.
        trend_in_hyper_params = (self.hyper_params is not None
                                 and len(self.hyper_params) != 0
                                 and 'trend' in list(self.hyper_params.keys()))
        trend_is_valid = self._arima_trend in ['c', 'nc'] or self._arima_trend is None
        if not (trend_in_hyper_params or trend_is_valid):
            self._arima_logger.error("Assertion Error, trend must be in ['c', 'nc']")
            sys.exit("STOP")

    def set_params(self, p_dict=None, **kwargs):
        """Sets new parameter values.

        Parameters
        ----------
        p_dict: dict, optional
            Parameter names mapped to new values. When given, it is used
            instead of ``**kwargs``.
        **kwargs
            Parameter names and values, used only if ``p_dict`` is None.

        Returns
        -------
        self
        """
        params_dict = kwargs
        if p_dict is not None:
            params_dict = p_dict

        # public parameter name -> attribute that stores it; unknown names are ignored
        attr_by_key = {
            'ts_df': 'ts_df',
            'freq': 'freq',
            'n_test': 'n_test',
            'n_val': 'n_val',
            'time_format': 'time_format',
            'order': '_order',
            'trend': '_arima_trend',
        }
        for key, value in params_dict.items():
            attr = attr_by_key.get(key)
            if attr is not None:
                setattr(self, attr, value)

        self.assertions()

        return self

    def get_params_dict(self):
        """Gets parameter values"""
        return {'order': self._order,
                'trend': self._arima_trend
                }

    def ts_fit(self, suppress=False):
        """Fit ARIMA to the time series data.

        If hyper parameters are set, a grid search is run and the state of the
        best forecaster found is copied into this object. Otherwise the model
        is fitted on the training data with the current order and trend.
        A failed fit is logged and leaves ``model_fit`` set to None.

        Parameters:
        ----------
        suppress: bool
            Suppress or not some of the output messages

        Returns
        -------
        self
        """
        if self.hyper_params is not None:
            self._gs.set_forecaster(self)
            self._gs.set_hyper_params(self.hyper_params)
            # a very important command here to avoid endless loop
            self.hyper_params = None
            self._arima_logger.info("***** Starting grid search *****")
            self._gs = self._gs.grid_search(suppress=suppress, show_plot=False)
            #
            self.best_model = self._gs.best_model
            self.__dict__.update(self.best_model['forecaster'].__dict__)
            self._arima_logger.info("***** Finished grid search *****")
        else:
            self._prepare_fit()
            self.ts_split()
            ARIMAForecaster._init_trend(self)
            ts_df = self._train_dt.copy()

            # Fit
            self._arima_logger.info("Trying to fit the ARIMA model....")
            # tic
            start = time()
            try:
                if not suppress:
                    self._arima_logger.info("...via using parameters\n")
                    print_attributes(self)

                self._model = ARIMA(ts_df['y'], order=self._order, freq=self.freq)
                self.model_fit = self._model.fit(trend=self._arima_trend, method='mle', disp=1)
            except Exception:
                self._arima_logger.exception("Exception occurred in the fit...")
                self._arima_logger.error("Please try other parameters!")
                self.model_fit = None
            else:
                # toc
                self._arima_logger.info("Time elapsed: {} sec.".format(time() - start))
                self._arima_logger.info("Model successfully fitted to the data!")
                if not suppress:
                    self._arima_logger.info("The model summary: " + str(self.model_fit.summary()))

                # Fitted values
                self._arima_logger.info("Computing fitted values and residuals...")
                self._ar_coef, self._ma_coef = self.model_fit.arparams, self.model_fit.maparams
                self.fittedvalues = self.model_fit.fittedvalues

                # prologue: when the fitted values are shorter/longer than the training data,
                # re-align them on the full training date range and back-fill the gaps
                if len(self.fittedvalues) != len(self._train_dt):
                    full_range = pd.date_range(ts_df.index[0], ts_df.index[len(ts_df) - 1], freq=self.freq)
                    self.fittedvalues = pd.DataFrame(index=full_range, columns=['dummy']).join(
                        pd.DataFrame(self.fittedvalues)).drop(['dummy'], axis=1)
                    self.fittedvalues = self.fittedvalues.reset_index()
                    self.fittedvalues.columns = self._ts_df_cols
                    self.fittedvalues.set_index('ds', inplace=True)
                    # Series.bfill() replaces the deprecated fillna(method='bfill')
                    self.fittedvalues['y'] = self.fittedvalues['y'].bfill()

                # Residuals
                super(ARIMAForecaster, self)._residuals()
                self._arima_logger.info("Done.")
        return self

    def ts_diagnose(self):
        """Diagnoses the model.

        In case of ARIMA residual plots are generated. Additionally, the kde plot of residuals is returned.
        Terminates via ``sys.exit("STOP")`` if no model has been fitted.
        """
        if self.model_fit is None:
            self._arima_logger.error("Model has to be fitted first! Please call ts_fit(...)")
            sys.exit("STOP")

        self.residuals.plot(kind='kde', title='Density')
        print("Residuals statistics")
        print(self.residuals.describe())
        self.plot_residuals()

    def plot_residuals(self):
        """Plot the residuals"""
        fig, axis = super(ARIMAForecaster, self)._plot_residuals(y=np.asarray(self._train_dt['y']),
                                                                 yhat=np.asarray(self.fittedvalues).flatten(),
                                                                 _id="ARIMA")
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()

    def ts_test(self, show_plot=True):
        """Test the fitted model if test data available.

        Parameters
        ----------
        show_plot: bool
            Whether the forecast on the test data is plotted.

        Returns
        -------
        self
            Returned also when the test is skipped, so calls can be chained.
        """
        if super(ARIMAForecaster, self)._check_ts_test() < 0:
            return self
        if self.model_fit is None:
            # the parent check may only log this condition; predicting would fail on None
            self._arima_logger.error("Model has to be fitted first! Please call ts_fit(...)")
            return self

        n_forecast = len(self._test_dt)
        n_train = len(self._train_dt.index)

        self._arima_logger.info("Evaluating the fitted ARIMA model on the test data...")
        future = self.model_fit.predict(start=n_train, end=n_train + n_forecast - 1, dynamic=True)
        self.forecast = pd.Series(future, index=self._test_dt.index)

        self.residuals_forecast = pd.Series(np.asarray(self._test_dt['y']) - np.asarray(self.forecast),
                                            index=self._test_dt.index)
        self.measure_rmse()
        self._arima_logger.info("RMSE on the test data: {}".format(self.rmse))
        # plot
        if show_plot:
            self.plot_forecast()

        return self

    def ts_forecast(self, n_forecast, suppress=False):
        """Forecast time series over time frame in the future specified via n_forecast.

        The model is re-fitted in 'forecast' mode before predicting.

        Parameters
        ----------
        n_forecast: int
            Number of periods to forecast.
        suppress: bool
            Suppress or not some of the output messages of the fit.

        Returns
        -------
        self
        """
        #
        n_forecast = super(ARIMAForecaster, self)._check_ts_forecast(n_forecast)
        #
        self._arima_logger.info("Fitting using all data....")
        self._mode = 'forecast'
        self.ts_fit(suppress=suppress)

        if self.model_fit is None:
            # ts_fit() logs a failed fit and leaves model_fit unset; there is nothing to predict with
            self._arima_logger.error("No fitted model available, forecast skipped!")
            return self

        self._arima_logger.info("Forecasting next " + str(n_forecast) + str(self.freq))
        #
        n_train = len(self._train_dt.index)
        future = self.model_fit.predict(start=n_train, end=n_train + (n_forecast - 1), dynamic=True)
        idx_future = self._gen_idx_future(n_forecast=n_forecast)
        self.forecast = pd.Series(future, index=idx_future)
        # no observed values exist for the future, hence no forecast residuals
        self.residuals_forecast = None
        #
        self.plot_forecast()
        return self

    def plot_forecast(self):
        """Plot forecasted values"""
        fig, axis = super(ARIMAForecaster, self)._plot_forecast(y=np.asarray(self._train_dt['y']),
                                                                yhat=np.asarray(self.fittedvalues).flatten(),
                                                                forecast=self.forecast, _id='ARIMA')
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()
class UVariateTimeSeriesClass(object):
    """
    Uni-variate time series class

    Attributes:
        _ts_df_cols - internal column names for dataframe that will be input to model
        ts_df - time series data frame
        freq - frequency of time series, possibilities ['S', 'min', 'H', 'D', 'W', 'M']
        p_train - float value defining which part of data is to be used as training data. Note, value of 1.0 would
                  mean all data will be used as training data, hence no test data will be generated.
        time_format - time format if time series data needs to be brought into datetime
        transform - name of the transformation applied via ts_transform(), None if none
        _mode - defines the mode as 'test' or 'forecast'
        _train_dt - training data
        _test_dt - test data

        model_fit - fitted model
        fittedvalues - computed fitted values
        residuals - residuals
        rmse - RMSE on test set (test data and the forecast on test data)

        upper_whisker_res - upper whisker for residuals
        lower_conf_int - lower confidence interval
        upper_conf_int - upper confidence interval

        forecast - computed forecasted values
        residuals_forecast - residuals between forecasted and real values. Note, this variable exist only if test data
                             existed

    Methods:
        ts_transform() - transforms time series using log10 or box-cox
        set_frequency() - sets a new frequency and resamples
        ts_check_frequency() - reports the frequency of the time series index
        ts_resample() - resamples time series at the chosen frequency freq
        _plot_residuals() - residual plots helper function
        ts_test() - evaluates fitted model on the test data, if this one has been generated
        ts_forecast() - forecasts time series and plots the results
        _plot_forecast() - helper function for plotting forecasted time-series
        ts_decompose() - decomposes time series in seasonal, trend and residuals
        plot_decompose() - plots the results of ts_decompose()
        measure_rmse() - computes RMSE from the forecast residuals

    Helper methods:
        _prepare_fit() - prepares ts_fit of child class. Supposed to be called by a child class
        _residuals() - helper function for calculating residuals. Supposed to be called by a child class
        _check_ts_test() - checks for test. Supposed to be called by a child class
        _check_ts_forecast() - checks for forecast. Supposed to be called by a child class
        _gen_idx_future() - builds the date index used for forecasted values
    """

    # Frequencies accepted by __init__() and set_frequency()
    _VALID_FREQS = ('S', 'min', 'H', 'D', 'W', 'M')

    def __init__(self, ts_df, time_format="%Y-%m-%d %H:%M:%S", freq='D', p_train=1.0, **kwds):
        """
        Initializes the object UVariateTimeSeriesForecaster

        Parameters:
            ts_df - time series data; it is re-indexed into the internal ['ds', 'y'] layout
            time_format - strptime format used when the index is not a DatetimeIndex yet
            freq - frequency, falls back to 'D' when not one of the supported values
            p_train - part of the data used for training, must be convertible to float and > 0
            **kwds - forwarded to the next class in the initialization chain
        """
        self._ts_df_cols = ['ds', 'y']

        self.ts_df = ts_df
        self.time_format = time_format
        self.freq = freq
        self.p_train = p_train
        self.transform = None
        self._boxcox_lmbda = None

        self._mode = ''

        self._train_dt = None
        self._test_dt = None

        self.model_fit = None
        self.fittedvalues = None
        self.residuals = None
        self.rmse = None

        self.upper_whisker_res = None
        self.lower_conf_int = None
        self.upper_conf_int = None

        self.forecast = None
        self.residuals_forecast = None

        self.seasonal = None
        self.trend = None
        self.baseline = None

        self._uvts_cls_logger = Logger('uvts_cls')

        # Validation is written with explicit checks so that it also runs under "python -O"
        if self.freq not in self._VALID_FREQS:
            self._uvts_cls_logger.warning("freq should be in ['S', 'min', 'H', 'D', W', 'M']. "
                                          "Assuming daily frequency!")
            self.freq = 'D'

        try:
            self.p_train = float(self.p_train)
        except ValueError:
            self._uvts_cls_logger.error("p_train must be convertible to float type!")
            self._uvts_cls_logger.exception("Exception occurred, p_train")
        else:
            if self.p_train > 0:
                # int() truncates: a value below 1 leaves data for testing
                self._mode = 'test' if int(self.p_train) < 1 else 'forecast'
            else:
                self._uvts_cls_logger.error("p_train defines part of data on which you would train your model."
                                            "This value cannot be less than or equal to zero!")
                self._uvts_cls_logger.error("Exception occurred, p_train")

        if pd.DataFrame(self.ts_df).shape[1] > 2:
            self._uvts_cls_logger.error(
                "Time series must be uni-variate. "
                "Hence, at most a time columns and a column of numeric values are expected!")
            self._uvts_cls_logger.error("Exception occurred, ts_df")
        else:
            self.ts_df = self.ts_df.reset_index()
            self.ts_df.columns = self._ts_df_cols
            # non-numeric entries become NaN; the column is always float64 afterwards
            self.ts_df['y'] = pd.to_numeric(self.ts_df['y'], errors='coerce').astype(np.float64)
            self.ts_df.set_index('ds', inplace=True)
            self._uvts_cls_logger.info("Using time series data of range: " + str(min(self.ts_df.index)) + ' - ' + str(
                max(self.ts_df.index)) + " and shape: " + str(self.ts_df.shape))

        if not isinstance(self.ts_df.index, pd.DatetimeIndex):
            self._uvts_cls_logger.warning("Time conversion required...")
            self._convert_time_column()
        #
        self.ts_df.index = pd.to_datetime(self.ts_df.index)
        self.ts_df.sort_index(inplace=True)
        # resample
        self.ts_resample()

        # delegate
        super(UVariateTimeSeriesClass, self).__init__(**kwds)

    @staticmethod
    def _normalise_zulu(value):
        """
        Returns str(value) with every 'T' replaced by a blank and every 'Z' removed
        """
        return str(value).translate({ord('T'): ' ', ord('Z'): None})

    def _convert_time_column(self):
        """
        Parses the time index of ts_df into datetime objects using time_format.
        First the values are normalised as zulu time strings, then the raw strings are tried.
        """
        self.ts_df = self.ts_df.reset_index()
        try:
            self.ts_df['ds'] = self.ts_df['ds'].apply(
                lambda x: datetime.datetime.strptime(self._normalise_zulu(x), self.time_format))
        except ValueError as e:
            self._uvts_cls_logger.warning("Zulu time conversion not successful: {}".format(e))
            self._uvts_cls_logger.warning("Will try without assuming zulu time...")
            try:
                self.ts_df['ds'] = self.ts_df['ds'].apply(
                    lambda x: datetime.datetime.strptime(str(x), self.time_format))
            except ValueError as e:
                self._uvts_cls_logger.info("Time conversion not successful. Check your time_format: {}".format(e))
            else:
                self._uvts_cls_logger.info("Time conversion successful!")
        else:
            self._uvts_cls_logger.info("Time conversion successful!")
        # set index
        self.ts_df.set_index('ds', inplace=True)

    def __copy__(self):
        """
        Copies the object
        """
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memo):
        """
        Deepcopies the object
        """
        cls = self.__class__
        result = cls.__new__(cls)
        # register the copy first so that self-references resolve to it
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

    def ts_transform(self, transform):
        """
        Transforms time series via applying casted 'transform'. Right now 'log10' and 'box-cox' possible.
        The name is matched case-insensitively and ignoring surrounding whitespace.
        For 'box-cox' the user is asked interactively whether to provide lambda.
        Returns self.
        """
        name = transform.lower().strip()
        if name not in ['log10', 'box-cox']:
            self._uvts_cls_logger.error(
                "transform should be in ['log10', 'box-cox'] or empty. Assuming no transform! "
                "Hence, if you get bad results, you would like maybe to choose e.g., log10 here.")
            self._uvts_cls_logger.error("Assertion exception occurred, transform")
            self.transform = None
            return self

        # store the normalised name, otherwise padded input would match no branch below
        self.transform = name

        # transform
        if self.transform == 'log10':
            try:
                self.ts_df['y'] = self.ts_df['y'].apply(np.log10)
            except ValueError:
                self._uvts_cls_logger.exception("log10 transformation did not work! Possibly negative "
                                                "values present?")
        elif self.transform == 'box-cox':
            if input("Do you want to provide lambda for box.cox? y/n?").strip().lower() == 'y':
                self._boxcox_lmbda = float(input())
            else:
                self._boxcox_lmbda = None
            try:
                if self._boxcox_lmbda is None:
                    # with lmbda=None scipy estimates lambda and returns the transformed data with it
                    transformed, _ = stats.boxcox(self.ts_df['y'], lmbda=None)
                    self.ts_df['y'] = transformed
                else:
                    self.ts_df['y'] = stats.boxcox(self.ts_df['y'], lmbda=self._boxcox_lmbda)
            except ValueError:
                self._uvts_cls_logger.exception("box-cox transformation did not work! "
                                                "Possibly negative values present or bad lmbda?")
        return self

    def set_frequency(self, new_freq):
        """
        Sets new frequency and resamples time series to that new frequency
        """
        if new_freq not in self._VALID_FREQS:
            self._uvts_cls_logger.error("frequency should be in ['S', 'min', 'H', 'D', W', 'M']")
        else:
            self.freq = new_freq
            self.ts_resample()

    def ts_check_frequency(self):
        """
        Checks the frequency of time series.
        If the index carries no frequency, optionally plots a histogram of the time deltas in hours.
        """
        if self.ts_df.index.freq is None:
            self._uvts_cls_logger.info("No specific frequency detected.")
            self._uvts_cls_logger.info("Frequency chosen in initialization: " + str(
                self.freq) + " enter 'n' and call ts_resample() if you are satisfied with this value.")
            if input("Should a histogram of time deltas be plotted y/n?").strip().lower() == 'y':
                # differences between consecutive time stamps, truncated to whole hours
                ff = pd.Series(self.ts_df.index[1:(len(self.ts_df))] - self.ts_df.index[0:(len(self.ts_df) - 1)])
                ff = ff.apply(lambda x: int(x.total_seconds() / (60 * 60)))
                plt.hist(ff, bins=120)
                plt.xlabel("Rounded time delta [H]")
                plt.ylabel("Frequency of occurrence")
                self._uvts_cls_logger.info(ff.value_counts())
                self._uvts_cls_logger.info("Should hourly frequency not fit, choose a reasonable frequency and call "
                                           "set_frequency(new_freq)")
        else:
            self._uvts_cls_logger.info("Time series frequency: " + str(self.ts_df.index.freq))

    def ts_resample(self):
        """
        Brings original time series to the chosen frequency freq.
        Gaps are forward-filled; the process is terminated via sys.exit(-1) if NaN values remain.
        Returns self.
        """
        ts_freq = pd.DataFrame(
            index=pd.date_range(self.ts_df.index[0], self.ts_df.index[len(self.ts_df) - 1], freq=self.freq),
            columns=['dummy'])
        self.ts_df = ts_freq.join(self.ts_df).drop(['dummy'], axis=1)
        # Series.ffill() replaces the deprecated fillna(method='ffill')
        self.ts_df['y'] = self.ts_df['y'].ffill()
        if np.isnan(self.ts_df['y']).any():
            self._uvts_cls_logger.warning("Some NaN found, something went wrong, check the data!")
            sys.exit(-1)

        self._uvts_cls_logger.info("Time series resampled at frequency: " + str(self.ts_df.index.freq) +
                                   ". New shape of the data: " + str(self.ts_df.shape))
        return self

    def _prepare_fit(self):
        """
        Prepares data for training or forecasting modes.
        In 'forecast' mode all data become training data and the test data are an empty frame.
        In 'test' mode the first p_train part of the rows is used for training, the rest for testing.
        Returns self.
        """
        if self.ts_df.index.freq is None:
            self._uvts_cls_logger.warning("Time series exhibit no frequency. Calling ts_resample()...")
            try:
                self.ts_resample()
            except ValueError:
                self._uvts_cls_logger.error("Resample did not work! Error:" + str(sys.exc_info()[0]))
                sys.exit("STOP")

        if self._mode == 'forecast' or int(self.p_train) == 1:
            self._train_dt = self.ts_df
            self._test_dt = pd.DataFrame()
        elif self._mode == 'test' and int(self.p_train) < 1:
            # split on positional row labels; .loc slices include the end label
            frame = self.ts_df.reset_index()
            frame.columns = self._ts_df_cols
            n_rows = len(frame)
            # training
            train = frame.loc[:int(self.p_train * n_rows - 1)].set_index('ds')
            # test
            test = frame.loc[int(self.p_train * n_rows):].set_index('ds')
            # now set
            self._train_dt = train
            if not test.empty:
                self._test_dt = test

        return self

    def _residuals(self):
        """
        Calculate residuals of the fitted values on the training data and the upper whisker.
        Terminates via sys.exit("STOP") if no model has been fitted. Returns self.
        """
        if self.model_fit is None:
            self._uvts_cls_logger.error("No model has been fitted, residuals cannot be computed!")
            sys.exit("STOP")

        try:
            # use fittedvalues to fill in the model dictionary
            self.residuals = pd.Series(np.asarray(self._train_dt['y']) - np.asarray(self.fittedvalues).flatten(),
                                       index=self._train_dt['y'].index)
            self.upper_whisker_res = self.residuals.mean() + 1.5 * (
                    self.residuals.quantile(0.75) - self.residuals.quantile(0.25))
        except (KeyError, AttributeError):
            self._uvts_cls_logger.exception("Exception occurred: Model was not fitted or ts has other structure")

        return self

    def _plot_residuals(self, y, yhat, _id):
        """
        Plot the residuals.
        Upper panel: real and fitted values, lower panel: residuals and +/- upper whisker.
        Returns the figure and its axes.
        """
        if self.model_fit is None:
            self._uvts_cls_logger.error("Model has to be fitted first! Please call ts_fit(...)")

        fig, axes = plt.subplots(2, 1, figsize=(20, 5), sharex=True)

        axes[0].plot(pd.Series(yhat, index=self._train_dt.index), color='y', linewidth=2.0)
        axes[0].plot(pd.Series(y, index=self._train_dt.index), color='b')

        axes[0].set_ylabel("Model Fit")
        axes[0].set_title("Real (blue) and estimated values, " + str(_id))
        #
        axes[1].plot(self.residuals, color="r")
        if self.forecast is not None and self.residuals_forecast is None \
                and self.lower_conf_int is not None and self.upper_conf_int is not None:
            axes[0].fill_between(self.lower_conf_int.index, self.lower_conf_int, self.upper_conf_int, color='k',
                                 alpha=.15)
        if self.upper_whisker_res is not None:
            axes[1].axhline(y=self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker',
                            linestyle='--', linewidth=1.5)
            axes[1].axhline(y=-self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker',
                            linestyle='--', linewidth=1.5)

        axes[1].set_ylabel('Residuals')
        axes[1].set_title('Difference between model output and the real data and +/- upper whisker, ' + str(_id))

        return fig, axes

    def _check_ts_test(self):
        """
        Check before ts_test is child class is called.
        Returns -1 if no model has been fitted or there are no test data, otherwise sets the
        mode to 'test' and returns 0.
        """
        if self.model_fit is None:
            self._uvts_cls_logger.error("Model has to be fitted first! Please call ts_fit(...)")
            return -1

        # _prepare_fit() stores an empty frame in forecast mode, which is nothing to validate either
        if self._test_dt is None or len(self._test_dt) == 0:
            self._uvts_cls_logger.error("Nothing to validate. "
                                        "Call ts_forecast() or specify amount of training data "
                                        "when initializing the object.")
            return -1

        self._mode = 'test'
        return 0

    def _check_ts_forecast(self, n_forecast):
        """
        Check before ts_forecast in child class is called.
        Problems are only logged; n_forecast is returned converted to int where that was possible.
        """
        #
        try:
            n_forecast = int(n_forecast)
        except ValueError:
            self._uvts_cls_logger.exception("n_forecast must be convertible to int type!")
        else:
            if not 0 < n_forecast < len(self._train_dt):
                self._uvts_cls_logger.error(
                    "Number of periods to be forecasted is too low, too high or not numeric!")

        if self.model_fit is None:
            self._uvts_cls_logger.error("Model has to be fitted first! Please call ts_fit(...)")

        return n_forecast

    def _gen_idx_future(self, n_forecast):
        """
        Generates the date index for n_forecast forecasted values.
        The range starts at the last training time stamp and ends (n_forecast - 1) periods later.
        Returns None if freq is not one of the supported frequencies.
        """
        last_stamp = max(self._train_dt.index)
        # frequency -> keyword of datetime.timedelta measuring one period
        timedelta_unit = {'S': 'seconds', 'min': 'minutes', 'H': 'hours', 'D': 'days', 'W': 'weeks'}

        if self.freq in timedelta_unit:
            end_stamp = last_stamp + datetime.timedelta(**{timedelta_unit[self.freq]: n_forecast - 1})
        elif self.freq == 'M':
            # months have no fixed length, hence calendar arithmetic
            end_stamp = last_stamp + relativedelta(months=+(n_forecast - 1))
        else:
            return None

        return pd.date_range(start=last_stamp, end=end_stamp, freq=self.freq)

    def _plot_forecast(self, y, yhat, forecast, _id):
        """
        Plot forecasted values.
        Terminates via sys.exit("STOP") if no model is fitted or no forecast exists.
        Returns the figure and its axes.
        """
        if self.model_fit is None:
            self._uvts_cls_logger.error("Model has to be fitted first! Please call ts_fit(...)")
            sys.exit("STOP")
        #
        if self.forecast is None:
            self._uvts_cls_logger.error("Neither ts_test(...) nor ts_forecast(...) have been called yet!")
            sys.exit("STOP")

        fig, axes = plt.subplots(2, 1, figsize=(20, 7), sharex=True)
        #
        axes[0].plot(pd.Series(yhat, index=self._train_dt.index), color='y', linewidth=2.0)
        axes[0].plot(pd.Series(y, index=self._train_dt.index), color='b')
        #
        if self.residuals_forecast is not None:
            # test run: show the whole series so the forecast can be compared with real values
            axes[0].plot(self.ts_df, color='b')
        axes[0].plot(forecast, color='darkgreen')
        #
        if self.lower_conf_int is not None and self.upper_conf_int is not None:
            axes[0].fill_between(self.lower_conf_int.index,
                                 self.lower_conf_int,
                                 self.upper_conf_int,
                                 color='k', alpha=.15)
        axes[0].set_ylabel("Fit and Forecast/Validation")
        axes[0].set_title("Real (blue), estimated (yellow) and forecasted values, " + str(_id))
        #
        if self.residuals_forecast is not None:
            axes[1].plot(pd.concat([self.residuals, self.residuals_forecast], axis=0), color='r')
        axes[1].plot(self.residuals, color="r")

        if self.upper_whisker_res is not None:
            axes[1].axhline(y=self.upper_whisker_res,
                            xmin=0,
                            xmax=1, color='m',
                            label='upper_whisker',
                            linestyle='--', linewidth=1.5)
            axes[1].axhline(y=-self.upper_whisker_res,
                            xmin=0,
                            xmax=1, color='m',
                            label='upper_whisker',
                            linestyle='--', linewidth=1.5)
        axes[1].set_ylabel("Residuals")
        axes[1].set_title("Difference between model output and the real data both, for fitted "
                          "and forecasted and +/- upper whisker or confidence intervals, " + str(_id))

        return fig, axes

    def ts_decompose(self, params=None):
        """
        Decomposes time series into trend, seasonal and residual.
        params is a dictionary with the key 'model' and optionally 'freq' (1 if missing);
        it defaults to an additive model. The dictionary passed in is not modified.
        """
        if params is None:
            params = dict({'model': 'additive', 'freq': 1})

        if not isinstance(params, dict):
            self._uvts_cls_logger.error("Dictionary is expected for parameters!")
            sys.exit("STOP")

        if 'model' not in params:
            self._uvts_cls_logger.error("Unexpected dictionary keys. At least decomposition "
                                        "model must be supplied!")
            sys.exit("STOP")

        # read with a default instead of writing the default into the caller's dictionary
        freq = params.get('freq', 1)

        try:
            if self.ts_df.index.freq is not None:
                res = seasonal_decompose(self.ts_df.loc[:, 'y'], model=params['model'])
            else:
                res = seasonal_decompose(self.ts_df.loc[:, 'y'], model=params['model'], freq=freq)
        except ValueError:
            self._uvts_cls_logger.exception("ValueError, seasonal_decompose error")
        else:
            self.seasonal = res.seasonal
            self.trend = res.trend
            self.baseline = self.seasonal + self.trend
            self.residuals = res.resid
            self.upper_whisker_res = self.residuals.mean() + 1.5 * (
                    self.residuals.quantile(0.75) - self.residuals.quantile(0.25))

    def plot_decompose(self):
        """
        Plots trend, seasonality, baseline and residuals; calls ts_decompose() first if needed
        """
        if self.seasonal is None:
            self.ts_decompose()

        fig, axes = plt.subplots(4, 1, figsize=(20, 7), sharex=True)
        axes[0].plot(self.trend)
        axes[0].set_title("Trend")
        #
        axes[1].plot(self.seasonal)
        axes[1].set_title("Seasonality")
        #
        axes[2].plot(self.baseline)
        axes[2].set_title("Baseline")
        #
        axes[3].plot(self.residuals)
        axes[3].set_title("Residuals")
        #
        if self.upper_whisker_res is not None:
            axes[3].axhline(y=self.upper_whisker_res,
                            xmin=0,
                            xmax=1, color='m',
                            label='upper_whisker',
                            linestyle='--', linewidth=1.5)
            axes[3].axhline(y=-self.upper_whisker_res,
                            xmin=0,
                            xmax=1, color='m',
                            label='upper_whisker',
                            linestyle='--', linewidth=1.5)

        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()

    def ts_fit(self):
        """
        End of the ts_fit delegation chain; child classes implement the actual fit
        """
        # stop the delegation chain
        assert not hasattr(super(), 'ts_fit')

    def measure_rmse(self):
        """
        Computes the root mean squared error (rmse) from the forecast residuals.
        Without forecast residuals the problem is logged and rmse is set to None.
        """
        if self.residuals_forecast is None:
            self._uvts_cls_logger.error("AssertionError occurred, Cannot compute RMSE! Check your object mode")
            self.rmse = None
            return

        self.rmse = np.sqrt(np.square(self.residuals_forecast).mean())
class AutoARIMAForecaster(UVariateTimeSeriesClass): """Univariate time series child class using pmdarima.auto_arima for forecasting Attributes ---------- ref. to https://pypi.org/project/pmdarima/ https://www.alkaline-ml.com/pmdarima/modules/generated/pmdarima.arima.AutoARIMA.html#pmdarima.arima.AutoARIMA _start_p: int The starting value for p _start_q: int The starting value for q _test: str Test for determining the value of d _max_p: int The maximal value for p: all values between _start_p and this one will be tried out _max_q: int The maximal value for q: all values between _start_q and this one will be tried out _d: int The maximum value of d, or the maximum number of non-seasonal differences. If None, this value will be determined. _seasonal: bool Seasonal component yes/no _D: The order of the seasonal differencing. If None, the value will automatically be selected based on the results of the seasonal_test. _start_P: int The starting value for P _start_Q: int The starting value for Q _max_P: int The maximum value for P _max_Q: int The maximum value for Q _seasonal_periods (m in original package): int The period for seasonal differencing, m refers to the number of periods in each season. For example, m is 4 for quarterly data, 12 for monthly data, or 1 for annual (non-seasonal) data. Default is 1. Note that if m == 1 (i.e., is non-seasonal), seasonal will be set to False. _aarima_trend: str or iterable, default=’c’, ref. http://www.alkaline-ml.com/pmdarima/1.0.0/modules/generated/pmdarima.arima.auto_arima.html Parameter controlling the deterministic trend polynomial A(t). Can be specified as a string where ‘c’ indicates a constant (i.e. a degree zero component of the trend polynomial), ‘t’ indicates a linear trend with time, and ‘ct’ is both. Can also be specified as an iterable defining the polynomial as in numpy.poly1d, where [1,1,0,1] would denote a+bt+ct3. 
_random : bool, optional (default=False) Auto_arima provides the capability to perform a “random search” over a hyper-parameter space. If random is True, rather than perform an exhaustive search or stepwise search, only n_fits ARIMA models will be fit (stepwise must be False for this option to do anything). _n_fits : int, optional (default=10) If random is True and a “random search” is going to be performed, n_iter is the number of ARIMA models to be fit. _stepwise : bool, optional (default=True) Whether to use the stepwise algorithm outlined in Hyndman and Khandakar (2008) to identify the optimal model parameters. The stepwise algorithm can be significantly faster than fitting all (or a random subset of) hyper-parameter combinations and is less likely to over-fit the model. _information_criterion : str, optional (default=’aic’) The information criterion used to select the best ARIMA model. One of pmdarima.arima.auto_arima.VALID_CRITERIA, (‘aic’, ‘bic’, ‘hqic’, ‘oob’). _scoring : str, optional (default=’mse’) If performing validation (i.e., if out_of_sample_size > 0), the metric to use for scoring the out-of-sample data. One of {‘mse’, ‘mae’} _out_of_sample_size : int, optional (default=0) The ARIMA class can fit only a portion of the data if specified, in order to retain an “out of bag” sample score. This is the number of examples from the tail of the time series to hold out and use as validation examples. The model will not be fit on these samples, but the observations will be added into the model’s endog and exog arrays so that future forecast values originate from the end of the endogenous vector. 
_aarima_logger: Logger The logger for logging Methods ---------- assertions() Assertion tests, must be overrided set_params() Sets new parameter values get_params_dict() Gets parameter values as a dictionary ts_fit() Fits the auto_arima model to time series ts_diagnose() Diagnoses the fitted model plot_residuals() Generates residual plots ts_test() Evaluates fitted model on the test data, if this one has been generated ts_forecast() Forecasts time series and plots the results plot_forecasts() Plots forecasted time-series """ def __init__(self, start_p=1, start_q=1, max_p=3, max_q=3, d=None, D=None, start_P=1, start_Q=1, max_P=3, max_Q=3, random=False, n_fits=10, stepwise=True, information_criterion='aic', scoring='mse', out_of_sample_size=0, **kwds): """Initializes the object AutoARIMAForecaster""" self._aarima_logger = Logger("AutoARIMA") self._aarima_seasonal = False self._aarima_trend = 'c' self._start_p = start_p self._start_q = start_q self._max_p = max_p self._max_q = max_q self._d = d self._D = D self._start_P = start_P self._start_Q = start_Q self._max_P = max_P self._max_Q = max_Q self._random = random self._n_fits = n_fits self._stepwise = stepwise self._information_criterion = information_criterion self._scoring = scoring self._out_of_sample_size = out_of_sample_size try: super(AutoARIMAForecaster, self).__init__(**kwds) except TypeError: self._aarima_logger.exception("Arguments missing...") AutoARIMAForecaster._init_trend(self) AutoARIMAForecaster._init_seasonal(self) AutoARIMAForecaster.assertions(self) self._id = 'Auto_ARIMA' def _init_trend(self): if self._trend is None or self._trend == 'constant': self._aarima_trend = 'c' elif self._trend == 'linear': self._aarima_trend = 't' elif self._trend == 'constant linear': self._aarima_trend = 'ct' elif self._trend in ['additive', 'add']: # self._aarima_logger.warning("The trend " + str(self._trend) + " not supported by AutoARIMA! 
" # "Assuming first order trend") self._aarima_trend = 'a+bt' elif self._trend in ['multiplicative', 'mul']: # self._aarima_logger.warning("The trend " + str(self._trend) + " not supported by AutoARIMA! " # "Assuming first order trend") self._aarima_trend = 'a+bt' def _init_seasonal(self): if self._seasonal is None: self._aarima_seasonal = False if isinstance(self._seasonal, bool): self._aarima_seasonal = self._seasonal else: self._aarima_seasonal = False def __copy__(self): """Copies the object""" result = super(AutoARIMAForecaster, self).__copy__() result._start_p = self._start_p result.start_q = self._start_q result._test = self._test result._max_p = self._max_p result._max_q = self._max_q result._d = self._d result._aarima_trend = self._aarima_trend result._aarima_seasonal = self._aarima_seasonal result._D = self._D result._start_P = self._start_P result._start_Q = self._start_Q result._max_P = self._max_P result._max_Q = self._max_Q result._random = self._random result._n_fits = self._n_fits result._stepwise = self._stepwise result._information_criterion = self._information_criterion result._scoring = self._scoring result._out_of_sample_size = self._out_of_sample_size result._aarima_logger = self._aarima_logger return result def assertions(self): try: assert self.hyper_params is None except AssertionError: self._aarima_logger.exception( "Hyper parameters does not make sence for Auto ARIMA! 
" "Please specify parameters") sys.exit("STOP") try: assert self._aarima_trend is not None except AssertionError: self._aarima_logger.exception( "Assertion Error, trend cannot be None!") sys.exit("STOP") try: assert isinstance(self._aarima_seasonal, bool) except AssertionError: self._aarima_logger.exception( "Assertion Error, seasonal must be boolean True/False") sys.exit("STOP") def set_params(self, p_dict=None, **kwargs): """Sets new parameter values""" params_dict = kwargs if p_dict is not None: params_dict = p_dict # for k, v in params_dict.items(): if k == 'ts_df': self.ts_df = v elif k == 'freq': self.freq = v elif k == 'n_test': self.n_test = v elif k == 'n_val': self.n_val = v elif k == 'time_format': self.time_format = v elif k == 'start_p': self._start_p = v elif k == 'max_p': self._max_p = v elif k == 'start_q': self._start_q = v elif k == 'max_q': self._max_q = v elif k == 'd': self._d = v elif k == 'trend': self._aarima_trend = v elif k == 'seasonal': self._aarima_seasonal = v elif k == 'seasonal_periods': self._seasonal_periods = v elif k == 'start_P': self._start_P = v elif k == 'max_P': self._max_P = v elif k == 'start_Q': self._start_Q = v elif k == 'max_Q': self._max_Q = v elif k == 'D': self._D = v elif k == 'random': self._random = v elif k == 'n_fits': self._n_fits = v elif k == 'stepwise': self._stepwise = v elif k == 'information_criterion': self._information_criterion = v elif k == 'scoring': self._scoring = v elif k == 'out_of_sample_size': self._out_of_sample_size = v self.assertions() return self def get_params_dict(self): """Gets parameter values as dictionary""" return { 'start_p': self._start_p, 'start_q': self._start_q, 'test': self._test, 'max_p': self._max_p, 'max_q': self._max_q, 'd': self._d, 'trend': self._aarima_trend, 'seasonal': self._aarima_seasonal, 'seasonal_periods': self._seasonal_periods, 'D': self._D, 'start_P': self._start_P, 'start_Q': self._start_Q, 'max_P': self._max_P, 'max_Q': self._max_Q, 'random': self._random, 
'n_fits': self._n_fits, 'stepwise': self._stepwise, 'information_criterion': self._information_criterion, 'scoring': self._scoring, 'out_of_sample_size': self._out_of_sample_size } def ts_fit(self, suppress=False): """Fit Auto ARIMA to the time series data. Parameters: ---------- suppress: bool Suppress or not some of the output messages """ self._prepare_fit() self.ts_split() self._init_trend() self._init_seasonal() ts_df = self._train_dt.copy() """ Fit """ self._aarima_logger.info("Trying to fit the Auto ARIMA model....") # tic start = time() try: if not suppress: self._aarima_logger.info("...via using parameters\n") print_attributes(self) self.model_fit = pm.auto_arima( ts_df, start_p=self._start_p, start_q=self._start_q, test=self._test, max_p=self._max_p, m=self._seasonal_periods, d=self._d, seasonal=self._aarima_seasonal, D=self._D, start_P=self._start_P, max_P=self._max_P, trend=self._aarima_trend, trace=True, error_action='ignore', suppress_warnings=True, stepwise=self._stepwise, random=self._random, n_fits=self._n_fits, scoring=self._scoring, out_of_sample_size=self._out_of_sample_size, information_criterion=self._information_criterion) except (Exception, ValueError): self._aarima_logger.exception("Exception occurred in the fit...") self._aarima_logger.warning("Will try to reset some parameters...") try: self.model_fit = pm.auto_arima( ts_df, start_p=self._start_p, start_q=self._start_q, test=self._test, max_p=self._max_p, m=1, d=0, seasonal=self._aarima_seasonal, D=0, start_P=self._start_P, max_P=self._max_P, trend=self._aarima_trend, trace=True, error_action='ignore', suppress_warnings=True, stepwise=self._stepwise, random=self._random, n_fits=self._n_fits, scoring=self._scoring, out_of_sample_size=self._out_of_sample_size, information_criterion=self._information_criterion) except (Exception, ValueError): self._aarima_logger.exception("Exception occurred") self._aarima_logger.error("Please try other parameters!") self.model_fit = None else: # toc 
self._aarima_logger.info("Time elapsed: {} sec.".format(time() - start)) # self._aarima_logger.info("Model successfully fitted to the data!") self._aarima_logger.info("The chosen model AIC: " + str(self.model_fit.aic())) # Fitted values self._aarima_logger.info( "Computing fitted values and residuals...") self.fittedvalues = pd.Series(self.model_fit.predict_in_sample( start=0, end=(len(ts_df) - 1)), index=ts_df.index) # Residuals super(AutoARIMAForecaster, self)._residuals() self._aarima_logger.info("Done.") return self def ts_diagnose(self): """Diagnose the model""" try: assert self.model_fit is not None except AssertionError: self._aarima_logger.exception( "Model has to be fitted first! Please call ts_fit(...)") sys.exit("STOP") self.model_fit.plot_diagnostics(figsize=(9, 3.5)) self.plot_residuals() def plot_residuals(self): """Plot the residuals""" fig, axis = super(AutoARIMAForecaster, self)._plot_residuals( y=np.asarray(self._train_dt['y']), yhat=np.asarray(self.fittedvalues), _id=" Auto ARIMA") plt.gcf().autofmt_xdate() plt.grid(True) plt.show() def ts_test(self, show_plot=True): """Test the fitted model if test data available""" if super(AutoARIMAForecaster, self)._check_ts_test() < 0: return n_forecast = len(self._test_dt) self._aarima_logger.info( "Evaluating the fitted ARIMA model on the test data...") future, confint = self.model_fit.predict(n_periods=n_forecast, return_conf_int=True) self.forecast = pd.Series(future, index=self._test_dt.index) self.lower_conf_int = pd.Series(confint[:, 0], index=self._test_dt.index) self.upper_conf_int = pd.Series(confint[:, 1], index=self._test_dt.index) self.residuals_forecast = pd.Series(np.asarray(self._test_dt['y']) - np.asarray(self.forecast), index=self._test_dt.index) self.measure_rmse() self._aarima_logger.info("RMSE on test data: {}".format(self.rmse)) # plot if show_plot: self.plot_forecast() def ts_forecast(self, n_forecast, suppress=False): """Forecast time series over time frame in the future specified via 
n_forecast""" # n_forecast = super(AutoARIMAForecaster, self)._check_ts_forecast(n_forecast) # self._aarima_logger.info("Fitting using all data....") self._mode = 'forecast' self.ts_fit(suppress=suppress) self._aarima_logger.info("Forecasting next " + str(n_forecast) + str(self.ts_df.index.freq)) # future, confint = self.model_fit.predict(n_periods=n_forecast, return_conf_int=True) idx_future = self._gen_idx_future(n_forecast=n_forecast) self.forecast = pd.Series(future, index=idx_future) if self.lower_conf_int is None and self.upper_conf_int is None: self.lower_conf_int = pd.Series(confint[:, 0], index=idx_future) self.upper_conf_int = pd.Series(confint[:, 1], index=idx_future) else: self.lower_conf_int = pd.concat([ self.lower_conf_int, pd.Series(confint[:, 0], index=idx_future) ], axis=0) self.upper_conf_int = pd.concat([ self.upper_conf_int, pd.Series(confint[:, 1], index=idx_future) ], axis=0) self.residuals_forecast = None # self.plot_forecast() return self def plot_forecast(self): """Plot forecasted values""" fig, axis = super(AutoARIMAForecaster, self)._plot_forecast( y=np.asarray(self._train_dt['y']), yhat=np.asarray(self.fittedvalues), forecast=self.forecast, _id='Auto ARIMA') plt.gcf().autofmt_xdate() plt.grid(True) plt.show()
class ExponentialSmoothingForecaster(UVariateTimeSeriesClass):
    """Univariate time series child class using simple, double or triple
    exponential smoothing for forecasting.

    Attributes
    ----------
    ref. to e.g.,
    https://machinelearningmastery.com/exponential-smoothing-for-time-series-forecasting-in-python/

    _optimized: bool
        Whether to optimize smoothing coefficients
    _smoothing_level: float
        (alpha): the smoothing coefficient for the level
    _es_trend: str
        The type of trend component, as either "add" for additive or "mul"
        for multiplicative. Modeling the trend can be disabled by setting it
        to None. Derived from the generic ``_trend`` by ``_init_trend()``.
    _damped: bool
        Whether or not the trend component should be damped, either True or
        False
    _es_seasonal: str
        The type of seasonal component, as either "add" for additive or
        "mul" for multiplicative. Modeling the seasonal component can be
        disabled by setting it to None. Derived from the generic
        ``_seasonal`` by ``_init_seasonal()``.
    _seasonal_periods: int
        The number of time steps in a seasonal period, e.g. 12 for 12 months
        in a yearly seasonal structure
    _smoothing_slope: float
        (beta): the smoothing coefficient for the trend
    _smoothing_seasonal: float
        (gamma): the smoothing coefficient for the seasonal component
    _damping_slope: float
        (phi): the coefficient for the damped trend
    _use_boxcox: {True, False, 'log', float}
        Should the Box-Cox transform be applied to the data first? If 'log'
        then apply the log. If float then use lambda equal to float
    _remove_bias: bool
        Remove bias from forecast values and fitted values by enforcing that
        the average residual is equal to zero.
    _use_brute: bool
        Search for good starting values using a brute force (grid)
        optimizer. If False, a naive set of starting values is used.
        NOTE(review): the value is stored and reported by
        get_params_dict(), but ts_fit() does not forward it to the
        underlying fit call -- confirm whether it should.
    _expsm_logger: Logger
        The logger for logging

    Methods
    ----------
    assertions()
        Assertion tests, must be overridden
    set_params()
        Sets new parameter values
    get_params_dict()
        Gets parameter values as a dictionary
    ts_fit()
        Fits the exponential smoothing model to time series
    ts_diagnose()
        Diagnoses the fitted model
    plot_residuals()
        Generates residual plots
    ts_test()
        Evaluates fitted model on the test data, if this one has been
        generated
    ts_forecast()
        Forecasts time series and plots the results
    plot_forecast()
        Plots forecasted time-series
    """

    def __init__(self,
                 smoothing_level=None,
                 optimized=False,
                 damped=False,
                 smoothing_slope=None,
                 smoothing_seasonal=None,
                 damping_slope=None,
                 use_boxcox=False,
                 remove_bias=False,
                 use_brute=False,
                 **kwds):
        """Initializes the object ExponentialSmoothingForecaster.

        All keyword arguments not listed explicitly are forwarded to the
        parent class initializer.
        """
        self._expsm_logger = Logger("ExpSmoothing")

        self._es_trend = None
        self._es_seasonal = None

        try:
            super(ExponentialSmoothingForecaster, self).__init__(**kwds)
        except TypeError:
            self._expsm_logger.exception("Arguments missing...")

        # Translate the generic trend/seasonal settings of the parent class
        # into the values understood by exponential smoothing.
        self._init_trend()
        self._init_seasonal()

        self._smoothing_level = smoothing_level
        self._optimized = optimized
        self._damped = damped
        self._smoothing_slope = smoothing_slope
        self._smoothing_seasonal = smoothing_seasonal
        self._damping_slope = damping_slope
        self._use_boxcox = use_boxcox
        self._remove_bias = remove_bias
        self._use_brute = use_brute

        self.assertions()

        self._id = 'ExponentialSmoothing'

    def _init_trend(self):
        """Derive ``_es_trend`` from the generic ``_trend`` setting."""
        if self._trend is None or self._trend == 'constant':
            self._es_trend = None
        elif self._trend in ['linear', 'constant linear']:
            # Linear trends are not available here; an additive trend is
            # assumed instead.
            self._es_trend = 'add'
        else:
            self._es_trend = self._trend

    def _init_seasonal(self):
        """Derive ``_es_seasonal`` from the generic ``_seasonal`` setting."""
        if isinstance(self._seasonal, bool):
            # A plain True is interpreted as an additive seasonal component.
            self._es_seasonal = 'add' if self._seasonal else None
        else:
            self._es_seasonal = self._seasonal

    def __copy__(self):
        """Copies the object"""
        result = super(ExponentialSmoothingForecaster, self).__copy__()

        result._smoothing_level = self._smoothing_level
        result._optimized = self._optimized
        result._es_trend = self._es_trend
        result._es_seasonal = self._es_seasonal
        result._damped = self._damped
        result._smoothing_slope = self._smoothing_slope
        result._smoothing_seasonal = self._smoothing_seasonal
        result._damping_slope = self._damping_slope
        result._use_boxcox = self._use_boxcox
        result._remove_bias = self._remove_bias
        result._use_brute = self._use_brute
        result._expsm_logger = self._expsm_logger
        return result

    def assertions(self):
        """Validate trend and seasonal settings; exits with "STOP" on failure.

        The trend check is skipped when 'trend' is one of the hyper
        parameters, because its value is then chosen by the grid search.
        """
        allowed = ['add', 'mul', 'additive', 'multiplicative']
        try:
            assert (self.hyper_params is not None
                    and len(self.hyper_params) != 0
                    and 'trend' in list(self.hyper_params.keys())) or (
                        self._es_trend is None or self._es_trend in allowed)
        except AssertionError:
            self._expsm_logger.exception(
                "Assertion Error, trend must be in ['add','mul',"
                "'additive','multiplicative']")
            sys.exit("STOP")
        try:
            assert self._es_seasonal is None or isinstance(
                self._es_seasonal, str) and self._es_seasonal in allowed
        except AssertionError:
            self._expsm_logger.exception(
                "Assertion Error, seasonal must be in ['add','mul',"
                "'additive','multiplicative']")
            sys.exit("STOP")

    def set_params(self, p_dict=None, **kwargs):
        """Sets new parameters.

        Parameters
        ----------
        p_dict: dict or None
            Parameter names mapped to new values. If given, it takes
            precedence over ``kwargs``.
        **kwargs
            Parameters given as keyword arguments. Unknown names are ignored.

        Returns
        -------
        self
        """
        params_dict = kwargs
        if p_dict is not None:
            params_dict = p_dict
        #
        for k, v in params_dict.items():
            if k == 'smoothing_level':
                self._smoothing_level = v
            elif k == 'optimized':
                self._optimized = v
            elif k == 'trend':
                # ts_fit() re-runs _init_trend(), which derives _es_trend
                # from _trend. Updating only _es_trend would therefore be
                # discarded at the next fit, so the generic setting is
                # updated and the mapping re-applied.
                self._trend = v
                self._init_trend()
            elif k == 'seasonal':
                # Same reasoning as for 'trend' (see _init_seasonal()).
                self._seasonal = v
                self._init_seasonal()
            elif k == 'seasonal_periods':
                self._seasonal_periods = v
            elif k == 'damped':
                self._damped = v
            elif k == 'smoothing_slope':
                self._smoothing_slope = v
            elif k == 'smoothing_seasonal':
                self._smoothing_seasonal = v
            elif k == 'damping_slope':
                self._damping_slope = v
            elif k == 'use_boxcox':
                self._use_boxcox = v
            elif k == 'remove_bias':
                self._remove_bias = v
            elif k == 'use_brute':
                self._use_brute = v
        self.assertions()

        return self

    def get_params_dict(self):
        """Gets parameters as dictionary"""
        return {
            'smoothing_level': self._smoothing_level,
            'optimized': self._optimized,
            'trend': self._es_trend,
            'seasonal': self._es_seasonal,
            'seasonal_periods': self._seasonal_periods,
            'damped': self._damped,
            'smoothing_slope': self._smoothing_slope,
            'smoothing_seasonal': self._smoothing_seasonal,
            'damping_slope': self._damping_slope,
            'use_boxcox': self._use_boxcox,
            'remove_bias': self._remove_bias,
            'use_brute': self._use_brute
        }

    def ts_fit(self, suppress=False):
        """Fit Exponential Smoothing to the time series data.

        If hyper parameters are set, a grid search is run and the state of
        the best forecaster is adopted. Otherwise the model is fitted on the
        training data with the current parameters. On a failed fit
        ``model_fit`` is reset to None.

        Parameters:
        ----------
        suppress: bool
            Suppress or not some of the output messages

        Returns
        -------
        self
        """
        if self.hyper_params is not None:
            self._gs.set_forecaster(self)
            self._gs.set_hyper_params(self.hyper_params)
            # a very important command here to avoid endless loop
            self.hyper_params = None
            self._expsm_logger.info("***** Starting grid search *****")
            self._gs = self._gs.grid_search(suppress=suppress,
                                            show_plot=False)
            #
            self.best_model = self._gs.best_model
            self.__dict__.update(self.best_model['forecaster'].__dict__)
            self._expsm_logger.info("***** Finished grid search *****")
        else:
            self._prepare_fit()
            self.ts_split()
            self._init_trend()
            self._init_seasonal()
            ts_df = self._train_dt.copy()

            # Fit
            self._expsm_logger.info(
                "Trying to fit the exponential smoothing model....")
            # tic
            start = time()
            try:
                if not suppress:
                    self._expsm_logger.info("...via using parameters\n")
                    print_attributes(self)
                #
                self.model_fit = ExponentialSmoothing(
                    ts_df,
                    freq=self.freq,
                    trend=self._es_trend,
                    seasonal=self._es_seasonal,
                    seasonal_periods=self._seasonal_periods,
                    damped=self._damped).fit(
                        smoothing_level=self._smoothing_level,
                        smoothing_slope=self._smoothing_slope,
                        smoothing_seasonal=self._smoothing_seasonal,
                        damping_slope=self._damping_slope,
                        optimized=self._optimized,
                        use_boxcox=self._use_boxcox,
                        remove_bias=self._remove_bias)
                # toc
                self._expsm_logger.info(
                    "Time elapsed: {} sec.".format(time() - start))
            except Exception:
                self._expsm_logger.exception("Exponential Smoothing error...")
                # Do not keep a model from an earlier, unrelated fit around.
                self.model_fit = None
            else:
                #
                self._expsm_logger.info(
                    "Model successfully fitted to the data!")

                # Fitted values
                self._expsm_logger.info(
                    "Computing fitted values and residuals...")
                self.fittedvalues = self.model_fit.fittedvalues

                # Residuals
                super(ExponentialSmoothingForecaster, self)._residuals()
                self._expsm_logger.info("Done.")

        return self

    def ts_diagnose(self):
        """Diagnose the model; exits with "STOP" if no model is fitted."""
        try:
            assert self.model_fit is not None
        except AssertionError:
            self._expsm_logger.exception(
                "Model has to be fitted first! Please call ts_fit(...)")
            sys.exit("STOP")

        self.plot_residuals()

    def plot_residuals(self):
        """Plot the residuals"""
        fig, axis = super(ExponentialSmoothingForecaster,
                          self)._plot_residuals(
                              y=np.asarray(self._train_dt['y']),
                              yhat=np.asarray(self.fittedvalues),
                              _id="Exponential Smoothing")
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()

    def ts_test(self, show_plot=True):
        """Test the fitted model if test data available.

        Parameters
        ----------
        show_plot: bool
            Whether to plot the forecast on the test data
        """
        if super(ExponentialSmoothingForecaster, self)._check_ts_test() < 0:
            return

        n_forecast = len(self._test_dt)

        self._expsm_logger.info(
            "Evaluating the fitted model on the test data...")
        self.forecast = self.model_fit.forecast(n_forecast)

        self.residuals_forecast = pd.Series(
            np.asarray(self._test_dt['y']) - np.asarray(self.forecast),
            index=self._test_dt.index)
        self.measure_rmse()
        self._expsm_logger.info("RMSE on test data: {}".format(self.rmse))
        # plot
        if show_plot:
            self.plot_forecast()

    def ts_forecast(self, n_forecast, suppress=False):
        """Forecast time series over time frame in the future specified via
        n_forecast.

        Parameters
        ----------
        n_forecast: int
            Number of periods to forecast
        suppress: bool
            Suppress or not some of the output messages during the fit

        Returns
        -------
        self
        """
        #
        n_forecast = super(ExponentialSmoothingForecaster,
                           self)._check_ts_forecast(n_forecast)
        #
        self._expsm_logger.info("Fitting using all data....")
        self._mode = 'forecast'
        self.ts_fit(suppress=suppress)

        self._expsm_logger.info("Forecasting next " + str(n_forecast) +
                                str(self.freq))
        #
        self.forecast = self.model_fit.forecast(n_forecast)

        # No real values exist for the future, hence no forecast residuals.
        self.residuals_forecast = None
        #
        self.plot_forecast()
        return self

    def plot_forecast(self):
        """Plot forecasted values"""
        fig, axis = super(ExponentialSmoothingForecaster,
                          self)._plot_forecast(
                              y=np.asarray(self._train_dt['y']),
                              yhat=np.asarray(self.fittedvalues),
                              forecast=self.forecast,
                              _id='Exponential Smoothing')
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()
class SARIMAForecaster(ARIMAForecaster):
    """Univariate time series child class for forecasting using SARIMA

    Attributes
    ----------
    _order: tuple
        a tuple of p, d, q
    _s_order: tuple
        A tuple of seasonal components (P, D, Q, lag)
    _sarima_logger: Logger
        The logger for logging
    _sarima_trend: str
        A parameter for controlling a model of the deterministic trend as
        one of 'n','c','t','ct' for no trend, constant, linear, and constant
        with linear trend, respectively. Derived from the generic ``_trend``
        by ``_init_trend()``.

    Methods
    ----------
    assertions()
        Assertion tests, must be overridden
    set_params()
        Sets new parameter values
    get_params_dict()
        Gets parameter values as a dictionary
    ts_fit()
        Fits the SARIMA model to time series
    ts_diagnose()
        Diagnoses the fitted model
    plot_residuals()
        Generates residual plots
    ts_test()
        Evaluates fitted model on the test data, if this one has been
        generated
    ts_forecast()
        Forecasts time series and plots the results
    plot_forecast()
        Plots forecasted time-series
    """

    def __init__(self, s_order=(1, 0, 1, 1), **kwds):
        """Initializes the object SARIMAForecaster.

        Parameters
        ----------
        s_order: tuple
            Seasonal order (P, D, Q, lag)
        **kwds
            Forwarded to the parent class initializer
        """
        self._sarima_logger = Logger("SARIMA")

        self._s_order = s_order
        self._sarima_trend = ''

        try:
            super(SARIMAForecaster, self).__init__(**kwds)
        except TypeError as e:
            self._sarima_logger.exception("Arguments missing...{}".format(e))

        self._model = None
        self._init_trend()

        self.assertions()

        self._id = 'SARIMA'

    def _init_trend(self):
        """Derive ``_sarima_trend`` from the generic ``_trend`` setting.

        Values of ``_trend`` that match none of the generic names leave
        ``_sarima_trend`` untouched.
        """
        if self._trend == 'constant':
            self._sarima_trend = 'c'
        elif self._trend is None:
            # The original assigned to a misspelled attribute here, so the
            # "no trend" default never reached _sarima_trend.
            self._sarima_trend = 'n'
        elif self._trend == 'linear':
            self._sarima_trend = 't'
        elif self._trend == 'constant linear':
            self._sarima_trend = 'ct'
        elif self._trend in ['additive', 'add']:
            # Additive trend is not available here; linear trend is assumed.
            self._sarima_trend = 't'
        elif self._trend in ['multiplicative', 'mul']:
            # Multiplicative trend is not available here; linear trend is
            # assumed.
            self._sarima_trend = 't'

    def assertions(self):
        """Validate seasonal order, trend and seasonal flag.

        Exits with "STOP" on failure. The trend check is skipped when
        'trend' is one of the hyper parameters, because its value is then
        chosen by the grid search.
        """
        try:
            assert isinstance(self._s_order, tuple)
        except AssertionError:
            self._sarima_logger.exception(
                "Assertion exception occurred, tuple expected")
            sys.exit("STOP")
        try:
            assert (self.hyper_params is not None
                    and len(self.hyper_params) != 0
                    and 'trend' in list(self.hyper_params.keys())) or (
                        self._sarima_trend is None
                        or self._sarima_trend in ['n', 'c', 't', 'ct'])
        except AssertionError:
            self._sarima_logger.exception(
                "Assertion Error, trend must be in ['n', 'c', 't', 'ct']")
            sys.exit("STOP")
        try:
            assert isinstance(self._seasonal, bool)
        except AssertionError:
            self._sarima_logger.exception(
                "Assertion Error, seasonal must be boolean True/False in SARIMA"
            )
            sys.exit("STOP")

    def __copy__(self):
        """Copies the object"""
        result = super(SARIMAForecaster, self).__copy__()
        result._s_order = self._s_order
        result._sarima_trend = self._sarima_trend
        result._sarima_logger = self._sarima_logger
        return result

    def set_params(self, p_dict=None, **kwargs):
        """Sets new parameters.

        Parameters
        ----------
        p_dict: dict or None
            Parameter names mapped to new values. If given, it takes
            precedence over ``kwargs``.
        **kwargs
            Parameters given as keyword arguments. Unknown names are ignored.

        Returns
        -------
        self
        """
        params_dict = kwargs
        if p_dict is not None:
            params_dict = p_dict
        #
        for k, v in params_dict.items():
            if k == 'ts_df':
                self.ts_df = v
            elif k == 'freq':
                self.freq = v
            elif k == 'n_test':
                self.n_test = v
            elif k == 'n_val':
                self.n_val = v
            elif k in ('time_format', 'timeformat'):
                # 'timeformat' is the historical key of this class;
                # 'time_format' is the spelling of the constructor argument.
                self.time_format = v
            elif k == 's_order':
                self._s_order = v
            elif k == 'order':
                self._order = v
            elif k == 'test':
                self._test = v
            elif k == 'trend':
                # ts_fit() re-runs _init_trend(), which maps _trend onto
                # _sarima_trend (None -> 'n'). The generic setting is kept
                # in sync so that the value chosen here survives the next
                # fit; native codes ('n','c','t','ct') match no mapping
                # branch and are therefore kept as they are.
                self._trend = v
                self._sarima_trend = v
                self._init_trend()
        self.assertions()

        return self

    def get_params_dict(self):
        """Gets parameters as a dictionary"""
        return {
            'order': self._order,
            'test': self._test,
            'trend': self._sarima_trend,
            's_order': self._s_order,
        }

    def ts_fit(self, suppress=False):
        """Fit Seasonal ARIMA to the time series data.

        If hyper parameters are set, a grid search is run and the state of
        the best forecaster is adopted. Otherwise the model is fitted on the
        training data with the current parameters. On a failed fit
        ``model_fit`` is set to None.

        Parameters:
        ----------
        suppress: bool
            Suppress or not some of the output messages

        Returns
        -------
        self
        """
        if self.hyper_params is not None:
            self._gs.set_forecaster(self)
            self._gs.set_hyper_params(self.hyper_params)
            # a very important command here to avoid endless loop
            self.hyper_params = None
            self._sarima_logger.info("***** Starting grid search *****")
            self._gs = self._gs.grid_search(suppress=suppress,
                                            show_plot=False)
            #
            self.best_model = self._gs.best_model
            self.__dict__.update(self.best_model['forecaster'].__dict__)
            self._sarima_logger.info("***** Finished grid search *****")
        else:
            self._prepare_fit()
            self.ts_split()
            self._init_trend()

            ts_df = self._train_dt.copy()

            # Fit
            self._sarima_logger.info("Trying to fit the sarima model....")
            # tic
            start = time()
            try:
                if not suppress:
                    self._sarima_logger.info("...via using parameters\n")
                    print_attributes(self)

                self._model = SARIMAX(ts_df['y'],
                                      order=self._order,
                                      seasonal_order=self._s_order,
                                      trend=self._sarima_trend,
                                      enforce_stationarity=False,
                                      enforce_invertibility=False,
                                      freq=self.freq)
                self.model_fit = self._model.fit(disp=1)
            except Exception:
                self._sarima_logger.exception(
                    "Exception occurred in the fit...")
                self._sarima_logger.error("Please try other parameters!")
                self.model_fit = None
            else:
                # toc
                self._sarima_logger.info(
                    "Time elapsed: {} sec.".format(time() - start))
                self._sarima_logger.info(
                    "Model successfully fitted to the data!")
                if not suppress:
                    self._sarima_logger.info("The model summary: " +
                                             str(self.model_fit.summary()))

                # Fitted values
                self._sarima_logger.info(
                    "Computing fitted values and residuals...")
                self.fittedvalues = self.model_fit.fittedvalues
                # prolong: the fitted values may be shorter than the
                # training data; re-index them on the full training range
                # and back-fill the gaps.
                if len(self.fittedvalues) != len(self._train_dt):
                    full_range = pd.date_range(ts_df.index[0],
                                               ts_df.index[len(ts_df) - 1],
                                               freq=self.freq)
                    self.fittedvalues = pd.DataFrame(
                        index=full_range, columns=['dummy']).join(
                            pd.DataFrame(self.fittedvalues)).drop(['dummy'],
                                                                  axis=1)
                    self.fittedvalues = self.fittedvalues.reset_index()
                    self.fittedvalues.columns = self._ts_df_cols
                    self.fittedvalues.set_index('ds', inplace=True)
                    # Series.bfill() replaces the deprecated
                    # fillna(method='bfill').
                    self.fittedvalues.y = self.fittedvalues.y.bfill()

                # Residuals
                super(SARIMAForecaster, self)._residuals()
                self._sarima_logger.info("Done.")
        return self

    def plot_residuals(self):
        """Plot the residuals"""
        fig, axis = super(SARIMAForecaster, self)._plot_residuals(
            y=np.asarray(self._train_dt['y']),
            yhat=np.asarray(self.fittedvalues).flatten(),
            _id="SARIMA")
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()

    def ts_test(self, show_plot=True):
        """Test the fitted model if test data available.

        Parameters
        ----------
        show_plot: bool
            Whether to plot the forecast on the test data

        Returns
        -------
        self, or None if the check for test data fails
        """
        if super(SARIMAForecaster, self)._check_ts_test() < 0:
            return

        n_forecast = len(self._test_dt)

        if self._mode == 'test':
            self._sarima_logger.info(
                "Evaluating the fitted SARIMA model on the test data...")
        elif self._mode == 'test and validate':
            self._sarima_logger.info(
                "Evaluating the fitted SARIMA model on the test and validation data..."
            )

        # Predict the n_forecast steps that directly follow the training data.
        n_train = len(self._train_dt.index)
        future = self.model_fit.predict(start=n_train,
                                        end=n_train + n_forecast - 1,
                                        dynamic=True)

        self.forecast = pd.Series(future, index=self._test_dt.index)

        self.residuals_forecast = pd.Series(
            np.asarray(self._test_dt.y) - np.asarray(self.forecast),
            index=self._test_dt.index)
        self.measure_rmse()
        self._sarima_logger.info("RMSE on test data: {}".format(self.rmse))
        # plot
        if show_plot:
            self.plot_forecast()

        return self

    def ts_forecast(self, n_forecast, suppress=False):
        """Forecast time series over time frame in the future specified via
        n_forecast.

        Parameters
        ----------
        n_forecast: int
            Number of periods to forecast
        suppress: bool
            Suppress or not some of the output messages during the fit

        Returns
        -------
        self
        """
        #
        n_forecast = super(SARIMAForecaster,
                           self)._check_ts_forecast(n_forecast)
        #
        self._sarima_logger.info("Fitting using all data....")
        self._mode = 'forecast'
        self.ts_fit(suppress=suppress)

        self._sarima_logger.info("Forecasting next " + str(n_forecast) +
                                 str(self.freq))
        #
        n_train = len(self._train_dt.index)
        future = self.model_fit.predict(start=n_train,
                                        end=n_train + (n_forecast - 1),
                                        dynamic=True)
        idx_future = self._gen_idx_future(n_forecast=n_forecast)
        self.forecast = pd.Series(future, index=idx_future)

        # No real values exist for the future, hence no forecast residuals.
        self.residuals_forecast = None
        self.plot_forecast()
        return self

    def plot_forecast(self):
        """Plot forecasted values"""
        fig, axis = super(SARIMAForecaster, self)._plot_forecast(
            y=np.asarray(self._train_dt['y']),
            yhat=np.asarray(self.fittedvalues).flatten(),
            forecast=self.forecast,
            _id='SARIMA')
        plt.gcf().autofmt_xdate()
        plt.grid(True)
        plt.show()
class UVariateTimeSeriesClass(object): """Univariate time series class Attributes ---------- _ts_df_cols: list internal column names for dataframe that will be input to model ts_df: dataframe time series data frame freq: int frequency of time series; python format fill_method: str filling method for resampled data. Possible are 'ffill' and 'interp1d' n_test: int number of units (defined by frequency, e.g. 6 days) to use as test data. 0 would mean no test data is generated. n_val: int similar to n_test, for validation time_format: str time format if time series data needs to be brought into datetime _mode: str defines the mode as 'test' or 'forecast' _train_dt: dataframe training data _test_dt: dataframe test data _val_dt: dataframe validation data model_fit: fitted model fittedvalues: series computed fitted values residuals: series residuals rmse: float RMSE on test set (test data and the forecast on test data) _gs: GridSearchClass The grid search class for model optimization in case hyper_parameters are specified hyper_params: dictionary The dictionary of hyper parameters or None if no model optimization wished best_model: dictionary The best model resulted from the grid search upper_whisker_res: float upper whisker for residuals lower_conf_int: series lower confidence interval upper_conf_int: series upper confidence interval _trend: str or iterable, default=’c’ in AutoARIMA: ref. http://www.alkaline-ml.com/pmdarima/1.0.0/modules/generated/pmdarima.arima.auto_arima.html Parameter controlling the deterministic trend polynomial A(t). Can be specified as a string where ‘c’ indicates a constant (i.e. a degree zero component of the trend polynomial), ‘t’ indicates a linear trend with time, and ‘ct’ is both. Can also be specified as an iterable defining the polynomial as in numpy.poly1d, where [1,1,0,1] would denote a+bt+ct3. in ARIMA: A parameter for controlling a model of the deterministic trend as one of ‘nc’ or ’c’. 
‘c’ includes constant trend, ‘nc’ no constant for trend. in SARIMA: A parameter for controlling a model of the deterministic trend as one of ‘n’,’c’,’t’,’ct’ for no trend, constant, linear, and constant with linear trend, respectively. in ExponentialSmoothing: The type of trend component, as either “add” for additive or “mul” for multiplicative. Modeling the trend can be disabled by setting it to None _test: list or str in ARIMA: list of possible tests for determining d in AutoARIMA. test for determining the value of d, e.g. 'adf' _seasonal: bool or str in AutoARIMA Seasonal component yes/no in ExponentialSmoothing: The type of seasonal component, as either “add” for additive or “mul” for multiplicative. Modeling the seasonal component can be disabled by setting it to None _seasonal_periods: int The number of time steps in a seasonal period, e.g. 12 for 12 months in a yearly seasonal structure forecast: series computed forcatsed values residuals_forecast: series residuals between forecasted and real values. 
Note, this variable exist only if test data existed Methods ------- ts_transform() Transforms time series using log10 or box-cox ts_resample() Resamples time series at the chosen frequency freq ts_test() Evaluates fitted model on the test data, if this one has been generated ts_forecast() Forecasts time series and plots the results ts_decompose() Decomposes time series in _arr_seasonal, _arr_trend, residual(irregular) and _arr_baseline, and plots the results plot_decompose() Plots the results of ts_decompose() difference() Differences the time series given the lag (parameter interval) rolling_mean() Computes moving average given the window size rolling_variance() Computes moving variance given the window size test_adf(): ADF test for stationarity test_kpss(): KPSS test for stationarity ndiff() Determines value for diff parameter d All tests given in the parameter tests are applied acf_plots() Generates autocorrelation plots pacf_plots() Generates partial correlation plots Helper methods: ------- _plot_residuals() Residual plots helper function _plot_forecast() Helper function for plotting forecasted time-series _prepare_fit() Prepares ts_fit of child class. Supposed to be called by a child class _residuals() Helper function for calculating residuals. Supposed to be called by a child class _check_ts_test() Checks for test. Supposed to be called by a child class _check_ts_forecast() Checks for forecast. 
Supposed to be called by a child class """ def __init__(self, ts_df, time_format="%Y-%m-%d %H:%M:%S", freq='D', fill_method='ffill', n_test=0, n_val=0, hyper_params=None, test='adf', trend=None, seasonal=False, seasonal_periods=1, **kwds): """Initializes the object UVariateTimeSeriesForecaster""" self._ts_df_cols = ['ds', 'y'] self.ts_df = ts_df self.time_format = time_format self.freq = freq self.fill_method = fill_method.lower() self.n_test = int(n_test) self.n_val = int(n_val) self.transform = None self._boxcox_lmbda = None self._mode = '' self._train_dt = None self._test_dt = None self._val_dt = None self.model_fit = None self.fittedvalues = None self.residuals = None self.rmse = 0 self._gs = tsa.GridSearchClass() self.hyper_params = hyper_params self.best_model = dict() """ self.rmse_test = 0 self.rmse_val = 0 """ self.upper_whisker_res = None self.lower_conf_int = None self.upper_conf_int = None self.forecast = None self.residuals_forecast = None self._res_decomp = None self._arr_seasonal = None self._arr_trend = None self._arr_baseline = None self._test = test self._trend = trend if self._trend is not None: self._trend = self._trend.lower() self._seasonal = seasonal if isinstance(self._seasonal, str): self._seasonal = self._seasonal.lower() self._seasonal_periods = seasonal_periods self._uvts_cls_logger = Logger('uvts_cls') UVariateTimeSeriesClass.assertions(self) # work with ts_df self.ts_df = self.ts_df.reset_index() self.ts_df.columns = self._ts_df_cols self.ts_df['y'] = self.ts_df['y'].apply(np.float64, errors='coerce') self.ts_df.set_index('ds', inplace=True) self._uvts_cls_logger.info("Received time series data of range: " + str(min(self.ts_df.index)) + ' - ' + str(max(self.ts_df.index)) + " and shape: " + str(self.ts_df.shape)) if not isinstance(self.ts_df.index, pd.DatetimeIndex): self._uvts_cls_logger.warning("Time conversion required...") self.ts_df = self.ts_df.reset_index() try: self.ts_df['ds'] = self.ts_df['ds'].apply( lambda x: 
datetime.datetime.strptime( str(x).translate({ ord('T'): ' ', ord('Z'): None })[:-1], self.time_format)) except ValueError as e: self._uvts_cls_logger.warning( "Zulu time conversion not successful: {}".format(e)) self._uvts_cls_logger.warning( "Will try without assuming zulu time...") try: self.ts_df['ds'] = self.ts_df['ds'].apply( lambda x: datetime.datetime.strptime( str(x), self.time_format)) except ValueError as e: self._uvts_cls_logger.info( "Time conversion not successful. Check your time_format: {}" .format(e)) sys.exit("STOP") else: self._uvts_cls_logger.info("Time conversion successful!") else: self._uvts_cls_logger.info("Time conversion successful!") # set index self.ts_df.set_index('ds', inplace=True) # self.ts_df.index = pd.to_datetime(self.ts_df.index) self.ts_df.sort_index(inplace=True) # resample self.ts_resample() UVariateTimeSeriesClass.assertions(self, post=True) # if self.n_val > len(self.ts_df) - self.n_test: self.n_val = len(self.ts_df) - self.n_test if self.n_test == 0 and self.n_val == 0: self._mode = 'forecast' elif self.n_test > 0: self._mode = 'test' elif self.n_test == 0 and self.n_val > 0: self._mode = 'validate' # delegate just for good programming style here super(UVariateTimeSeriesClass, self).__init__(**kwds) def assertions(self, post=False): if post: try: assert 0 <= self.n_test < len(self.ts_df) except AssertionError: self._uvts_cls_logger.exception( "Assertion exception, invalid value for n_test!") sys.exit("STOP") # try: assert 0 <= self.n_val < len(self.ts_df) except AssertionError: self._uvts_cls_logger.exception( "Assertion exception, invalid value for n_val!") sys.exit("STOP") else: try: assert self.fill_method in ['ffill', 'interp1d'] except AssertionError: self._uvts_cls_logger.exception( "Assertion exception, fill method not recognized! " "'ffill' will be used. 
") else: self.fill_method = 'ffill' try: assert pd.DataFrame(self.ts_df).shape[1] <= 2 except AssertionError: self._uvts_cls_logger.exception( "Time series must be uni-variate. " "Hence, at most a time columns and a column of numeric values are expected!" ) sys.exit("STOP") try: self._trend is None or (isinstance( self._trend, str) and self._trend in [ 'constant', 'linear', 'constant linear', 'additive', 'add', 'multiplicative', 'mul' ]) except AssertionError: self._uvts_cls_logger.exception( "Assertion exception occurred, invalid value for trend! " "Choose between None or " "['constant', 'linear ','constant linear', " "'additive', 'add , 'multiplicative', 'mul'] ") sys.exit("STOP") try: self._seasonal is None or isinstance(self._seasonal, bool) or ( isinstance(self._seasonal, str) and self._seasonal in ['additive', 'add', 'multiplicative', 'mul']) except AssertionError: self._uvts_cls_logger.exception( "Assertion exception occurred, invalid value for seasonal! " "Choose between True/False, None or " "['additive', 'add , 'multiplicative', 'mul'] ") sys.exit("STOP") def __copy__(self): """Copies the object""" cls = self.__class__ result = cls.__new__(cls) result.__dict__.update(self.__dict__) return result def ts_transform(self, transform): """Transforms time series via applying casted 'transform'. Right now 'log10' and 'box-cox' possible.""" try: assert transform.lower().strip() in ['log10', 'box-cox'] except AssertionError: self._uvts_cls_logger.error( "transform should be in ['log10', 'box-cox'] or empty. Assuming no transform! " "Hence, if you get bad results, you would like maybe to choose e.g., log10 here." ) self._uvts_cls_logger.exception( "Assertion exception occurred, transform") self.transform = None else: self.transform = transform.lower() # transform if sum(self.ts_df['y'] > 0) < len(self.ts_df['y']): self._uvts_cls_logger.warning( "Zero, negative, or both values present in your data. Transformation will not be used!" 
) return self if self.transform == 'log10': try: self.ts_df['y'] = self.ts_df['y'].apply(np.log10) except ValueError: self._uvts_cls_logger.exception( "log10 transformation did not work! Possibly negative " "values present?") elif self.transform == 'box-cox': if input("Do you want to provide lambda for box.cox? y/n?" ).strip().lower() == 'y': self._boxcox_lmbda = float(input()) else: self._boxcox_lmbda = None try: if self._boxcox_lmbda is None: bc, lmbda_1 = stats.boxcox(self.ts_df['y'], lmbda=self._boxcox_lmbda) self.ts_df['y'] = stats.boxcox(self.ts_df['y'], lmbda=lmbda_1) else: self.ts_df['y'] = stats.boxcox( self.ts_df['y'], lmbda=self._boxcox_lmbda) except ValueError: self._uvts_cls_logger.exception( "box-cox transformation did not work! " "Possibly negative values present or bad lambda?") return self def set_frequency(self, new_freq): """Sets new frequency and resamples time series to that new frequency""" self.freq = new_freq self.ts_resample() def ts_check_frequency(self): """Checks the frequency of time series""" if self.ts_df.index.freq is None: self._uvts_cls_logger.info("No specific frequency detected.") self._uvts_cls_logger.info( "Frequency chosen in initialization: " + str(self.freq) + " enter 'n' and call ts_resample() if you are satisfied with this value." ) if input("Should a histogram of time deltas be plotted y/n?" 
).strip().lower() == 'y': ff = pd.Series(self.ts_df.index[1:(len(self.ts_df))] - self.ts_df.index[0:(len(self.ts_df) - 1)]) ff = ff.apply(lambda x: int(x.total_seconds() / (60 * 60))) plt.hist(ff, bins=120) plt.xlabel("Rounded time delta [H]") plt.ylabel("Frequency of occurrence") self._uvts_cls_logger.info(ff.value_counts()) self._uvts_cls_logger.info( "Should hourly frequency not fit, choose a reasonable frequency and call " "set_frequency(new_freq)") else: pass else: self._uvts_cls_logger.info("Time series frequency: " + str(self.ts_df.index.freq)) def ts_resample(self): """Brings original time series to the chosen frequency freq""" try: ts_freq = pd.DataFrame(index=pd.date_range( self.ts_df.index[0], self.ts_df.index[len(self.ts_df) - 1], freq=self.freq), columns=['dummy']) except ValueError: self._uvts_cls_logger.exception( "Exception occurred, possibly incompatible frequency!") sys.exit("STOP") if self.fill_method == 'ffill': self.ts_df = ts_freq.join(self.ts_df).drop(['dummy'], axis=1) self.ts_df.y = self.ts_df.y.fillna(method='ffill') # if np.isnan ( self.ts_df.y ).any (): # self.ts_df.y = self.ts_df.y.fillna ( method='bfill' ) else: # interp xp = np.linspace(0, self.ts_df.size, self.ts_df.size, endpoint=False) fp = self.ts_df['y'] # join self.ts_df = ts_freq.join(self.ts_df).drop(['dummy'], axis=1) # pick new points x = np.linspace(0, ts_freq.size, ts_freq.size, endpoint=False) x = x[self.ts_df['y'].isna()] print(x.size) print(x) # put the values self.ts_df.y[self.ts_df['y'].isna()] = np.interp(x, xp, fp) if np.isnan(self.ts_df.y).any(): self._uvts_cls_logger.warning( "Some NaN found, something went wrong, check the data!") sys.exit("STOP") self._uvts_cls_logger.info("Time series resampled at frequency: " + str(self.ts_df.index.freq) + ". 
New shape of the data: " + str(self.ts_df.shape)) self._uvts_cls_logger.info("Using time series data of range: " + str(min(self.ts_df.index)) + ' - ' + str(max(self.ts_df.index)) + " and shape: " + str(self.ts_df.shape)) return self def ts_split(self): """Prepares data for different modes: train, test, validate, test and validate, forecast""" if self.ts_df.index.freq is None: self._uvts_cls_logger.warning( "Time series exhibit no frequency. Calling ts_resample()...") try: self.ts_resample() except ValueError: self._uvts_cls_logger.error("Resample did not work! Error:" + str(sys.exc_info()[0])) ts_df = self.ts_df if self._mode == 'forecast': self._train_dt = ts_df self._test_dt, self._val_dt = None, None elif self._mode == 'test and validate': if self._test_dt is not None: self._train_dt = pd.concat([self._train_dt, self._test_dt], axis=0) self._test_dt = self._val_dt self._val_dt = None else: self._uvts_cls_logger.error("Something is wrong: mode!") else: # split ts_test_df = pd.DataFrame() ts_val_df = pd.DataFrame() # ts_df = ts_df.reset_index() ts_df.columns = self._ts_df_cols if self._mode == 'test' and self.n_val == 0: ts_test_df = ts_df.copy() # ts_df = pd.DataFrame(ts_df.loc[:(len(ts_df) - 1 - self.n_test), ]) ts_df.set_index('ds', inplace=True) # test ts_test_df = pd.DataFrame(ts_test_df.loc[(len(ts_test_df) - self.n_test):, ]) ts_test_df.set_index('ds', inplace=True) elif self._mode == 'validate': ts_val_df = ts_df.copy() # ts_df = pd.DataFrame(ts_df.loc[:(len(ts_df) - 1 - self.n_val), ]) ts_df.set_index('ds', inplace=True) # val ts_val_df = pd.DataFrame(ts_val_df.loc[(len(ts_val_df) - self.n_val):, ]) ts_val_df.set_index('ds', inplace=True) elif self._mode == 'test' and self.n_val > 0: ts_test_df = ts_df.copy() ts_val_df = ts_df.copy() # ts_df = pd.DataFrame(ts_df.loc[:(len(ts_df) - 1 - self.n_test - self.n_val), ]) ts_df.set_index('ds', inplace=True) # test ts_test_df = pd.DataFrame( ts_test_df.loc[(len(ts_test_df) - self.n_test - 
self.n_val):(len(ts_test_df) - self.n_val - 1)]) ts_test_df.set_index('ds', inplace=True) # val ts_val_df = pd.DataFrame(ts_val_df.loc[(len(ts_val_df) - self.n_val):, ]) ts_val_df.set_index('ds', inplace=True) # now set self._train_dt = ts_df if not ts_test_df.empty: self._test_dt = ts_test_df if not ts_val_df.empty: self._val_dt = ts_val_df return self @staticmethod def compute_ci(yhat, yhat_var, ci_level): """Easy compute of confidence intervals""" z_mapping = {0.95: 1.96, 0.99: 2.58} z = z_mapping[ci_level] ci_lower = yhat - yhat_var * z ci_upper = yhat + yhat_var * z return ci_lower, ci_upper def _prepare_fit(self): """Helper function ro prepare ts_fit""" self.lower_conf_int, self.upper_conf_int, self.upper_whisker_res = None, None, None self.model_fit = None self.residuals, self.residuals_forecast, self.fittedvalues = None, None, None def _residuals(self): """Helper function to calculate residuals""" if self.model_fit is None: self._uvts_cls_logger.error( "No model has been fitted, residuals cannot be computed!") sys.exit("STOP") try: # use fittedvalues to fill in the model dictionary self.residuals = pd.Series(np.asarray(self._train_dt['y']) - np.asarray(self.fittedvalues).flatten(), index=self._train_dt['y'].index) self.upper_whisker_res = self.residuals.mean() + 1.5 * ( self.residuals.quantile(0.75) - self.residuals.quantile(0.25)) except (KeyError, AttributeError): self._uvts_cls_logger.exception( "Exception occurred: Model was not fitted or ts has other structure" ) return self def _plot_residuals(self, y, yhat, _id): """Helper function to plot the residuals""" try: assert self.model_fit is not None except AssertionError: self._uvts_cls_logger.exception( "Model has to be fitted first! 
Please call ts_fit(...)") fig, axes = plt.subplots(2, 1, figsize=(20, 5), sharex=True) axes[0].plot(pd.Series(yhat, index=self._train_dt.index), color='y', linewidth=2.0) axes[0].plot(pd.Series(y, index=self._train_dt.index), color='b') axes[0].set_ylabel("Model Fit") axes[0].set_title("Real (blue) and estimated values, " + str(_id)) # axes[1].plot(self.residuals, color="r") """ if self.forecast is not None and self.residuals_forecast is None \ and self.lower_conf_int is not None and self.upper_conf_int is not None: axes[0].fill_between(self.lower_conf_int.index, self.lower_conf_int, self.upper_conf_int, color='k', alpha=.15) """ if self.lower_conf_int is not None and self.upper_conf_int is not None: axes[0].fill_between(self.lower_conf_int.index, self.lower_conf_int, self.upper_conf_int, color='k', alpha=.15) if self.upper_whisker_res is not None: axes[1].axhline(y=self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker', linestyle='--', linewidth=1.5) axes[1].axhline(y=-self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker', linestyle='--', linewidth=1.5) axes[1].set_ylabel('Residuals') axes[1].set_title( 'Difference between model output and the real data and +/- upper whisker, ' + str(_id)) return fig, axes def _check_ts_test(self): """Check before ts_test in child class is called""" try: assert self.model_fit is not None except AssertionError: self._uvts_cls_logger.exception( "Model has to be fitted first! Please call ts_fit(...)") try: assert self._test_dt is not None except (KeyError, AssertionError): self._uvts_cls_logger.exception( "Nothing to test. 
" "Call ts_forecast() or specify amount of test data " "when initializing the object.") return -1 else: # self._mode = 'test' return 0 def _check_ts_forecast(self, n_forecast): """Check before ts_forecast in child class is called""" # try: n_forecast = int(n_forecast) assert 0 < n_forecast < len(self._train_dt) except AssertionError: self._uvts_cls_logger.exception( "Number of periods to be forecasted is too low, too high or not numeric!" ) except ValueError: self._uvts_cls_logger.exception( "n_forecast must be convertible to int type!") return n_forecast def _gen_idx_future(self, n_forecast): """Generate the time axis for future data""" idx_future = None if self.freq == 'S': idx_future = pd.date_range(start=max(self._train_dt.index) + datetime.timedelta(seconds=1), end=max(self._train_dt.index) + datetime.timedelta(seconds=n_forecast), freq='S') elif self.freq == 'min': idx_future = pd.date_range(start=max(self._train_dt.index) + datetime.timedelta(minutes=1), end=max(self._train_dt.index) + datetime.timedelta(minutes=n_forecast), freq='min') elif self.freq == 'H': idx_future = pd.date_range(start=max(self._train_dt.index) + datetime.timedelta(hours=1), end=max(self._train_dt.index) + datetime.timedelta(hours=n_forecast), freq='H') elif self.freq == 'D': idx_future = pd.date_range(start=max(self._train_dt.index) + datetime.timedelta(days=1), end=max(self._train_dt.index) + datetime.timedelta(days=n_forecast), freq='D') elif self.freq == 'W': idx_future = pd.date_range(start=max(self._train_dt.index) + datetime.timedelta(weeks=1), end=max(self._train_dt.index) + datetime.timedelta(weeks=n_forecast), freq='W') elif self.freq == 'M' or self.freq == 'MS': idx_future = pd.date_range(start=max(self._train_dt.index) + relativedelta(months=+1), end=max(self._train_dt.index) + relativedelta(months=+n_forecast), freq=self.freq) return idx_future def _prepare_forecast(self, yhat, forecast): # forecast forecast = forecast.reset_index() forecast.columns = self._ts_df_cols 
forecast.set_index('ds', inplace=True) # vals = list() vals.append(yhat[-1]) for i in range(len(forecast['y'])): vals.append(forecast['y'][i]) idx = list() idx.append(self._train_dt.index[-1]) for i in range(len(forecast.index)): idx.append(forecast.index[i]) # return pd.Series(vals, index=idx) def _plot_forecast(self, y, yhat, forecast, _id): """Helper function to plot forecasted values""" try: assert self.model_fit is not None except AssertionError: self._uvts_cls_logger.exception( "Model has to be fitted first! Please call ts_fit(...)") sys.exit("STOP") # try: assert self.forecast is not None except AssertionError: self._uvts_cls_logger.exception( "Neither ts_test(...) nor ts_forecast(...) have been called yet!" ) sys.exit("STOP") fig, axes = plt.subplots(2, 1, figsize=(20, 7), sharex=True) # axes[0].plot(pd.Series(yhat, index=self._train_dt.index), color='y', linewidth=2.0) axes[0].plot(pd.Series(y, index=self._train_dt.index), color='b', linewidth=1.0) # if self.residuals_forecast is not None: axes[0].plot(self.ts_df, color='b') forecast = self._prepare_forecast(yhat=yhat, forecast=forecast) axes[0].plot(forecast, color='orange', linewidth=2.0) # if self.lower_conf_int is not None and self.upper_conf_int is not None: axes[0].fill_between(self.lower_conf_int.index, self.lower_conf_int, self.upper_conf_int, color='k', alpha=.15) axes[0].set_ylabel("Fit and Forecast/Validation") axes[0].set_title( "Real (blue), estimated (yellow) and forecasted values, " + str(_id)) # if self.residuals_forecast is not None: axes[1].plot(pd.concat([self.residuals, self.residuals_forecast], axis=0), color='r') axes[1].plot(self.residuals, color="r") if self.upper_whisker_res is not None: axes[1].axhline(y=self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker', linestyle='--', linewidth=1.5) axes[1].axhline(y=-self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker', linestyle='--', linewidth=1.5) axes[1].set_ylabel("Residuals") axes[1].set_title( 
"Difference between model output and the real data both, for fitted " "and forecasted and +/- upper whisker or confidence intervals, " + str(_id)) return fig, axes def ts_decompose(self, params=None): """Decomposes time series""" self._res_decomp = None self._arr_seasonal = None self._arr_trend = None self._arr_baseline = None self.residuals = None if params is None: params = dict({'model': 'additive', 'freq': 1}) try: assert isinstance(params, dict) except AssertionError: self._uvts_cls_logger.exception( "Dictionary is expected for parameters!") sys.exit("STOP") try: assert 'model' in list(params.keys()) except AssertionError: self._uvts_cls_logger.exception( "Unexpected dictionary keys. At least decomposition " "model must be supplied!") sys.exit("STOP") try: assert params['model'].lower() in ['additive', 'multiplicative'] except AssertionError: self._uvts_cls_logger.exception( "Unexpected value for the parameter 'model'! " "Choose from ['additive', 'multiplicative']") sys.exit("STOP") else: params['model'] = params['model'].lower() if 'freq' not in list(params.keys()): params['freq'] = 1 try: ts2decomp = self.ts_df if 'from' in list(params.keys()): ts2decomp = ts2decomp[ ts2decomp.index >= datetime.datetime.strptime( params['from'], self.time_format)] if 'to' in list(params.keys()): ts2decomp = ts2decomp[ts2decomp.index <= datetime.datetime. strptime(params['to'], self.time_format)] try: assert ts2decomp.size > 0 except AssertionError: self._uvts_cls_logger.exception( "Empty time series resulted, please check your parameters!" 
) sys.exit("STOP") if ts2decomp.index.freq is not None: res = seasonal_decompose(ts2decomp.loc[:, 'y'], model=params['model']) else: res = seasonal_decompose(ts2decomp.loc[:, 'y'], model=params['model'], freq=params['freq']) except ValueError: self._uvts_cls_logger.exception( "ValueError, seasonal_decompose error") else: self._res_decomp = res self._arr_seasonal = res.seasonal self._arr_trend = res.trend self._arr_baseline = self._arr_seasonal + self._arr_trend self.residuals = res.resid self.upper_whisker_res = self.residuals.mean() + 1.5 * ( self.residuals.quantile(0.75) - self.residuals.quantile(0.25)) self.plot_decompose() def ts_stl_decompose(self, params=None): self._res_decomp = None self._arr_seasonal = None self._arr_trend = None self._arr_baseline = None self.residuals = None if params is None: params = dict({'period': 12}) try: assert isinstance(params, dict) except AssertionError: self._uvts_cls_logger.exception( "Dictionary is expected for parameters!") sys.exit("STOP") try: assert 'period' in list(params.keys()) except AssertionError: self._uvts_cls_logger.exception( "Unexpected dictionary keys. At least decomposition " "period must be supplied!") sys.exit("STOP") try: assert isinstance(params['period'], int) except AssertionError: self._uvts_cls_logger.exception( "Unexpected value for the parameter 'period'! " "Integer expected") sys.exit("STOP") try: ts2decomp = self.ts_df if 'from' in list(params.keys()): ts2decomp = ts2decomp[ ts2decomp.index >= datetime.datetime.strptime( params['from'], self.time_format)] if 'to' in list(params.keys()): ts2decomp = ts2decomp[ts2decomp.index <= datetime.datetime. strptime(params['to'], self.time_format)] try: assert ts2decomp.size > 0 except AssertionError: self._uvts_cls_logger.exception( "Empty time series resulted, please check your parameters!" 
) sys.exit("STOP") res = decompose(ts2decomp, period=params['period']) except ValueError: self._uvts_cls_logger.exception("ValueError, stl_decompose error") else: self._res_decomp = res self._arr_seasonal = res.seasonal self._arr_trend = res.trend self._arr_baseline = self._arr_seasonal + self._arr_trend self.residuals = res.resid self.upper_whisker_res = np.asarray(self.residuals.mean() + 1.5 * ( self.residuals.quantile(0.75) - self.residuals.quantile(0.25))) self.plot_decompose() def plot_decompose(self): """Plots the results of time series decomposition""" try: assert self._arr_seasonal is not None except AssertionError: self.ts_decompose() fig, axes = plt.subplots(5, 1, figsize=(20, 9), sharex=True) axes[0].plot(self._res_decomp.observed) axes[0].set_ylabel("Original") # axes[1].plot(self._arr_trend) axes[1].set_ylabel("Trend") # axes[2].plot(self._arr_seasonal) axes[2].set_ylabel("Seasonal") # axes[3].plot(self._arr_baseline) axes[3].set_ylabel("Baseline") # axes[4].plot(self.residuals) axes[4].set_ylabel("Residuals") # if self.upper_whisker_res is not None: axes[4].axhline(y=self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker', linestyle='--', linewidth=1.5) axes[4].axhline(y=-self.upper_whisker_res, xmin=0, xmax=1, color='m', label='upper_whisker', linestyle='--', linewidth=1.5) plt.gcf().autofmt_xdate() plt.grid(True) plt.show() def difference(self, lag=1): diff = list() for i in range(lag, len(self.ts_df)): value = self.ts_df['y'][i] - self.ts_df['y'][i - lag] diff.append(value) return pd.Series(diff) def rolling_mean(self, window=10): return self.ts_df.rolling(window=window).mean() def rolling_variance(self, window=10): return self.ts_df.rolling(window=window).std() def test_adf(self): """Performs Dickey-Fuller test for stationarity""" dftest = adfuller(self.ts_df['y'], autolag='AIC') dfoutput = pd.Series(dftest[0:4], index=[ 'Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used' ]) for key, value in 
dftest[4].items(): dfoutput['Critical Value (%s)' % key] = value print(dfoutput) if dftest[0] > dftest[4]['5%']: print( "Test statistic greater than critical value at 5% --> series seems to be not stationary. " "Look at critical values at 1% and 10% too, ideally they also should be less than test statistic." ) else: print( "Test statistic less than critical value at 5% --> series seems to be stationary. " "Look at critical values at 1% and 10% too, ideally they also should be greater than test statistic." ) def test_kpss(self): """Performs Kwiatkowski-Phillips-Schmidt-Shin test for stationarity""" kpsstest = kpss(self.ts_df['y'], regression='c') kpss_output = pd.Series( kpsstest[0:3], index=['Test Statistic', 'p-value', 'Lags Used']) for key, value in kpsstest[3].items(): kpss_output['Critical Value (%s)' % key] = value print(kpss_output) if kpsstest[0] > kpsstest[3]['5%']: print( "Test statistic greater than critical value at 5% --> series seems to be not stationary. " "Look at critical values at 1% and 10% too, ideally they also should be greater than test statistic." ) else: print( "Test statistic less than critical value at 5% --> series seems to be stationary. " "Look at critical values at 1% and 10% too, ideally they also should be less than test statistic." ) def ndiff(self, tests=['kpss', 'adf', 'pp'], alpha=0.05, max_d=2): """Returns p-values to decide for the value of d-differentiation list of tests given in tests parameter are applied. """ try: assert sum([i in ['kpss', 'adf', 'pp'] for i in tests]) > 0 except AssertionError: self._uvts_cls_logger.exception( "Assertion exception occurred. No valid value for tests! " "Choose from ['kpss', 'adf', 'pp']. You can choose more than one." 
) sys.exit("STOP") do_test = list( compress(['kpss', 'adf', 'pp'], [i in ['kpss', 'adf', 'pp'] for i in tests])) return dict( zip( do_test, list( map( lambda x: ndiffs( self.ts_df['y'], test=x, alpha=alpha, max_d=max_d), do_test)))) def acf_plots(self): """Generates autocorrelation plots""" fig, axes = plt.subplots(3, 2, figsize=(20, 9), sharex=False) # axes[0, 0].plot(self.ts_df['y']) axes[0, 0].set_title('Original Series') plot_acf(self.ts_df['y'], ax=axes[0, 1]) # 1st Differencing axes[1, 0].plot(self.ts_df['y'].diff()) axes[1, 0].set_title('1st Order Differencing') plot_acf(self.ts_df['y'].diff().dropna(), ax=axes[1, 1]) # 2nd Differencing axes[2, 0].plot(self.ts_df['y'].diff().diff()) axes[2, 0].set_title('2nd Order Differencing') plot_acf(self.ts_df['y'].diff().diff().dropna(), ax=axes[2, 1]) # plt.gcf().autofmt_xdate() plt.grid(True) plt.show() def pacf_plots(self): """Generates partial correlation plots""" fig, axes = plt.subplots(3, 2, figsize=(20, 9), sharex=False) # axes[0, 0].plot(self.ts_df['y']) axes[0, 0].set_title('Original Series') plot_pacf(self.ts_df['y'], ax=axes[0, 1]) # 1st Differencing axes[1, 0].plot(self.ts_df['y'].diff()) axes[1, 0].set_title('1st Order Differencing') # axes[0].set(ylim=(0, 5)) plot_pacf(self.ts_df['y'].diff().dropna(), ax=axes[1, 1]) # 2nd Differencing axes[2, 0].plot(self.ts_df['y'].diff().diff()) axes[2, 0].set_title('2nd Order Differencing') plot_pacf(self.ts_df['y'].diff().diff().dropna(), ax=axes[2, 1]) plt.gcf().autofmt_xdate() plt.grid(True) plt.show() @abstractmethod def ts_fit(self, suppress=True): self.model_fit = None raise NotImplementedError("You must override ts_fit!") @abstractmethod def ts_test(self, show_plot=True): raise NotImplementedError("You must override ts_test!") def measure_rmse(self): """Computes root mean squared error on test data """ try: assert self.residuals_forecast is not None except AssertionError: self._uvts_cls_logger.exception( "AssertionError occurred, Cannot compute RMSE! 
Check your object mode" ) self.rmse = np.sqrt( sum(np.square(self.residuals_forecast)) / len(self.residuals_forecast)) """ if self._mode == 'test': self.rmse_test = self.rmse elif self._mode == 'test and validate': self.rmse_val = self.rmse - self.rmse_test elif self._mode == 'validate': self.rmse_val = self.rmse """ def ts_validate(self, suppress=True, show_plot=True): """Validates the model""" if self._mode == 'forecast': # or self._val_dt is None: self._uvts_cls_logger.warning( "Nothing to validate! n_val not set within the initialization or you already " "used ts_forecast. In this case you have to restart and call ts_fit()." ) sys.exit("STOP") self._mode = 'test and validate' self.ts_fit(suppress=suppress) self.ts_test(show_plot=show_plot) def reset(self): for attr in self.__dict__.keys(): setattr(self, attr, None)
class GridSearchClass(object):
    """Class to perform the grid search given the hyper parameters

    Attributes
    ----------
    forecaster: Object (tsa)
        Forecaster object from the tsa package
    hyper_params: dictionary
        A dictionary of hyper parameters
    results: list
        One dictionary per evaluated parameter combination with the keys
        'params', 'rmse' and 'time_elapsed'
    best_model: dictionary
        A dictionary where the best model and respective hyper parameters are saved
    _gs_logger: Logger
        The logger for logging

    Methods
    ----------
    assertions()
        Assertion tests
    set_forecaster()
        Sets new forecaster
    set_hyper_params()
        Sets new hyper parameters
    grid_search()
        Performs grid search trough all combinations of parameters.
        Parameter combinations are generated using hyper parameters
    """

    def __init__(self, **kwargs):
        """Initializes GridSearch class

        Only the keyword arguments 'forecaster' and 'hyper_params' are used,
        all others are ignored.
        """
        self._gs_logger = Logger("grid_search")

        self.forecaster = kwargs.get('forecaster')
        self.hyper_params = kwargs.get('hyper_params')
        self.assertions()

        self.results = list()
        self.best_model = dict()

    def assertions(self):
        """Validates forecaster and hyper parameters.

        Violations are logged and stop the program via sys.exit("STOP"). A
        forecaster having test data (n_test > 0) is switched to mode 'test'.
        """
        if self.forecaster is not None:
            supported = (ProphetForecaster, DLMForecaster, LinearForecaster,
                         ExponentialSmoothingForecaster, ARIMAForecaster,
                         SARIMAForecaster)
            if (not isinstance(self.forecaster, supported)
                    or isinstance(self.forecaster,
                                  UVariateTimeSeriesForecaster)):
                self._gs_logger.error("Unexpected type for forecaster!")
                sys.exit("STOP")

        if self.hyper_params is not None and not isinstance(
                self.hyper_params, dict):
            self._gs_logger.error("Unexpected type for hyper_params")
            sys.exit("STOP")

        if hasattr(self.forecaster, 'n_test'):
            if not self.forecaster.n_test > 0:
                self._gs_logger.error(
                    "No test data specified for this forecaster. Grid search will stop!"
                )
                sys.exit("STOP")
            # the grid search ranks the models by the rmse on the test data
            self.forecaster._mode = 'test'

    def set_forecaster(self, forecaster_obj):
        """Sets the forecaster"""
        self.forecaster = forecaster_obj
        self.assertions()
        return self

    def set_hyper_params(self, hyper_params):
        """Sets hyper parameters"""
        self.hyper_params = hyper_params
        self.assertions()
        return self

    @staticmethod
    def _print_dict(d):
        """Formats a parameter dictionary as one log line per entry"""
        prefix = "....................... | grid_search | INFO : "
        d_info = "".join(
            prefix + str(k) + " : " + str(v) + "\n" for k, v in d.items())
        return "Hyper parameter set: \n" + d_info

    @staticmethod
    def _coerce_value(current, val):
        """Adapts val to the numeric type of the attribute value 'current'.

        Only loss-free conversions between int and float are done. Everything
        else (current is None, bool or str involved, non-integral float for an
        int attribute) is returned unchanged, because attributes may
        legitimately take values of several types.
        """
        if current is None or type(val) is type(current):
            return val
        if isinstance(current, bool) or isinstance(val, bool):
            return val
        if isinstance(current, float) and isinstance(val, int):
            return float(val)
        if (isinstance(current, int) and isinstance(val, float)
                and val.is_integer()):
            return int(val)
        return val

    def grid_search(self, suppress=False, show_plot=True):
        """Performs the grid search

        Via generating all possible combinations of parameters. The
        combinations are derived from the hyper parameters. This method assumes
        that attributes of a forecaster start with '_': hyper parameter 'p' is
        written to the attribute '_p'; parameters without such an attribute are
        skipped with a warning.
        The best model is chosen using rmse computed on the test data as the
        measure for the goodness of the forecaster.

        Parameters
        ----------
        suppress: bool
            Passed to ts_fit() of the forecaster
        show_plot: bool
            Passed to ts_test() of the forecaster

        Returns
        -------
        self; 'results' holds one entry per combination, 'best_model' the
        winner (empty when no combination produced a comparable rmse).
        """
        if self.forecaster is None or self.hyper_params is None:
            self._gs_logger.error(
                "Forecaster and hyper parameters have to be set first! "
                "Call set_forecaster(...) and set_hyper_params(...)")
            sys.exit("STOP")

        # set-up parameter sets; scalars become one-element lists
        # (a new dictionary is built, the one passed in stays untouched)
        self.hyper_params = {
            p: v if isinstance(v, list) else [v]
            for p, v in self.hyper_params.items()
        }
        names = list(self.hyper_params.keys())
        params = [
            dict(zip(names, combination))
            for combination in itertools.product(*self.hyper_params.values())
        ]
        self._gs_logger.info(
            "{} number of parameter combinations generated".format(
                len(params)))

        # reset
        self.results = list()
        self.best_model = dict()
        rmse = float('inf')

        for param_set in params:
            self._gs_logger.info(self._print_dict(param_set))

            for p, val in param_set.items():
                attr = '_' + str(p)
                if attr not in self.forecaster.__dict__:
                    self._gs_logger.warning(
                        "Attribute {} not found. Default value will be used only."
                        .format(attr))
                    continue
                coerced = self._coerce_value(getattr(self.forecaster, attr),
                                             val)
                if type(coerced) is not type(val):
                    self._gs_logger.info(
                        "Parameter type mismatch found, however, conversion successful"
                    )
                setattr(self.forecaster, attr, coerced)

            # call ts_fit() and ts_test()
            start = time()
            self.forecaster.ts_fit(suppress=suppress)
            self.forecaster.ts_test(show_plot=show_plot)
            time_elapsed = time() - start

            current_results = {
                'params': self.forecaster.get_params_dict(),
                'rmse': self.forecaster.rmse,
                'time_elapsed': time_elapsed,
            }
            self.results.append(current_results)

            if current_results['rmse'] < rmse:
                rmse = current_results['rmse']
                # keep a copy, the forecaster itself is modified in the next round
                self.best_model['forecaster'] = self.forecaster.__copy__()
                self.best_model['hyper_params'] = current_results['params']
                self.best_model['rmse'] = current_results['rmse']
                self.best_model['time_elapsed'] = current_results[
                    'time_elapsed']

        if not self.best_model:
            # e.g. no combinations at all, or every rmse was NaN
            self._gs_logger.warning(
                "No best model found: no parameter combination resulted in a comparable rmse."
            )
            return self

        self._gs_logger.info("Best parameter combination:")
        self._gs_logger.info(self._print_dict(self.best_model['hyper_params']))
        self._gs_logger.info("RMSE {} :".format(self.best_model['rmse']))

        return self