def model_local_linear_trend(endog=None, params=None, direct=False):
    """Build a local linear trend model with a diffuse initialization.

    Parameters
    ----------
    endog : array_like, optional
        Observed series. Defaults to three fixed values followed by seven
        ones.
    params : sequence of three floats, optional
        ``(sigma2_y, sigma2_mu, sigma2_beta)``. Defaults to
        ``[1.993, 8.253, 2.334]``.
    direct : bool, optional
        If True, the representation is assembled by hand on a
        ``KalmanSmoother``; otherwise it is taken from
        ``UnobservedComponents(endog, 'lltrend')``.

    Returns
    -------
    mod
        The ``UnobservedComponents`` instance, or None when ``direct``.
    ssm
        The state space representation, initialized as diffuse.
    """
    if endog is None:
        endog = np.r_[10.2394, 4.2039, 6.123123, [1] * 7]
    if params is None:
        params = [1.993, 8.253, 2.334]
    # Unpacked up front so a wrong-length `params` fails in both branches
    sigma2_y, sigma2_mu, sigma2_beta = params

    if not direct:
        mod = UnobservedComponents(endog, 'lltrend')
        mod.update(params)
        ssm = mod.ssm
        ssm.initialize(Initialization(ssm.k_states, 'diffuse'))
        return mod, ssm

    # Hand-built representation: one observed series, two states
    ssm = KalmanSmoother(k_endog=1, k_states=2, k_posdef=2)
    ssm.bind(endog)
    ssm.initialize(
        Initialization(ssm.k_states, initialization_type='diffuse'))
    # Note: setting ssm.filter_univariate = True should not be required

    # System matrices of the local linear trend model
    ssm['design', 0, 0] = 1
    ssm['obs_cov', 0, 0] = sigma2_y
    ssm['transition'] = np.array([[1, 1],
                                  [0, 1]])
    ssm['selection'] = np.eye(2)
    ssm['state_cov'] = np.diag([sigma2_mu, sigma2_beta])
    return None, ssm
def model_local_level(endog=None, params=None, direct=False):
    """Build a local level model with a diffuse initialization.

    Parameters
    ----------
    endog : array_like, optional
        Observed series. Defaults to one fixed value followed by nine ones.
    params : sequence of two floats, optional
        ``(sigma2_y, sigma2_mu)``. Defaults to ``[1.993, 8.253]``.
    direct : bool, optional
        If True, the representation is assembled by hand on a
        ``KalmanSmoother``; otherwise it is taken from
        ``UnobservedComponents(endog, 'llevel')``.

    Returns
    -------
    mod
        The ``UnobservedComponents`` instance, or None when ``direct``.
    ssm
        The state space representation, initialized as diffuse.
    """
    if endog is None:
        endog = np.r_[10.2394, [1] * 9]
    if params is None:
        params = [1.993, 8.253]
    # Unpacked up front so a wrong-length `params` fails in both branches
    sigma2_y, sigma2_mu = params

    if not direct:
        mod = UnobservedComponents(endog, 'llevel')
        mod.update(params)
        ssm = mod.ssm
        ssm.initialize(Initialization(ssm.k_states, 'diffuse'))
        return mod, ssm

    # Hand-built representation: one observed series, one state
    ssm = KalmanSmoother(k_endog=1, k_states=1, k_posdef=1)
    ssm.bind(endog)
    ssm.initialize(
        Initialization(ssm.k_states, initialization_type='diffuse'))
    # Note: setting ssm.filter_univariate = True should not be required

    # System matrices of the local level model, filled in the same order
    # as they would be written out one by one
    system_matrices = (
        ('design', 1),
        ('obs_cov', sigma2_y),
        ('transition', 1),
        ('selection', 1),
        ('state_cov', sigma2_mu),
    )
    for matrix_name, value in system_matrices:
        ssm[matrix_name, :] = value
    return None, ssm
def test_irrelevant_state():
    """Compare approximate and exact diffuse filtering with an idle state."""
    # This test records a case in which exact diffuse initialization leads
    # to numerical problems, because the existence of an irrelevant state
    # initialized as diffuse means that there is never a transition to the
    # usual Kalman filter.
    endog = macrodata.infl
    spec = {
        'freq_seasonal': [{'period': 8, 'harmonics': 6},
                          {'period': 36, 'harmonics': 6}]
    }
    params = (3.4, 7.2, 0.01, 0.01)

    def smooth_with(init_type, univariate):
        # Build a fresh model, optionally switch to univariate filtering,
        # then set the requested initialization and smooth
        mod = UnobservedComponents(endog, 'llevel', **spec)
        if univariate:
            mod.ssm.filter_univariate = True
        mod.ssm.initialization = Initialization(mod.k_states, init_type)
        return mod.smooth(list(params))

    res_approx = smooth_with('approximate_diffuse', univariate=False)
    res_exact = smooth_with('diffuse', univariate=True)

    # Check that e.g. the filtered state for the level is equal
    assert_allclose(res_approx.filtered_state[0, 25:],
                    res_exact.filtered_state[0, 25:], atol=1e-5)
def test_forecast():
    """Forecasting with exog continues a deterministic line exactly."""
    nobs = 50
    horizon = 10

    exog = np.arange(nobs)
    endog = np.arange(nobs) + 10

    mod = UnobservedComponents(endog, exog=exog, level='dconstant')
    res = mod.smooth([1e-15, 1])

    # Out-of-sample regressor values, passed as a column vector
    future = np.arange(nobs, nobs + horizon)
    actual = res.forecast(horizon, exog=future[:, np.newaxis])

    assert_allclose(actual, future + 10)
def test_mle_reg():
    """Regression coefficient is recovered with and without MLE regression.

    The exogenous series is exactly twice the (unperturbed) endogenous
    series, so the coefficient should be close to 0.5 whether it is
    carried in the state vector (``mle_regression=False``) or estimated as
    a parameter (``mle_regression=True``).
    """
    endog = np.arange(100)*1.0
    exog = endog*2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    # Warnings raised while fitting are captured and discarded; the list of
    # recorded warnings is not inspected, so it is not bound to a name.
    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=False)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=True)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1],
                    0.5, atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)
def test_start_params():
    """Starting parameters are sensible with exog and AR(2) components."""
    # Test that the behavior is correct for multiple exogenous and / or
    # autoregressive components

    # Parameters
    nobs = int(1e4)
    reg_coefs = np.r_[10, -2]
    ar_coefs = np.r_[0.5, 0.1]

    # Generate data
    np.random.seed(1234)
    exog = np.column_stack([np.ones(nobs), np.arange(nobs) * 1.0])
    shocks = np.random.normal(size=nobs)

    # Two leading zeros serve as pre-sample values of the recursion
    latent = np.zeros(nobs + 2)
    for t in range(1, nobs):
        latent[t + 1] = (ar_coefs[0] * latent[t]
                         + ar_coefs[1] * latent[t - 1]
                         + shocks[t])
    endog = latent[2:] + np.dot(exog, reg_coefs)

    # Now just test that the starting parameters are approximately what
    # they ought to be (could make this arbitrarily precise by increasing
    # nobs, but that would slow down the test for no real gain)
    mod = UnobservedComponents(endog, exog=exog, autoregressive=2)
    assert_allclose(mod.start_params, [1., 0.5, 0.1, 10, -2], atol=1e-1)
def test_mle_reg():
    """Regression coefficient is about 0.5 under both regression modes."""
    nobs = 100
    endog = np.arange(nobs) * 1.0
    exog = endog * 2

    # Make the fit not-quite-perfect: +0.01 on even positions, -0.01 on odd
    endog += np.where(np.arange(nobs) % 2 == 0, 0.01, -0.01)

    # Fit once with the coefficient in the state vector (False) and once
    # with it estimated as a parameter (True), discarding any warnings
    with warnings.catch_warnings(record=True):
        fitted = {}
        for use_mle in (False, True):
            mod = UnobservedComponents(endog, irregular=True, exog=exog,
                                       mle_regression=use_mle)
            fitted[use_mle] = mod.fit(disp=-1)

    state_coef = fitted[False].regression_coefficients.filtered[0, -1]
    assert_allclose(state_coef, 0.5, atol=1e-5)
    assert_allclose(fitted[True].params[1], 0.5, atol=1e-5)
def test_apply_results():
    """`apply` on new data matches a model built on that data directly."""
    endog = np.arange(100)
    exog = np.ones_like(endog)
    params = [1., 1., 0.1, 1.]
    head, tail = slice(None, 50), slice(50, None)

    # Reference: model constructed on the first half
    res1 = UnobservedComponents(
        endog[head], 'llevel', exog=exog[head]).smooth(params)

    # Model constructed on the second half, then re-applied to the first
    res2 = UnobservedComponents(
        endog[tail], 'llevel', exog=exog[tail]).smooth(params)
    res3 = res2.apply(endog[head], exog=exog[head])

    assert_equal(res1.specification, res3.specification)

    summary_attrs = [
        'nobs', 'llf', 'llf_obs', 'loglikelihood_burn',
        'cov_params_default',
    ]
    filter_smoother_attrs = [
        'filtered_state', 'filtered_state_cov', 'predicted_state',
        'predicted_state_cov', 'forecasts', 'forecasts_error',
        'forecasts_error_cov', 'standardized_forecasts_error',
        'forecasts_error_diffuse_cov', 'predicted_diffuse_state_cov',
        'scaled_smoothed_estimator', 'scaled_smoothed_estimator_cov',
        'smoothing_error', 'smoothed_state', 'smoothed_state_cov',
        'smoothed_state_autocov', 'smoothed_measurement_disturbance',
        'smoothed_state_disturbance',
        'smoothed_measurement_disturbance_cov',
        'smoothed_state_disturbance_cov',
    ]
    for attr in summary_attrs + filter_smoother_attrs:
        assert_equal(getattr(res3, attr), getattr(res1, attr))

    future_exog = np.ones(10)
    assert_allclose(res3.forecast(10, exog=future_exog),
                    res1.forecast(10, exog=future_exog))
def test_mle_reg(use_exact_diffuse):
    """Regression coefficient and diffuse bookkeeping for both init modes.

    Parameters
    ----------
    use_exact_diffuse : bool
        Forwarded to ``UnobservedComponents``; selects which of the
        diffuse-period attributes is checked at the end.
    """
    endog = np.arange(100) * 1.0
    exog = endog * 2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    # Warnings raised while fitting are captured and discarded
    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=False,
                                    use_exact_diffuse=use_exact_diffuse)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=True,
                                    use_exact_diffuse=use_exact_diffuse)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1],
                    0.5, atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)

    # When the regression component is part of the state vector with exact
    # diffuse initialization, we have two diffuse observations
    if use_exact_diffuse:
        assert_equal(res1.nobs_diffuse, 2)
        assert_equal(res2.nobs_diffuse, 0)
    else:
        assert_equal(res1.loglikelihood_burn, 1)
        assert_equal(res2.loglikelihood_burn, 0)
def test_recreate_model():
    """A model rebuilt from `_get_init_kwds()` is equivalent to the original."""
    nobs = 100
    endog = np.ones(nobs) * 2.0
    exog = np.ones(nobs)

    levels = [
        'irregular', 'ntrend', 'fixed intercept', 'deterministic constant',
        'dconstant', 'local level', 'llevel', 'random walk', 'rwalk',
        'fixed slope', 'deterministic trend', 'dtrend',
        'local linear deterministic trend', 'lldtrend',
        'random walk with drift', 'rwdrift', 'local linear trend',
        'lltrend', 'smooth trend', 'strend', 'random trend', 'rtrend']

    # Note: have to add in some stochastic component, otherwise we have
    # problems with entirely deterministic models
    stochastic_components = [
        # level + stochastic seasonal
        dict(seasonal=2, stochastic_seasonal=True),
        # level + autoregressive
        dict(autoregressive=1),
        # level + stochastic cycle
        dict(cycle=True, stochastic_cycle=True, damped_cycle=True),
    ]

    for level in levels:
        for component in stochastic_components:
            original = UnobservedComponents(
                endog, level=level, exog=exog, **component)
            rebuilt = UnobservedComponents(
                endog, exog=exog, **original._get_init_kwds())
            check_equivalent_models(original, rebuilt)
def test_matrices_somewhat_complicated_model():
    """Check system matrices of a trend + 2 seasonals + cycle model."""
    values = dta.copy()

    model = UnobservedComponents(
        values['unemp'],
        level='lltrend',
        freq_seasonal=[{'period': 4}, {'period': 9, 'harmonics': 3}],
        cycle=True,
        cycle_period_bounds=[2, 30],
        damped_cycle=True,
        stochastic_freq_seasonal=[True, False],
        stochastic_cycle=True,
    )

    # Selected parameters
    irregular_var = 1
    level_var, trend_var = 3, 4
    freq_seasonal_var_0 = 5
    cycle_var, cycle_freq, cycle_damp = 6, 2*np.pi/30., .9
    params = [irregular_var, level_var, trend_var, freq_seasonal_var_0,
              cycle_var, cycle_freq, cycle_damp]
    model.update(params)

    # Check scalar properties
    assert_equal(model.k_states, 2 + 4 + 6 + 2)
    assert_equal(model.k_state_cov, 2 + 1 + 0 + 1)
    assert_equal(model.loglikelihood_burn, 2 + 4 + 6 + 2)
    assert_allclose(model.ssm.k_posdef, 2 + 4 + 0 + 2)
    assert_equal(model.k_params, len(params))

    # Check the statespace model matrices against hand-constructed answers
    # Every one of the seven 2-state pairs loads on the observation
    # through its first element only
    expected_design = np.array([[1, 0] * 7])
    assert_allclose(model.ssm.design[:, :, 0], expected_design)

    def rotation(angle):
        # 2x2 block [[cos, sin], [-sin, cos]] for one harmonic
        return np.array([[np.cos(angle), np.sin(angle)],
                         [-np.sin(angle), np.cos(angle)]])

    trend_block = np.array([[1, 1],
                            [0, 1]])
    seasonal_4_block = np.array([[0, 1, 0, 0],
                                 [-1, 0, 0, 0],
                                 [0, 0, -1, 0],
                                 [0, 0, 0, -1]])
    # Period-9 seasonal: three harmonics on the block diagonal
    seasonal_9_block = np.zeros((6, 6))
    seasonal_9_block[0:2, 0:2] = rotation(2*np.pi*1/9.)
    seasonal_9_block[2:4, 2:4] = rotation(2*np.pi*2/9.)
    seasonal_9_block[4:6, 4:6] = rotation(2*np.pi/3.)
    # Damped cycle: rotation scaled by the damping factor
    cycle_block = .9 * rotation(2*np.pi/30.)

    expected_transition = __direct_sum([
        trend_block, seasonal_4_block, seasonal_9_block, cycle_block])
    assert_allclose(
        model.ssm.transition[:, :, 0], expected_transition, atol=1e-7)

    # Since the second seasonal term is not stochastic,
    # the dimensionality of the state disturbance is 14 - 6 = 8
    expected_selection = np.zeros((14, 14 - 6))
    for rows, cols, size in ((slice(0, 2), slice(0, 2), 2),
                             (slice(2, 6), slice(2, 6), 4),
                             (slice(-2, None), slice(-2, None), 2)):
        expected_selection[rows, cols] = np.eye(size)
    assert_allclose(model.ssm.selection[:, :, 0], expected_selection)

    expected_state_cov = __direct_sum([
        np.diag(params[1:3]),
        np.eye(4)*params[3],
        np.eye(2)*params[4],
    ])
    assert_allclose(model.ssm.state_cov[:, :, 0], expected_state_cov)
def run_ucm(name):
    """Run the shared checks for one named set of reference results.

    For every model specification stored under ``name`` in
    ``results_structural`` this builds an ``UnobservedComponents`` model on
    the ``unemp`` series, checks the parameter transform round-trip, the
    cycle frequency bounds and the log-likelihood at the reference
    parameters, smoke-tests plotting, and finally fits by MLE.

    Parameters
    ----------
    name : str
        Attribute of ``results_structural`` holding a dict with the keys
        ``'models'``, ``'kwargs'``, ``'params'`` and ``'llf'`` (and
        optionally ``'rtol'`` / ``'atol'``).
    """
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01',
                                         periods=len(dta), freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()

            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        roundtrip = mod.transform_params(
            mod.untransform_params(mod.start_params))
        assert_allclose(mod.start_params, roundtrip)

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5*4, 12*4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5*12, 12*12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        assert_equal(mod.cycle_frequency_bound,
                     (2*np.pi / cycle_period_bounds[1],
                      2*np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)
        assert_allclose(res_true.llf, true['llf'], rtol=rtol, atol=atol)

        # Optional smoke test for plot_components
        try:
            import matplotlib.pyplot as plt
            try:
                from pandas.plotting import register_matplotlib_converters
                register_matplotlib_converters()
            except ImportError:
                pass
            fig = plt.figure()
            try:
                res_true.plot_components(fig=fig)
            finally:
                # This runs once per model specification; close the figure
                # so that open figures do not pile up across iterations.
                plt.close(fig)
        except ImportError:
            pass

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            res = mod.fit(disp=-1)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R
            if res.llf <= true['llf']:
                assert_allclose(res.llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
def test_custom_model_fit(rand_data, pre_int_period, post_int_period,
                          monkeypatch):
    """Check the keyword arguments `CausalImpact` forwards to `model.fit`.

    Each scenario replaces ``fit`` on a user-supplied model with a shared
    mock, constructs ``CausalImpact`` and asserts on the most recent call
    recorded by the mock.
    """
    free = (None, None)

    def around(sd):
        # Bounds expected for the level variance given a prior level sd
        return (sd / 1.2, sd * 1.2)

    fit_mock = mock.Mock()
    monkeypatch.setattr(
        'causalimpact.main.CausalImpact._process_posterior_inferences',
        mock.Mock())

    def build(**spec):
        # Create a model whose `fit` is the shared mock
        custom = UnobservedComponents(**spec)
        custom.fit = fit_mock
        return custom

    def run(custom, **options):
        CausalImpact(rand_data, pre_int_period, post_int_period,
                     model=custom, **options)

    def expect(bounds, **options):
        fit_mock.assert_called_with(bounds=bounds, nseasons=[],
                                    standardize=True, **options)

    pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1], :]

    # Local level model with all regressors
    model = build(endog=pre_data.iloc[:, 0], level='llevel',
                  exog=pre_data.iloc[:, 1:])

    run(model)
    expect([free, around(0.01), free, free], disp=False)

    run(model, disp=True)
    expect([free, around(0.01), free, free], disp=True)

    run(model, disp=True, prior_level_sd=0.01)
    expect([free, around(0.01), free, free], disp=True,
           prior_level_sd=0.01)

    run(model, disp=True, prior_level_sd=None)
    expect([free, free, free, free], disp=True, prior_level_sd=None)

    # Local level plus a frequency-domain seasonal
    model = build(endog=pre_data.iloc[:, 0], level='llevel',
                  exog=pre_data.iloc[:, 1:],
                  freq_seasonal=[{'period': 3}])
    run(model, disp=True, prior_level_sd=0.001)
    expect([free, around(0.001), free, free, free], disp=True,
           prior_level_sd=0.001)

    # Boolean component flags and a single regressor column
    model = build(endog=pre_data.iloc[:, 0], level=True,
                  exog=pre_data.iloc[:, 1], trend=True, seasonal=3,
                  stochastic_level=True)
    run(model, disp=True, prior_level_sd=0.001)
    expect([around(0.001), free, free], disp=True, prior_level_sd=0.001)

    # Models restricted to the 'y' and 'x1' columns
    new_pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1],
                                 ['y', 'x1']]

    model = build(endog=new_pre_data.iloc[:, 0], level='llevel',
                  exog=new_pre_data.iloc[:, 1:])
    run(model, disp=False)
    expect([free, around(0.01), free], disp=False)

    model = build(endog=new_pre_data.iloc[:, 0], level='dtrend',
                  exog=new_pre_data.iloc[:, 1:])
    run(model, disp=False)
    expect([free, free], disp=False)

    model = build(endog=new_pre_data.iloc[:, 0], level='lltrend',
                  exog=new_pre_data.iloc[:, 1:])
    run(model, disp=False)
    expect([free, around(0.01), free, free], disp=False)
class CausalImpact:
    """
    Causal inference through counterfactual predictions using a Bayesian
    structural time-series model.

    The model is fitted on the observations preceding the intervention and
    then used to forecast what ``y`` would have been without it; the
    difference to the actual values is reported pointwise and cumulatively.
    """

    def __init__(self, data, inter_date, n_seasons=7):
        """Main constructor.

        :param pandas.DataFrame data: input data. Must contain at least 2
            columns, one being named 'y'. See the README for more details.
        :param object inter_date: date of intervention. Must be of same
            type of the data index elements. This should usually be int or
            datetime.date
        :param int n_seasons: number of seasons in the seasonal component
            of the BSTS model
        :raises ValueError: if ``inter_date`` is not in the data index, if
            ``n_seasons`` is below 2, or if fewer than ``n_seasons``
            observations precede the intervention
        """
        # Constructor arguments
        self.data = data.reset_index(drop=True)  # Input data, reset index
        self.inter_date = inter_date  # Intervention date as passed in
        self.n_seasons = n_seasons  # Seasons in the seasonal component

        # DataFrame holding the results of the BSTS model predictions.
        self.result = None

        # Private attributes for modeling purposes only
        self._input_index = data.index  # Input data index
        self._inter_index = None  # Intervention position in reset index
        self._model = None  # statsmodels BSTS model
        self._fit = None  # statsmodels BSTS fitted model

        # Checking input arguments
        self._check_input()
        self._check_model_args()

    def _check_input(self):
        """Locate the intervention in the input index and seed `result`.

        :raises ValueError: if the intervention date is not in the index
        """
        try:
            self._inter_index = self._input_index.tolist().index(
                self.inter_date)
        except ValueError:
            raise ValueError(
                'Input intervention date could not be found in data index.')
        self.result = self.data.copy()

    def _check_model_args(self):
        """Check if input arguments are compatible with the data.

        :raises ValueError: if the seasonal period is below 2, or if the
            number of pre-intervention observations is smaller than the
            number of seasons
        """
        if self.n_seasons < 2:
            raise ValueError(
                'Seasonal component must have a seasonal period of at least 2.'
            )
        # `_inter_index` is the number of observations available for
        # training, i.e. those located before the intervention.
        if self._inter_index < self.n_seasons:
            raise ValueError(
                'Training data contains fewer samples than number of seasons in BSTS model.'
            )

    def run(self, max_iter=1000, return_df=False):
        """Fit the BSTS model to the data.

        :param int max_iter: max number of iterations in
            UnobservedComponents.fit (maximum likelihood estimator)
        :param bool return_df: set to `True` if you want this method to
            return the dataframe of model results
        :return: None or pandas.DataFrame of results
        """
        # `.loc` slicing is label based and inclusive, hence the `- 1` to
        # keep only pre-intervention rows of the reset index.
        self._model = UnobservedComponents(
            self.data.loc[:self._inter_index - 1, self._obs_col()].values,
            exog=self.data.loc[:self._inter_index - 1,
                               self._reg_cols()].values,
            level='local linear trend',
            seasonal=self.n_seasons,
        )
        self._fit = self._model.fit(maxiter=max_iter)
        self._get_estimates()
        self._get_difference_estimates()
        self._get_cumulative_estimates()

        if return_df:
            return self.result

    def _get_estimates(self):
        """Extract the model estimate (before and after intervention) as
        well as the bounds returned by ``conf_int()``.

        Adds the columns ``pred``, ``pred_conf_int_lower`` and
        ``pred_conf_int_upper`` to ``self.result``.
        """
        # Left: model before date of intervention (allows to evaluate fit
        # quality)
        lpred = self._fit.get_prediction()
        # Right: best prediction of y without any intervention
        rpred = self._fit.get_forecast(
            steps=self.data.shape[0] - self._inter_index,
            exog=self.data.loc[self._inter_index:, self._reg_cols()])

        # Model prediction
        self.result = self.result.assign(
            pred=np.concatenate([lpred.predicted_mean, rpred.predicted_mean]))

        # Confidence interval bounds, for both parts
        lower_conf_ints = []
        upper_conf_ints = []
        for pred in [lpred, rpred]:
            conf_int = pred.conf_int()
            if isinstance(conf_int, np.ndarray):
                # As of 0.9.0, statsmodels returns a np.ndarray here
                lower_conf_ints.append(conf_int[:, 0])
                upper_conf_ints.append(conf_int[:, 1])
            else:
                # instead of a dataframe with "lower y" and "upper y"
                # columns
                lower_conf_ints.append(conf_int.loc[:, 'lower y'].values)
                upper_conf_ints.append(conf_int.loc[:, 'upper y'].values)

        self.result = self.result.assign(
            pred_conf_int_lower=np.concatenate(lower_conf_ints))
        self.result = self.result.assign(
            pred_conf_int_upper=np.concatenate(upper_conf_ints))

    def _get_difference_estimates(self):
        """Extract the difference between the actuals and the model
        prediction, as well as the related interval.

        Adds the columns ``pred_diff``, ``pred_diff_conf_int_lower`` and
        ``pred_diff_conf_int_upper`` to ``self.result``.
        """
        # Difference between actuals and model
        self.result = self.result.assign(
            pred_diff=self.data[self._obs_col()].values - self.result['pred'])

        # Interval of the difference: subtracting the upper prediction
        # bound yields the lower bound of the difference, and vice versa
        self.result = self.result.assign(
            pred_diff_conf_int_lower=self.data[self._obs_col()] -
            self.result['pred_conf_int_upper'])
        self.result = self.result.assign(
            pred_diff_conf_int_upper=self.data[self._obs_col()] -
            self.result['pred_conf_int_lower'])

    def _get_cumulative_estimates(self):
        """Extract the estimate of the cumulative impact of the
        intervention, and its interval.

        Adds the columns ``cum_impact``, ``cum_impact_conf_int_lower`` and
        ``cum_impact_conf_int_upper`` to ``self.result``; all three are
        zero before the intervention.
        """
        # Cumulative sum of modeled impact. The column is created as float
        # so that the float values written below do not force a change of
        # dtype on an integer column.
        self.result = self.result.assign(cum_impact=0.0)
        self.result.loc[self._inter_index:, 'cum_impact'] = (
            self.data[self._obs_col()] -
            self.result['pred']).loc[self._inter_index:].cumsum()

        # Interval of the cumulative sum: square root of the running sum
        # of squared half-widths of the prediction interval
        radius_cumsum = np.sqrt(
            ((self.result['pred'] - self.result['pred_conf_int_lower']
              ).loc[self._inter_index:]**2).cumsum())
        self.result = self.result.assign(cum_impact_conf_int_lower=0.0,
                                         cum_impact_conf_int_upper=0.0)
        self.result.loc[self._inter_index:, 'cum_impact_conf_int_lower'] = \
            self.result['cum_impact'].loc[self._inter_index:] - radius_cumsum
        self.result.loc[self._inter_index:, 'cum_impact_conf_int_upper'] = \
            self.result['cum_impact'].loc[self._inter_index:] + radius_cumsum

    def _obs_col(self):
        """Get name of column to be modeled in input data.

        :return: column name
        :rtype: str
        """
        return 'y'

    def _reg_cols(self):
        """Get names of columns used in the regression component of the
        model, i.e. every column except the observed one.

        :return: the column names
        :rtype: pandas.indexes.base.Index
        """
        return self.data.columns.difference([self._obs_col()])

    def plot_components(self):
        """Plot the estimated components of the model.
        """
        self._fit.plot_components(figsize=(15, 9), legend_loc='lower right')
        plt.show()

    def plot(self, split=False):
        """Produce final impact plots.

        Note: the first few observations are not shown due to approximate
        diffuse initialization.

        :param bool split: set to `True` if you want to split plot of input
            data into multiple charts. Default: `False`.
        """
        # Number of leading observations left out of the model curves
        min_t = 2 if self.n_seasons is None else self.n_seasons + 1

        # One extra row per regressor when the charts are split
        n_plots = 3 + split * len(self._reg_cols())
        grid = gs.GridSpec(n_plots, 1)
        plt.figure(figsize=(15, 4 * n_plots))

        # Observation and regression components
        ax1 = plt.subplot(grid[0, :])

        # Regression components
        for i, col in enumerate(self._reg_cols()):
            plt.plot(self.data[col], label=col)

            if split:
                # Finish the current chart, then create a new subplot for
                # whatever is drawn next
                plt.axvline(self._inter_index, c='k', linestyle='--')
                plt.title(col)
                ax = plt.subplot(grid[i + 1, :], sharex=ax1)
                plt.setp(ax.get_xticklabels(), visible=False)

        # Model and confidence intervals
        plt.plot(self.result['pred'].iloc[min_t:],
                 'r--',
                 linewidth=2,
                 label='model')
        plt.plot(self.data[self._obs_col()],
                 'k',
                 linewidth=2,
                 label=self._obs_col())
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_conf_int_lower'].iloc[min_t:],
            self.result['pred_conf_int_upper'].iloc[min_t:],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.legend(loc='upper left')
        plt.title('Observation vs prediction')

        # Pointwise difference
        ax2 = plt.subplot(grid[-2, :], sharex=ax1)
        plt.plot(self.result['pred_diff'].iloc[min_t:], 'r--', linewidth=2)
        plt.plot(self.data.index,
                 np.zeros(self.data.shape[0]),
                 'g-',
                 linewidth=2)
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_diff_conf_int_lower'].iloc[min_t:],
            self.result['pred_diff_conf_int_upper'].iloc[min_t:],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.setp(ax2.get_xticklabels(), visible=False)
        plt.title('Difference')

        # Cumulative impact
        ax3 = plt.subplot(grid[-1, :], sharex=ax1)
        plt.plot(self.data.index,
                 self.result['cum_impact'],
                 'r--',
                 linewidth=2)
        plt.plot(self.data.index,
                 np.zeros(self.data.shape[0]),
                 'g-',
                 linewidth=2)
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index,
            self.result['cum_impact_conf_int_lower'],
            self.result['cum_impact_conf_int_upper'],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.axis([self.data.index[0], self.data.index[-1], None, None])
        ax3.set_xticklabels(self._input_index, rotation=45)
        plt.locator_params(axis='x', nbins=min(12, self.data.shape[0]))
        plt.title('Cumulative Impact')
        plt.xlabel('$T$')
        plt.show()
class CausalImpact: """ Causal inference through counterfactual predictions using a Bayesian structural time-series model. """ def __init__(self, data, inter_date, model_args=None): """Main constructor. :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'. See the README for more details. :param object inter_date: date of intervention. Must be of same type of the data index elements. This should usually be int of datetime.date :param {str: object} model_args: parameters of the model > max_iter: number of samples in the MCMC sampling > n_seasons: number of seasons in the seasonal component of the BSTS model """ self.data = None # Input data, with a reset index self.data_index = None # Data initial index self.data_inter = None # Data intervention date, relative to the reset index self.model = None # statsmodels BSTS model self.fit = None # statsmodels BSTS fitted model self.model_args = None # BSTS model arguments # Checking input arguments self._check_input(data, inter_date) self._check_model_args(model_args) def run(self): """Fit the BSTS model to the data. """ self.model = UnobservedComponents( self.data.loc[:self.data_inter - 1, self._obs_col()].values, exog=self.data.loc[:self.data_inter - 1, self._reg_cols()].values, level='local linear trend', seasonal=self.model_args['n_seasons'], ) self.fit = self.model.fit( maxiter=self.model_args['max_iter'], ) def _check_input(self, data, inter_date): """Check input data. :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'. See the README for more details. :param object inter_date: date of intervention. Must be of same type of the data index elements. 
This should usually be int of datetime.date """ self.data_index = data.index self.data = data.reset_index(drop=True) try: self.data_inter = self.data_index.tolist().index(inter_date) except ValueError: raise ValueError('Input intervention date could not be found in data index.') def _check_model_args(self, model_args): """Check input arguments, and add missing ones if needed. :return: the valid dict of arguments :rtype: {str: object} """ if model_args is None: model_args = {} for key, val in DEFAULT_ARGS.items(): if key not in model_args: model_args[key] = val self.model_args = model_args def _obs_col(self): """Get name of column to be modeled in input data. :return: column name :rtype: str """ return 'y' def _reg_cols(self): """Get names of columns used in the regression component of the model. :return: the column names :rtype: pandas.indexes.base.Index """ return self.data.columns.difference([self._obs_col()]) def plot_components(self): """Plot the estimated components of the model. """ self.fit.plot_components(figsize=(15, 9), legend_loc='lower right') plt.show() def plot(self): """Produce final impact plots. 
""" min_t = 2 if self.model_args['n_seasons'] is None else self.model_args['n_seasons'] + 1 # Data model before date of intervention - allows to evaluate quality of fit pred = self.fit.get_prediction() pre_model = pred.predicted_mean pre_lower = pred.conf_int()['lower y'].values pre_upper = pred.conf_int()['upper y'].values pre_model[:min_t] = np.nan pre_lower[:min_t] = np.nan pre_upper[:min_t] = np.nan # Best prediction of y without any intervention post_pred = self.fit.get_forecast( steps=self.data.shape[0] - self.data_inter, exog=self.data.loc[self.data_inter:, self._reg_cols()] ) post_model = post_pred.predicted_mean post_lower = post_pred.conf_int()['lower y'].values post_upper = post_pred.conf_int()['upper y'].values plt.figure(figsize=(15, 12)) # Observation and regression components ax1 = plt.subplot(3, 1, 1) for col in self._reg_cols(): plt.plot(self.data[col], label=col) plt.plot(np.concatenate([pre_model, post_model]), 'r--', linewidth=2, label='model') plt.plot(self.data[self._obs_col()], 'k', linewidth=2, label=self._obs_col()) plt.axvline(self.data_inter, c='k', linestyle='--') plt.fill_between( self.data.loc[:self.data_inter - 1].index, pre_lower, pre_upper, facecolor='gray', interpolate=True, alpha=0.25, ) plt.fill_between( self.data.loc[self.data_inter:].index, post_lower, post_upper, facecolor='gray', interpolate=True, alpha=0.25, ) plt.setp(ax1.get_xticklabels(), visible=False) plt.legend(loc='upper left') plt.title('Observation vs prediction') # Pointwise difference ax2 = plt.subplot(312, sharex=ax1) plt.plot(self.data[self._obs_col()] - np.concatenate([pre_model, post_model]), 'r--', linewidth=2) plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2) plt.axvline(self.data_inter, c='k', linestyle='--') plt.fill_between( self.data.loc[:self.data_inter - 1].index, self.data.loc[:self.data_inter - 1, self._obs_col()] - pre_lower, self.data.loc[:self.data_inter - 1, self._obs_col()] - pre_upper, facecolor='gray', 
interpolate=True, alpha=0.25, ) plt.fill_between( self.data.loc[self.data_inter:].index, self.data.loc[self.data_inter:, self._obs_col()] - post_lower, self.data.loc[self.data_inter:, self._obs_col()] - post_upper, facecolor='gray', interpolate=True, alpha=0.25, ) plt.setp(ax2.get_xticklabels(), visible=False) plt.title('Difference') # Cumulative impact ax3 = plt.subplot(313, sharex=ax1) plt.plot( self.data.loc[self.data_inter:].index, (self.data.loc[self.data_inter:, self._obs_col()] - post_model).cumsum(), 'r--', linewidth=2, ) plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2) plt.axvline(self.data_inter, c='k', linestyle='--') plt.fill_between( self.data.loc[self.data_inter:].index, (self.data.loc[self.data_inter:, self._obs_col()] - post_lower).cumsum(), (self.data.loc[self.data_inter:, self._obs_col()] - post_upper).cumsum(), facecolor='gray', interpolate=True, alpha=0.25, ) plt.axis([self.data.index[0], self.data.index[-1], None, None]) ax3.set_xticklabels(self.data_index) plt.title('Cumulative Impact') plt.xlabel('$T$') plt.show() print('Note: the first {} observations are not shown, due to approximate diffuse initialization'.format(min_t)) def summary_forecast(self)
def run_ucm(name):
    """Check ``UnobservedComponents`` against the stored results called *name*.

    *name* is looked up on ``results_structural``; the resulting mapping
    supplies the model specifications (``'models'``), shared keyword arguments
    (``'kwargs'``), the parameter vector (``'params'``), the reference
    log-likelihood (``'llf'``) and optional ``'rtol'`` / ``'atol'``.
    """
    reference = getattr(results_structural, name)

    # Default cycle period bounds, keyed by the first character of the
    # frequency string
    bounds_by_freq = {
        'A': (1.5, 12),
        'Q': (1.5*4, 12*4),
        'M': (1.5*12, 12*12),
    }

    for spec in reference['models']:
        model_kwargs = spec.copy()
        model_kwargs.update(reference['kwargs'])

        # Work on a private copy of the dataset
        data = dta.copy()

        freq = model_kwargs.pop('freq', None)
        if freq is not None:
            data.index = pd.date_range(start='1959-01-01', periods=len(dta),
                                       freq=freq)

        # Replace the exog placeholder by actual data: a pd.Series by
        # default, or a 1-dim numpy array when the placeholder is 'numpy'
        if 'exog' in model_kwargs:
            exog = np.log(data['realgdp'])
            if model_kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()
            model_kwargs['exog'] = exog

        mod = UnobservedComponents(data['unemp'], **model_kwargs)

        # Smoke test for the starting parameters, then check that
        # transform and untransform are inverses of each other
        mod.start_params
        unconstrained = mod.untransform_params(mod.start_params)
        assert_allclose(mod.start_params,
                        mod.transform_params(unconstrained))

        # Filter at the reference parameters
        res_true = mod.filter(reference['params'])

        # Work out which cycle bounds the model should have computed
        if freq is None:
            freqstr = data.index.freqstr[0]
        else:
            freqstr = freq[0]
        if 'cycle_period_bounds' in model_kwargs:
            period_bounds = model_kwargs['cycle_period_bounds']
        else:
            # With no information on the data frequency, the cycle
            # frequency is required to be between 0 and pi
            period_bounds = bounds_by_freq.get(freqstr, (2, np.inf))

        expected_frequency_bound = (2*np.pi / period_bounds[1],
                                    2*np.pi / period_bounds[0])
        assert_equal(mod.cycle_frequency_bound, expected_frequency_bound)

        # The likelihood at the reference parameters must match
        assert_allclose(res_true.llf, reference['llf'],
                        rtol=reference.get('rtol', 1e-7),
                        atol=reference.get('atol', 0))

        # Smoke test for plot_components
        if have_matplotlib:
            fig = res_true.plot_components()
            plt.close(fig)

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            res = mod.fit(disp=-1)

            # A higher likelihood than the reference is no problem;
            # otherwise we must be very close to the value found by R
            if res.llf <= reference['llf']:
                assert_allclose(res.llf, reference['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
def test_compile_posterior_inferences_w_data(data):
    """Compare every series returned by ``compile_posterior`` with values
    rebuilt by hand from a local level model fitted on the pre-period."""
    pre_start, pre_end = 0, 70
    post_start, post_end = 71, 100
    df_pre = data.loc[pre_start:pre_end, :]
    df_post = data.loc[post_start:post_end, :]
    n_pre = len(df_pre)
    alpha = 0.05

    trained_model = UnobservedComponents(
        endog=df_pre.iloc[:, 0].values,
        level='llevel',
        exog=df_pre.iloc[:, 1:].values,
    ).fit()

    # No post-period response; standardisation parameters are (mean, std)
    inferences = compile_posterior(trained_model, data, df_pre, df_post,
                                   None, alpha, (0., 1.))

    def _post_cumsum(effect, name):
        # Zero over the pre-period, running sum over the post-period
        tail = np.cumsum(effect.iloc[n_pre:])
        return pd.Series(np.concatenate((np.zeros(n_pre), tail)), name=name)

    # Observed response and its running sum
    response = pd.Series(data.iloc[:, 0], name='response')
    assert_series_equal(response, inferences['series']['response'])

    cum_response = pd.Series(np.cumsum(response), name='cum_response')
    assert_series_equal(cum_response, inferences['series']['cum_response'])

    # Point predictions: in-sample over the pre-period, forecast afterwards
    in_sample = trained_model.get_prediction()
    out_of_sample = trained_model.get_forecast(
        steps=len(df_post),
        exog=df_post.iloc[:, 1].values.reshape(-1, 1),
        alpha=alpha)
    point_pred = np.concatenate([in_sample.predicted_mean,
                                 out_of_sample.predicted_mean])

    point_pred_series = pd.Series(point_pred, name='point_pred')
    assert_series_equal(point_pred_series,
                        inferences['series']['point_pred'])

    # Confidence bounds, re-indexed like the input frames
    pre_ci = pd.DataFrame(in_sample.conf_int(alpha=alpha))
    pre_ci.index = df_pre.index
    post_ci = pd.DataFrame(out_of_sample.conf_int(alpha=alpha))
    post_ci.index = df_post.index
    ci = pd.concat([pre_ci, post_ci])

    pred_upper = ci.iloc[:, 1].rename('point_pred_upper')
    pred_lower = ci.iloc[:, 0].rename('point_pred_lower')
    assert_series_equal(pred_upper, inferences['series']['point_pred_upper'])
    assert_series_equal(pred_lower, inferences['series']['point_pred_lower'])

    # Running sums of the predictions and their bounds
    cum_pred = pd.Series(np.cumsum(point_pred), name='cum_pred')
    assert_series_equal(cum_pred, inferences['series']['cum_pred'])

    cum_pred_lower = pd.Series(np.cumsum(pred_lower), name='cum_pred_lower')
    assert_series_equal(cum_pred_lower,
                        inferences['series']['cum_pred_lower'])

    cum_pred_upper = pd.Series(np.cumsum(pred_upper), name='cum_pred_upper')
    assert_series_equal(cum_pred_upper,
                        inferences['series']['cum_pred_upper'])

    # Pointwise effects: response minus prediction (and minus each bound)
    point_effect = pd.Series(response - point_pred_series,
                             name='point_effect')
    assert_series_equal(point_effect, inferences['series']['point_effect'])

    point_effect_lower = pd.Series(response - pred_lower,
                                   name='point_effect_lower')
    assert_series_equal(point_effect_lower,
                        inferences['series']['point_effect_lower'])

    point_effect_upper = pd.Series(response - pred_upper,
                                   name='point_effect_upper')
    assert_series_equal(point_effect_upper,
                        inferences['series']['point_effect_upper'])

    # Cumulative effects only accumulate over the post-period
    assert_series_equal(_post_cumsum(point_effect, 'cum_effect'),
                        inferences['series']['cum_effect'])
    assert_series_equal(_post_cumsum(point_effect_lower, 'cum_effect_lower'),
                        inferences['series']['cum_effect_lower'])
    assert_series_equal(_post_cumsum(point_effect_upper, 'cum_effect_upper'),
                        inferences['series']['cum_effect_upper'])
def construct_model(data, model_args=None):
    """Build the (unfitted) structural time-series model for ``data``.

    The first column of ``data`` is the response and is modelled with a local
    level. Any further columns are passed to the model as static regressors.
    This function only constructs the model; it does not fit it.

    Args:
        data: object exposing ``.iloc`` and ``.columns`` (a pandas DataFrame);
            first column is the response, remaining columns are covariates.
        model_args: optional dict of model arguments. Keys read here:
            ``nseasons`` (treated as 1 when absent), ``season_duration``
            (required when ``nseasons > 1``) and ``dynamic_regression``
            (treated as False when absent). ``None`` means "no arguments".

    Returns:
        An ``UnobservedComponents`` instance.

    Raises:
        NotImplementedError: if covariates are present and
            ``dynamic_regression`` is truthy.
    """
    from statsmodels.tsa.statespace.structural import UnobservedComponents

    # The signature advertises None as the default, so it must not be
    # subscripted directly.
    if model_args is None:
        model_args = {}

    # extract y variable
    y = data.iloc[:, 0]

    # Validate the response series (helper defined elsewhere in this module;
    # its return value is not used here)
    observations_ill_conditioned(y)

    # Model specification: local level on the response
    ss = {"endog": y, "level": "llevel"}

    # Add seasonal component?
    if model_args.get("nseasons", 1) > 1:
        # NOTE(review): this key is forwarded verbatim to
        # UnobservedComponents, whose documented seasonal argument is named
        # ``seasonal`` -- confirm that ``seasonal_period`` is honoured.
        ss["seasonal_period"] = model_args["season_duration"]

    # No regression?
    if len(data.columns) == 1:
        return UnobservedComponents(**ss)

    # Static regression?
    if not model_args.get("dynamic_regression", False):
        ss["exog"] = data.iloc[:, 1:]
        return UnobservedComponents(**ss)

    # Dynamic regression is not supported. For reference, the approach this
    # was ported from makes the regression coefficients time-varying:
    #
    #   prior_mean: precision of random walk of coefficients
    #   sigma_mean_prior = gamma_prior(prior_mean=1, a=4)
    #   ss = add_dynamic_regression(ss, formula, data=data,
    #                               sigma_mean_prior=sigma_mean_prior)
    #   sd_prior = sd_prior(sigma_guess=model_args["prior_level_sd"] * sdy,
    #                       upper_limit=0.1 * sdy,
    #                       sample_size=kDynamicRegressionPriorSampleSize)
    #   bsts_model = Bsts(y, state_specification=ss,
    #                     niter=model_args["niter"], expected_model_size=3,
    #                     ping=0, seed=1, prior=sd_prior)
    raise NotImplementedError("dynamic regression is not implemented")
def run_ucm(name, use_exact_diffuse=False):
    """Check ``UnobservedComponents`` against the stored results called *name*.

    *name* is looked up on ``results_structural``. The resulting mapping
    provides ``'models'``, ``'kwargs'``, ``'params'`` and ``'llf'``, plus the
    optional keys ``'rtol'``, ``'atol'``, ``'maxiter'`` and ``'start_params'``.

    Parameters
    ----------
    name : str
        Attribute of ``results_structural`` holding the reference results.
    use_exact_diffuse : bool, optional
        Forwarded to the model, and selects how the log-likelihood is
        adjusted before it is compared with the reference value.
    """
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])
        kwargs['use_exact_diffuse'] = use_exact_diffuse

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01', periods=len(dta),
                                         freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()

            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        roundtrip = mod.transform_params(
            mod.untransform_params(mod.start_params))
        assert_allclose(mod.start_params, roundtrip)

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5 * 4, 12 * 4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5 * 12, 12 * 12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        assert_equal(mod.cycle_frequency_bound,
                     (2 * np.pi / cycle_period_bounds[1],
                      2 * np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)

        if use_exact_diffuse:
            # If we are using exact diffuse initialization, then we need to
            # adjust for the fact that KFAS does not include the constant in
            # the likelihood function for the diffuse periods
            # (see note to test_exact_diffuse_filtering.py for details).
            res_llf = (res_true.llf_obs.sum()
                       + res_true.nobs_diffuse * 0.5 * np.log(2 * np.pi))
        else:
            # If we are using approximate diffuse initialization, then we need
            # to ignore the first period, and this will agree with KFAS (since
            # it does not include the constant in the likelihood function for
            # diffuse periods).
            res_llf = res_true.llf_obs[res_true.loglikelihood_burn:].sum()

        assert_allclose(res_llf, true['llf'], rtol=rtol, atol=atol)

        # Optional smoke test for plot_components
        try:
            import matplotlib.pyplot as plt
            try:
                from pandas.plotting import register_matplotlib_converters
                register_matplotlib_converters()
            except ImportError:
                pass
            fig = plt.figure()
            try:
                res_true.plot_components(fig=fig)
            finally:
                # One figure is created per model in this loop; close it so
                # figures do not pile up over the test run.
                plt.close(fig)
        except ImportError:
            pass

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            fit_kwargs = {}
            if 'maxiter' in true:
                fit_kwargs['maxiter'] = true['maxiter']
            res = mod.fit(start_params=true.get('start_params', None),
                          disp=-1, **fit_kwargs)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R

            # See note above about these computation
            if use_exact_diffuse:
                res_llf = (res.llf_obs.sum()
                           + res.nobs_diffuse * 0.5 * np.log(2 * np.pi))
            else:
                res_llf = res.llf_obs[res_true.loglikelihood_burn:].sum()

            if res_llf <= true['llf']:
                assert_allclose(res_llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
class CausalImpact:
    """
    Causal inference through counterfactual predictions using a Bayesian structural time-series model.
    """

    def __init__(self, data, inter_date, model_args=None):
        """Main constructor.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of same type of the data index elements.
            This should usually be int of datetime.date
        :param {str: object} model_args: parameters of the model
            > max_iter: number of samples in the MCMC sampling
            > n_seasons: number of seasons in the seasonal component of the BSTS model
        """
        # Publicly exposed attributes
        self.data = None            # Input data, with a reset index
        self.data_index = None      # Data initial index
        self.data_inter = None      # Data intervention date, relative to the reset index
        self.model_args = None      # BSTS model arguments
        self.result = None          # Input data (initial index kept as a column) plus the estimate columns

        # Private attributes for modeling purposes only
        self._model = None          # statsmodels BSTS model
        self._fit = None            # statsmodels BSTS fitted model

        # Checking input arguments
        self._check_input(data, inter_date)
        self._check_model_args(data, model_args)

    def _check_input(self, data, inter_date):
        """Check input data and locate the intervention date.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of same type of the data index elements.
            This should usually be int of datetime.date
        :raises ValueError: if ``inter_date`` is not an element of the data index
        """
        self.data_index = data.index
        self.data = data.reset_index(drop=True)
        try:
            # Position of the intervention, i.e. its label in the reset index
            self.data_inter = self.data_index.tolist().index(inter_date)
        except ValueError:
            raise ValueError('Input intervention date could not be found in data index.') from None
        self.result = data.reset_index(drop=False)

    def _check_model_args(self, data, model_args):
        """Check input arguments, and add missing ones if needed.

        The merged arguments are stored in ``self.model_args``; the dict passed
        by the caller is left untouched.

        :param pandas.DataFrame data: input data (not used by this check)
        :param {str: object} model_args: user-supplied model arguments, or None
        :raises ValueError: if there are fewer pre-intervention samples than seasons
        """
        # Start from the defaults and overlay the user's values on a copy, so
        # that the caller's dict is not modified as a side effect.
        merged_args = dict(DEFAULT_ARGS)
        if model_args is not None:
            merged_args.update(model_args)

        # n_seasons may be None (plot() handles that case), so only compare
        # when a number of seasons was actually given.
        n_seasons = merged_args['n_seasons']
        if n_seasons is not None and self.data_inter < n_seasons:
            raise ValueError('Training data contains fewer samples than number of seasons in BSTS model.')

        self.model_args = merged_args

    def run(self, return_df=False):
        """Fit the BSTS model to the data.

        The model is fitted on the rows before the intervention only, then the
        prediction, difference and cumulative columns are added to ``self.result``.

        :param bool return_df: whether to return the result dataframe
        :return: ``self.result`` if ``return_df`` is true, otherwise None
        """
        self._model = UnobservedComponents(
            self.data.loc[:self.data_inter - 1, self._obs_col()].values,
            exog=self.data.loc[:self.data_inter - 1, self._reg_cols()].values,
            level='local linear trend',
            seasonal=self.model_args['n_seasons'],
        )
        self._fit = self._model.fit(
            maxiter=self.model_args['max_iter'],
        )
        self._get_estimates()
        self._get_difference_estimates()
        self._get_cumulative_estimates()

        if return_df:
            return self.result

    def _get_estimates(self):
        """Extracting model estimate (before and after intervention) as well as 95% confidence interval.
        """
        # Left: model before date of intervention (allows to evaluate fit quality)
        lpred = self._fit.get_prediction()
        # Right: best prediction of y without any intervention
        rpred = self._fit.get_forecast(
            steps=self.data.shape[0] - self.data_inter,
            exog=self.data.loc[self.data_inter:, self._reg_cols()]
        )

        # Model prediction
        self.result = self.result.assign(pred=np.concatenate([lpred.predicted_mean, rpred.predicted_mean]))

        # 95% confidence interval
        lower_conf_ints = []
        upper_conf_ints = []
        for pred in [lpred, rpred]:
            conf_int = pred.conf_int()
            if isinstance(conf_int, np.ndarray):
                # As of 0.9.0, statsmodels returns a np.ndarray here
                lower_conf_ints.append(conf_int[:, 0])
                upper_conf_ints.append(conf_int[:, 1])
            else:
                # instead of a dataframe with "lower y" and "upper y" columns
                lower_conf_ints.append(conf_int.loc[:, 'lower y'].values)
                upper_conf_ints.append(conf_int.loc[:, 'upper y'].values)

        self.result = self.result.assign(pred_conf_int_lower=np.concatenate(lower_conf_ints))
        self.result = self.result.assign(pred_conf_int_upper=np.concatenate(upper_conf_ints))

    def _get_difference_estimates(self):
        """Extracting the difference between the model prediction and the actuals, as well as the related 95%
        confidence interval.
        """
        # Difference between actuals and model
        self.result = self.result.assign(pred_diff=self.data[self._obs_col()].values - self.result['pred'])
        # Confidence interval of the difference: subtracting the upper bound
        # of the prediction gives the lower bound of the difference, and
        # vice versa
        self.result = self.result.assign(
            pred_diff_conf_int_lower=self.data[self._obs_col()] - self.result['pred_conf_int_upper']
        )
        self.result = self.result.assign(
            pred_diff_conf_int_upper=self.data[self._obs_col()] - self.result['pred_conf_int_lower']
        )

    def _get_cumulative_estimates(self):
        """Extracting estimate of the cumulative impact of the intervention, and its 95% confidence interval.
        """
        # Cumulative sum of modeled impact. The columns are seeded with float
        # zeros: the values written below are floats, and writing floats into
        # an integer column relies on a silent upcast that recent pandas
        # versions deprecate.
        self.result = self.result.assign(cum_impact=0.0)
        self.result.loc[self.data_inter:, 'cum_impact'] = (
            self.data[self._obs_col()] - self.result['pred']
        ).loc[self.data_inter:].cumsum()

        # Confidence interval of the cumulative sum: square root of the
        # running sum of squared pointwise half-widths
        radius_cumsum = np.sqrt(
            ((self.result['pred'] - self.result['pred_conf_int_lower']).loc[self.data_inter:] ** 2).cumsum()
        )
        self.result = self.result.assign(cum_impact_conf_int_lower=0.0, cum_impact_conf_int_upper=0.0)
        self.result.loc[self.data_inter:, 'cum_impact_conf_int_lower'] = \
            self.result['cum_impact'].loc[self.data_inter:] - radius_cumsum
        self.result.loc[self.data_inter:, 'cum_impact_conf_int_upper'] = \
            self.result['cum_impact'].loc[self.data_inter:] + radius_cumsum

    def _obs_col(self):
        """Get name of column to be modeled in input data.

        :return: column name
        :rtype: str
        """
        return 'y'

    def _reg_cols(self):
        """Get names of columns used in the regression component of the model.

        :return: the column names
        :rtype: pandas.indexes.base.Index
        """
        return self.data.columns.difference([self._obs_col()])

    def plot_components(self):
        """Plot the estimated components of the model.
        """
        self._fit.plot_components(figsize=(15, 9), legend_loc='lower right')
        plt.show()

    def plot(self):
        """Produce final impact plots.
        Note: the first few observations are not shown due to approximate diffuse initialization.
        """
        # Number of leading observations left out of the first two panels
        min_t = 2 if self.model_args['n_seasons'] is None else self.model_args['n_seasons'] + 1

        plt.figure(figsize=(15, 12))

        # Observation and regression components
        ax1 = plt.subplot(3, 1, 1)
        for col in self._reg_cols():
            plt.plot(self.data[col], label=col)
        plt.plot(self.result['pred'].iloc[min_t:], 'r--', linewidth=2, label='model')
        plt.plot(self.data[self._obs_col()], 'k', linewidth=2, label=self._obs_col())
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_conf_int_lower'].iloc[min_t:],
            self.result['pred_conf_int_upper'].iloc[min_t:],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.legend(loc='upper left')
        plt.title('Observation vs prediction')

        # Pointwise difference
        ax2 = plt.subplot(312, sharex=ax1)
        plt.plot(self.result['pred_diff'].iloc[min_t:], 'r--', linewidth=2)
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_diff_conf_int_lower'].iloc[min_t:],
            self.result['pred_diff_conf_int_upper'].iloc[min_t:],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax2.get_xticklabels(), visible=False)
        plt.title('Difference')

        # Cumulative impact
        ax3 = plt.subplot(313, sharex=ax1)
        plt.plot(self.data.index, self.result['cum_impact'], 'r--', linewidth=2)
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index,
            self.result['cum_impact_conf_int_lower'],
            self.result['cum_impact_conf_int_upper'],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.axis([self.data.index[0], self.data.index[-1], None, None])
        # NOTE(review): the tick labels are set before locator_params changes
        # the number of ticks -- confirm the labels still line up with the
        # tick positions.
        ax3.set_xticklabels(self.data_index, rotation=45)
        plt.locator_params(axis='x', nbins=min(12, self.data.shape[0]))
        plt.title('Cumulative Impact')
        plt.xlabel('$T$')
        plt.show()
def test_matrices_somewhat_complicated_model():
    """Compare the state space matrices of a model combining a local linear
    trend, two frequency-domain seasonals and a damped stochastic cycle with
    hand-built expectations."""
    data = dta.copy()

    model = UnobservedComponents(
        data['unemp'],
        level='lltrend',
        freq_seasonal=[{'period': 4}, {'period': 9, 'harmonics': 3}],
        cycle=True,
        cycle_period_bounds=[2, 30],
        damped_cycle=True,
        stochastic_freq_seasonal=[True, False],
        stochastic_cycle=True,
    )

    # Selected parameters, in model order
    irregular_var = 1
    level_var, trend_var = 3, 4
    freq_seasonal_var_0 = 5
    cycle_var, cycle_freq, cycle_damp = 6, 2*np.pi/30., .9
    params = [irregular_var, level_var, trend_var, freq_seasonal_var_0,
              cycle_var, cycle_freq, cycle_damp]
    model.update(params)

    # Scalar properties
    n_states = 2 + 4 + 6 + 2
    assert_equal(model.k_states, n_states)
    assert_equal(model.k_state_cov, 2 + 1 + 0 + 1)
    assert_equal(model.loglikelihood_burn, n_states)
    assert_allclose(model.ssm.k_posdef, 2 + 4 + 0 + 2)
    assert_equal(model.k_params, len(params))

    def _rotation(theta):
        # 2x2 block [[cos, sin], [-sin, cos]] for the angle theta
        return np.array([[np.cos(theta), np.sin(theta)],
                         [-np.sin(theta), np.cos(theta)]])

    # Design matrix: one group of (1, 0) pairs per component
    design_groups = [
        [1, 0],                 # trend
        [1, 0, 1, 0],           # seasonal, period 4
        [1, 0, 1, 0, 1, 0],     # seasonal, period 9 with 3 harmonics
        [1, 0],                 # cycle
    ]
    expected_design = np.concatenate(design_groups).reshape(1, 14)
    assert_allclose(model.ssm.design[:, :, 0], expected_design)

    # Transition matrix, assembled component by component
    trend_block = np.array([[1, 1],
                            [0, 1]])
    seasonal_4_block = np.array([[0, 1, 0, 0],
                                 [-1, 0, 0, 0],
                                 [0, 0, -1, 0],
                                 [0, 0, 0, -1]])
    seasonal_9_block = np.zeros((6, 6))
    harmonic_angles = [2*np.pi*1/9., 2*np.pi*2/9., 2*np.pi/3.]
    for j, theta in enumerate(harmonic_angles):
        lo, hi = 2 * j, 2 * j + 2
        seasonal_9_block[lo:hi, lo:hi] = _rotation(theta)
    cycle_block = .9 * _rotation(2*np.pi/30.)

    expected_transition = __direct_sum([trend_block, seasonal_4_block,
                                        seasonal_9_block, cycle_block])
    assert_allclose(
        model.ssm.transition[:, :, 0], expected_transition, atol=1e-7)

    # Since the second seasonal term is not stochastic,
    # the dimensionality of the state disturbance is 14 - 6 = 8
    expected_selection = np.zeros((14, 14 - 6))
    expected_selection[-2:, -2:] = np.eye(2)
    expected_selection[2:6, 2:6] = np.eye(4)
    expected_selection[0:2, 0:2] = np.eye(2)
    assert_allclose(model.ssm.selection[:, :, 0], expected_selection)

    expected_state_cov = __direct_sum([
        np.diag(params[1:3]),
        np.eye(4)*params[3],
        np.eye(2)*params[4],
    ])
    assert_allclose(model.ssm.state_cov[:, :, 0], expected_state_cov)
def fit(self, X, y=None):
    """Fit the time-series model selected by ``self.mode`` to ``X``.

    ``self.mode`` is a base mode optionally followed by suffixes:

    * ``*f``  -- cap ``X`` at its 75th percentile before anything else;
    * ``*ln`` -- then apply ``log(x + 1)``;
    * ``*bc`` -- or apply a Box-Cox transform (only when ``*ln`` is absent).

    Base modes handled here: ``ll``, ``lla``, ``lls``, ``llt``, ``llc``,
    ``arima`` and ``rw1``. Any other base mode fits nothing.

    Sets ``self.X`` (the preprocessed data), ``self.k_exog``,
    ``self.converged`` and, for the base modes listed above, ``self.res_``.
    ``y`` is accepted but not used.

    Returns ``self``.
    """
    # Defaults, so that both attributes exist whichever path is taken below
    self.converged = False
    self.k_exog = None

    # Perform top percentile ceiling
    self.X = X
    mode = self.mode
    if '*f' in self.mode:
        self.X = np.minimum(X, np.percentile(X, 75))
        # Strip the suffix from the mode parsed so far (not from self.mode),
        # so that several suffixes can be combined in any order
        mode = mode.partition('*f')[0]

    # Perform transformation if specified by *transformation. The transform
    # is applied to self.X so that a preceding ceiling is not discarded.
    if '*ln' in self.mode:
        self.X = np.log(np.array(self.X) + 1)
        mode = mode.partition('*ln')[0]
    elif '*bc' in self.mode:
        transformer = pm.preprocessing.BoxCoxEndogTransformer()
        # NOTE(review): pmdarima endog transformers appear to return a
        # (y, X) pair from fit_transform -- confirm what self.X should hold.
        self.X = transformer.fit_transform(y=self.X)
        self.transformer = transformer
        mode = mode.partition('*bc')[0]

    try:
        if mode == 'll':  # Local Level
            model = LocalLevel(self.X)
            self.res_ = model.fit(disp=False)
        elif mode == 'lla':
            # Regress each observation on its two predecessors.
            # NOTE(review): this branch reads the raw X, so the ceiling and
            # transform above do not apply to it -- confirm this is intended.
            endog = X[2:]
            exog = np.column_stack((X[1:-1], X[:-2]))
            self.k_exog = exog.shape[1]
            model = UnobservedComponents(endog=endog, exog=exog, level='local level')
            self.res_ = model.fit(disp=False)
        elif mode == 'lls':
            model = SARIMAX(endog=self.X, order=(2, 0, 0), trend='c', measurement_error=True)
            self.res_ = model.fit(disp=False)
        elif mode == 'llt':  # Local Linear Trend
            model = UnobservedComponents(endog=self.X, level='local linear trend')
            self.res_ = model.fit(disp=False)
        elif mode == 'llc':  # Local Level Cycle
            model = UnobservedComponents(endog=self.X, level='local level',
                                         cycle=True, stochastic_cycle=True)
            self.res_ = model.fit(disp=False)
        elif mode == 'arima':
            self.res_ = pm.auto_arima(self.X, start_p=1, start_q=1, start_P=1, start_Q=1,
                                      max_p=5, max_q=5, max_P=5, max_Q=5, seasonal=True,
                                      stepwise=True, suppress_warnings=True, D=10, max_D=10,
                                      error_action='ignore')
        elif mode == 'rw1':  # For RW model
            self.res_ = None
    except np.linalg.LinAlgError:
        # Some kalman filter error ==> Use random walk
        print(f'Convergence failed for {mode}')
        self.converged = False
        return self

    try:
        self.converged = self.res_.mle_retvals['converged']
    except AttributeError:
        # No usable mle_retvals (e.g. res_ is None for 'rw1'): keep the
        # default, except for auto ARIMA
        if mode == 'arima':
            self.converged = True  # auto ARIMA from pmdarima should always converge
    return self
def test_default_model_fit(rand_data, pre_int_period, post_int_period, monkeypatch):
    """Check the keyword arguments ``CausalImpact`` hands to the default
    model's ``fit`` for several combinations of user options."""
    model_target = 'causalimpact.main.CausalImpact._get_default_model'
    inferences_target = (
        'causalimpact.main.CausalImpact._process_posterior_inferences')
    free = (None, None)

    pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1], :]
    fit_mock = mock.Mock()

    def _use_model(candidate):
        # Replace fit by the shared mock and make CausalImpact build
        # ``candidate`` as its default model
        candidate.fit = fit_mock
        monkeypatch.setattr(model_target, mock.Mock(return_value=candidate))
        return candidate

    # Local level model with regressors
    model = _use_model(UnobservedComponents(endog=pre_data.iloc[:, 0],
                                            level='llevel',
                                            exog=pre_data.iloc[:, 1:]))
    monkeypatch.setattr(inferences_target, mock.Mock())

    CausalImpact(rand_data, pre_int_period, post_int_period)
    model.fit.assert_called_with(
        bounds=[free, (0.01 / 1.2, 0.012), free, free],
        disp=False, nseasons=[], standardize=True)

    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True)
    model.fit.assert_called_with(
        bounds=[free, (0.01 / 1.2, 0.012), free, free],
        disp=True, nseasons=[], standardize=True)

    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True,
                 prior_level_sd=0.1)
    model.fit.assert_called_with(
        bounds=[free, (0.1 / 1.2, 0.1 * 1.2), free, free],
        disp=True, prior_level_sd=0.1, nseasons=[], standardize=True)

    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True,
                 prior_level_sd=None)
    model.fit.assert_called_with(
        bounds=[free, free, free, free],
        disp=True, prior_level_sd=None, nseasons=[], standardize=True)

    # Same model with an additional seasonal component
    model = _use_model(UnobservedComponents(endog=pre_data.iloc[:, 0],
                                            level='llevel',
                                            exog=pre_data.iloc[:, 1:],
                                            freq_seasonal=[{'period': 3}]))
    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True,
                 prior_level_sd=0.001, nseasons=[{'period': 3}])
    model.fit.assert_called_with(
        bounds=[free, (0.001 / 1.2, 0.001 * 1.2), free, free, free],
        disp=True, prior_level_sd=0.001, nseasons=[{'period': 3}],
        standardize=True)

    # Local level model without regressors, on single-column data
    model = _use_model(UnobservedComponents(endog=pre_data.iloc[:, 0],
                                            level='llevel'))
    new_data = pd.DataFrame(np.random.randn(200, 1), columns=['y'])
    CausalImpact(new_data, pre_int_period, post_int_period, disp=False)
    model.fit.assert_called_with(
        bounds=[free, (0.01 / 1.2, 0.01 * 1.2)],
        disp=False, nseasons=[], standardize=True)