# Usage examples for statsmodels' UnobservedComponents, collected from several
# source files. Most snippets assume imports along the following lines;
# individual examples may additionally rely on fixtures and helpers from their
# original modules (e.g. `dta`, `macrodata`, `results_structural`,
# `compile_posterior`, `CausalImpact`, `rand_data`):
import warnings
from unittest import mock

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec as gs
from numpy.testing import assert_allclose, assert_equal
from pandas.testing import assert_series_equal

from statsmodels.tsa.statespace.structural import UnobservedComponents
from statsmodels.tsa.statespace.kalman_smoother import KalmanSmoother
from statsmodels.tsa.statespace.initialization import Initialization


def model_local_linear_trend(endog=None, params=None, direct=False):
    if endog is None:
        y1 = 10.2394
        y2 = 4.2039
        y3 = 6.123123
        endog = np.r_[y1, y2, y3, [1] * 7]
    if params is None:
        params = [1.993, 8.253, 2.334]
    sigma2_y, sigma2_mu, sigma2_beta = params

    if direct:
        mod = None
        # Construct the basic representation
        ssm = KalmanSmoother(k_endog=1, k_states=2, k_posdef=2)
        ssm.bind(endog)
        init = Initialization(ssm.k_states, initialization_type='diffuse')
        ssm.initialize(init)
        # ssm.filter_univariate = True  # should not be required

        # Fill in the system matrices for a local level model
        ssm['design', 0, 0] = 1
        ssm['obs_cov', 0, 0] = sigma2_y
        ssm['transition'] = np.array([[1, 1],
                                      [0, 1]])
        ssm['selection'] = np.eye(2)
        ssm['state_cov'] = np.diag([sigma2_mu, sigma2_beta])
    else:
        mod = UnobservedComponents(endog, 'lltrend')
        mod.update(params)
        ssm = mod.ssm
        ssm.initialize(Initialization(ssm.k_states, 'diffuse'))

    return mod, ssm


def model_local_level(endog=None, params=None, direct=False):
    if endog is None:
        y1 = 10.2394
        endog = np.r_[y1, [1] * 9]
    if params is None:
        params = [1.993, 8.253]
    sigma2_y, sigma2_mu = params

    if direct:
        mod = None
        # Construct the basic representation
        ssm = KalmanSmoother(k_endog=1, k_states=1, k_posdef=1)
        ssm.bind(endog)
        init = Initialization(ssm.k_states, initialization_type='diffuse')
        ssm.initialize(init)
        # ssm.filter_univariate = True  # should not be required

        # Fill in the system matrices for a local level model
        ssm['design', :] = 1
        ssm['obs_cov', :] = sigma2_y
        ssm['transition', :] = 1
        ssm['selection', :] = 1
        ssm['state_cov', :] = sigma2_mu
    else:
        mod = UnobservedComponents(endog, 'llevel')
        mod.update(params)
        ssm = mod.ssm
        ssm.initialize(Initialization(ssm.k_states, 'diffuse'))

    return mod, ssm
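
# A minimal usage sketch for the two helpers above (an illustration added to
# this listing, not part of the original tests): build the same local level
# model both directly and through the UnobservedComponents wrapper, then
# check that the smoothed states agree.
def example_compare_direct_and_wrapper():
    _, ssm_wrap = model_local_level(direct=False)
    _, ssm_direct = model_local_level(direct=True)
    res_wrap = ssm_wrap.smooth()
    res_direct = ssm_direct.smooth()
    assert_allclose(res_direct.smoothed_state, res_wrap.smoothed_state)
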
def test_irrelevant_state():
    # This test records a case in which exact diffuse initialization leads to
    # numerical problems, because the existence of an irrelevant state
    # initialized as diffuse means that there is never a transition to the
    # usual Kalman filter.
    endog = macrodata.infl

    spec = {
        'freq_seasonal': [{'period': 8, 'harmonics': 6},
                          {'period': 36, 'harmonics': 6}]
    }

    # Approximate diffuse version
    mod = UnobservedComponents(endog, 'llevel', **spec)
    mod.ssm.initialization = Initialization(mod.k_states,
                                            'approximate_diffuse')
    res = mod.smooth([3.4, 7.2, 0.01, 0.01])

    # Exact diffuse version
    mod2 = UnobservedComponents(endog, 'llevel', **spec)
    mod2.ssm.filter_univariate = True
    mod2.ssm.initialization = Initialization(mod2.k_states, 'diffuse')
    res2 = mod2.smooth([3.4, 7.2, 0.01, 0.01])

    # Check that e.g. the filtered state for the level is equal once the
    # initial observations (those still affected by the diffuse
    # initialization) are excluded
    assert_allclose(res.filtered_state[0, 25:],
                    res2.filtered_state[0, 25:], atol=1e-5)
Example 4
def test_forecast():
    endog = np.arange(50) + 10
    exog = np.arange(50)

    mod = UnobservedComponents(endog, exog=exog, level='dconstant')
    res = mod.smooth([1e-15, 1])

    actual = res.forecast(10, exog=np.arange(50, 60)[:, np.newaxis])
    desired = np.arange(50, 60) + 10
    assert_allclose(actual, desired)
Example 5
def test_mle_reg():
    endog = np.arange(100)*1.0
    exog = endog*2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog, irregular=True, exog=exog,
                                    mle_regression=False)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog, irregular=True, exog=exog,
                                    mle_regression=True)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1], 0.5,
                    atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)
Example 6
def test_start_params():
    # Test that the behavior is correct for multiple exogenous and / or
    # autoregressive components

    # Parameters
    nobs = int(1e4)
    beta = np.r_[10, -2]
    phi = np.r_[0.5, 0.1]

    # Generate data
    np.random.seed(1234)
    exog = np.c_[np.ones(nobs), np.arange(nobs)*1.0]
    eps = np.random.normal(size=nobs)
    endog = np.zeros(nobs+2)
    for t in range(1, nobs):
        endog[t+1] = phi[0] * endog[t] + phi[1] * endog[t-1] + eps[t]
    endog = endog[2:]
    endog += np.dot(exog, beta)
    
    # Now just test that the starting parameters are approximately what they
    # ought to be (could make this arbitrarily precise by increasing nobs,
    # but that would slow down the test for no real gain)
    mod = UnobservedComponents(endog, exog=exog, autoregressive=2)
    assert_allclose(mod.start_params, [1., 0.5, 0.1, 10, -2], atol=1e-1)
Example 7
def test_mle_reg():
    endog = np.arange(100)*1.0
    exog = endog*2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=False)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog, irregular=True,
                                    exog=exog, mle_regression=True)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1],
                    0.5,
                    atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)


def test_apply_results():
    endog = np.arange(100)
    exog = np.ones_like(endog)
    params = [1., 1., 0.1, 1.]

    mod1 = UnobservedComponents(endog[:50], 'llevel', exog=exog[:50])
    res1 = mod1.smooth(params)

    mod2 = UnobservedComponents(endog[50:], 'llevel', exog=exog[50:])
    res2 = mod2.smooth(params)

    res3 = res2.apply(endog[:50], exog=exog[:50])

    assert_equal(res1.specification, res3.specification)

    for attr in [
            'nobs', 'llf', 'llf_obs', 'loglikelihood_burn',
            'cov_params_default'
    ]:
        assert_equal(getattr(res3, attr), getattr(res1, attr))

    for attr in [
            'filtered_state', 'filtered_state_cov', 'predicted_state',
            'predicted_state_cov', 'forecasts', 'forecasts_error',
            'forecasts_error_cov', 'standardized_forecasts_error',
            'forecasts_error_diffuse_cov', 'predicted_diffuse_state_cov',
            'scaled_smoothed_estimator', 'scaled_smoothed_estimator_cov',
            'smoothing_error', 'smoothed_state', 'smoothed_state_cov',
            'smoothed_state_autocov', 'smoothed_measurement_disturbance',
            'smoothed_state_disturbance',
            'smoothed_measurement_disturbance_cov',
            'smoothed_state_disturbance_cov'
    ]:
        assert_equal(getattr(res3, attr), getattr(res1, attr))

    assert_allclose(res3.forecast(10, exog=np.ones(10)),
                    res1.forecast(10, exog=np.ones(10)))
Example 9
def test_mle_reg(use_exact_diffuse):
    endog = np.arange(100) * 1.0
    exog = endog * 2
    # Make the fit not-quite-perfect
    endog[::2] += 0.01
    endog[1::2] -= 0.01

    with warnings.catch_warnings(record=True):
        mod1 = UnobservedComponents(endog,
                                    irregular=True,
                                    exog=exog,
                                    mle_regression=False,
                                    use_exact_diffuse=use_exact_diffuse)
        res1 = mod1.fit(disp=-1)

        mod2 = UnobservedComponents(endog,
                                    irregular=True,
                                    exog=exog,
                                    mle_regression=True,
                                    use_exact_diffuse=use_exact_diffuse)
        res2 = mod2.fit(disp=-1)

    assert_allclose(res1.regression_coefficients.filtered[0, -1],
                    0.5,
                    atol=1e-5)
    assert_allclose(res2.params[1], 0.5, atol=1e-5)

    # When the regression component is part of the state vector with exact
    # diffuse initialization, we have two diffuse observations
    if use_exact_diffuse:
        assert_equal(res1.nobs_diffuse, 2)
        assert_equal(res2.nobs_diffuse, 0)
    else:
        assert_equal(res1.loglikelihood_burn, 1)
        assert_equal(res2.loglikelihood_burn, 0)
Example 10
def test_recreate_model():
    nobs = 100
    endog = np.ones(nobs) * 2.0
    exog = np.ones(nobs)

    levels = [
        'irregular', 'ntrend', 'fixed intercept', 'deterministic constant',
        'dconstant', 'local level', 'llevel', 'random walk', 'rwalk',
        'fixed slope', 'deterministic trend', 'dtrend',
        'local linear deterministic trend', 'lldtrend',
        'random walk with drift', 'rwdrift', 'local linear trend',
        'lltrend', 'smooth trend', 'strend', 'random trend', 'rtrend']

    for level in levels:
        # Note: have to add in some stochastic component, otherwise we have
        # problems with entirely deterministic models

        # level + stochastic seasonal
        mod = UnobservedComponents(endog, level=level, seasonal=2,
                                   stochastic_seasonal=True, exog=exog)
        mod2 = UnobservedComponents(endog, exog=exog, **mod._get_init_kwds())
        check_equivalent_models(mod, mod2)

        # level + autoregressive
        mod = UnobservedComponents(endog, level=level, exog=exog,
                                   autoregressive=1)
        mod2 = UnobservedComponents(endog, exog=exog, **mod._get_init_kwds())
        check_equivalent_models(mod, mod2)

        # level + stochastic cycle
        mod = UnobservedComponents(endog, level=level, exog=exog,
                                   cycle=True, stochastic_cycle=True,
                                   damped_cycle=True)
        mod2 = UnobservedComponents(endog, exog=exog, **mod._get_init_kwds())
        check_equivalent_models(mod, mod2)
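
# check_equivalent_models is defined elsewhere in the original test module.
# A sketch of a compatible helper (the exact attributes compared here are an
# assumption):
def check_equivalent_models(mod, mod2):
    # Same dimensions and parameterization...
    assert_equal(mod2.k_states, mod.k_states)
    assert_equal(mod2.k_params, mod.k_params)
    assert_equal(mod2.ssm.k_posdef, mod.ssm.k_posdef)
    # ...and identical construction arguments
    assert_equal(mod2._get_init_kwds(), mod._get_init_kwds())
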
Example 11
def test_matrices_somewhat_complicated_model():
    values = dta.copy()

    model = UnobservedComponents(values['unemp'],
                                 level='lltrend',
                                 freq_seasonal=[{'period': 4},
                                                {'period': 9, 'harmonics': 3}],
                                 cycle=True,
                                 cycle_period_bounds=[2, 30],
                                 damped_cycle=True,
                                 stochastic_freq_seasonal=[True, False],
                                 stochastic_cycle=True
                                 )
    # Selected parameters
    params = [1,  # irregular_var
              3, 4,  # lltrend parameters:  level_var, trend_var
              5,   # freq_seasonal parameters: freq_seasonal_var_0
              # cycle parameters: cycle_var, cycle_freq, cycle_damp
              6, 2*np.pi/30., .9
              ]
    model.update(params)

    # Check scalar properties
    assert_equal(model.k_states, 2 + 4 + 6 + 2)
    assert_equal(model.k_state_cov, 2 + 1 + 0 + 1)
    assert_equal(model.loglikelihood_burn, 2 + 4 + 6 + 2)
    assert_allclose(model.ssm.k_posdef, 2 + 4 + 0 + 2)
    assert_equal(model.k_params, len(params))

    # Check the statespace model matrices against hand-constructed answers
    # We group the terms by the component
    expected_design = np.r_[[1, 0],
                            [1, 0, 1, 0],
                            [1, 0, 1, 0, 1, 0],
                            [1, 0]].reshape(1, 14)
    assert_allclose(model.ssm.design[:, :, 0], expected_design)

    expected_transition = __direct_sum([
        np.array([[1, 1],
                  [0, 1]]),
        np.array([[0, 1, 0, 0],
                  [-1, 0, 0, 0],
                  [0, 0, -1,  0],
                  [0, 0,  0, -1]]),
        np.array([[np.cos(2*np.pi*1/9.), np.sin(2*np.pi*1/9.), 0, 0, 0, 0],
                  [-np.sin(2*np.pi*1/9.), np.cos(2*np.pi*1/9.), 0, 0, 0, 0],
                  [0, 0,  np.cos(2*np.pi*2/9.), np.sin(2*np.pi*2/9.), 0, 0],
                  [0, 0, -np.sin(2*np.pi*2/9.), np.cos(2*np.pi*2/9.), 0, 0],
                  [0, 0, 0, 0,  np.cos(2*np.pi/3.), np.sin(2*np.pi/3.)],
                  [0, 0, 0, 0, -np.sin(2*np.pi/3.), np.cos(2*np.pi/3.)]]),
        np.array([[.9*np.cos(2*np.pi/30.), .9*np.sin(2*np.pi/30.)],
                 [-.9*np.sin(2*np.pi/30.), .9*np.cos(2*np.pi/30.)]])
    ])
    assert_allclose(
        model.ssm.transition[:, :, 0], expected_transition, atol=1e-7)

    # Since the second seasonal term is not stochastic,
    # the dimensionality of the state disturbance is 14 - 6 = 8
    expected_selection = np.zeros((14, 14 - 6))
    expected_selection[0:2, 0:2] = np.eye(2)
    expected_selection[2:6, 2:6] = np.eye(4)
    expected_selection[-2:, -2:] = np.eye(2)
    assert_allclose(model.ssm.selection[:, :, 0], expected_selection)

    expected_state_cov = __direct_sum([
        np.diag(params[1:3]),
        np.eye(4)*params[3],
        np.eye(2)*params[4]
    ])
    assert_allclose(model.ssm.state_cov[:, :, 0], expected_state_cov)
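
# __direct_sum is defined elsewhere in the original test module. A sketch of
# a compatible implementation for square blocks (scipy.linalg.block_diag
# computes the same thing):
def __direct_sum(square_matrices):
    """Stack square matrices into a block-diagonal (direct sum) matrix."""
    n = sum(m.shape[0] for m in square_matrices)
    out = np.zeros((n, n))
    offset = 0
    for m in square_matrices:
        k = m.shape[0]
        out[offset:offset + k, offset:offset + k] = m
        offset += k
    return out
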
Example 12
def run_ucm(name):
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01', periods=len(dta),
                                         freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()

            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        roundtrip = mod.transform_params(
            mod.untransform_params(mod.start_params))
        assert_allclose(mod.start_params, roundtrip)

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5*4, 12*4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5*12, 12*12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        assert_equal(mod.cycle_frequency_bound,
                     (2*np.pi / cycle_period_bounds[1],
                      2*np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)
        assert_allclose(res_true.llf, true['llf'], rtol=rtol, atol=atol)

        # Optional smoke test for plot_components
        try:
            import matplotlib.pyplot as plt
            try:
                from pandas.plotting import register_matplotlib_converters
                register_matplotlib_converters()
            except ImportError:
                pass
            fig = plt.figure()
            res_true.plot_components(fig=fig)
        except ImportError:
            pass

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            res = mod.fit(disp=-1)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R
            if res.llf <= true['llf']:
                assert_allclose(res.llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
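
# In the original test module, run_ucm is driven by thin wrappers, roughly
# like the following (the specification name here is illustrative):
def test_irregular():
    run_ucm('irregular')
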
Example 13
def test_custom_model_fit(rand_data, pre_int_period, post_int_period,
                          monkeypatch):
    fit_mock = mock.Mock()
    monkeypatch.setattr(
        'causalimpact.main.CausalImpact._process_posterior_inferences',
        mock.Mock())

    pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1], :]
    model = UnobservedComponents(endog=pre_data.iloc[:, 0],
                                 level='llevel',
                                 exog=pre_data.iloc[:, 1:])

    model.fit = fit_mock

    CausalImpact(rand_data, pre_int_period, post_int_period, model=model)
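    # The expected bounds below reflect that CausalImpact constrains the
    # level parameter to within a factor of 1.2 of prior_level_sd (default
    # 0.01), leaving all other parameters unbounded.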
    fit_mock.assert_called_with(bounds=[(None, None), (0.01 / 1.2, 0.01 * 1.2),
                                        (None, None), (None, None)],
                                disp=False,
                                nseasons=[],
                                standardize=True)

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=True)
    fit_mock.assert_called_with(bounds=[(None, None), (0.01 / 1.2, 0.01 * 1.2),
                                        (None, None), (None, None)],
                                disp=True,
                                nseasons=[],
                                standardize=True)

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=True,
                 prior_level_sd=0.01)
    fit_mock.assert_called_with(bounds=[(None, None), (0.01 / 1.2, 0.01 * 1.2),
                                        (None, None), (None, None)],
                                disp=True,
                                prior_level_sd=0.01,
                                nseasons=[],
                                standardize=True)

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=True,
                 prior_level_sd=None)
    fit_mock.assert_called_with(bounds=[(None, None), (None, None),
                                        (None, None), (None, None)],
                                disp=True,
                                prior_level_sd=None,
                                nseasons=[],
                                standardize=True)

    model = UnobservedComponents(endog=pre_data.iloc[:, 0],
                                 level='llevel',
                                 exog=pre_data.iloc[:, 1:],
                                 freq_seasonal=[{
                                     'period': 3
                                 }])
    model.fit = fit_mock

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=True,
                 prior_level_sd=0.001)
    fit_mock.assert_called_with(bounds=[
        (None, None), (0.001 / 1.2, 0.001 * 1.2), (None, None), (None, None),
        (None, None)
    ],
                                disp=True,
                                prior_level_sd=0.001,
                                nseasons=[],
                                standardize=True)

    model = UnobservedComponents(endog=pre_data.iloc[:, 0],
                                 level=True,
                                 exog=pre_data.iloc[:, 1],
                                 trend=True,
                                 seasonal=3,
                                 stochastic_level=True)
    model.fit = fit_mock

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=True,
                 prior_level_sd=0.001)
    fit_mock.assert_called_with(bounds=[(0.001 / 1.2, 0.001 * 1.2),
                                        (None, None), (None, None)],
                                disp=True,
                                prior_level_sd=0.001,
                                nseasons=[],
                                standardize=True)

    new_pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1],
                                 ['y', 'x1']]
    model = UnobservedComponents(endog=new_pre_data.iloc[:, 0],
                                 level='llevel',
                                 exog=new_pre_data.iloc[:, 1:])

    model.fit = fit_mock

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=False)
    fit_mock.assert_called_with(bounds=[(None, None), (0.01 / 1.2, 0.01 * 1.2),
                                        (None, None)],
                                disp=False,
                                nseasons=[],
                                standardize=True)

    model = UnobservedComponents(endog=new_pre_data.iloc[:, 0],
                                 level='dtrend',
                                 exog=new_pre_data.iloc[:, 1:])
    model.fit = fit_mock

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=False)
    fit_mock.assert_called_with(bounds=[(None, None), (None, None)],
                                disp=False,
                                nseasons=[],
                                standardize=True)

    model = UnobservedComponents(endog=new_pre_data.iloc[:, 0],
                                 level='lltrend',
                                 exog=new_pre_data.iloc[:, 1:])
    model.fit = fit_mock

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 model=model,
                 disp=False)
    fit_mock.assert_called_with(bounds=[(None, None), (0.01 / 1.2, 0.01 * 1.2),
                                        (None, None), (None, None)],
                                disp=False,
                                nseasons=[],
                                standardize=True)
Example 14
class CausalImpact:
    """
    Causal inference through counterfactual predictions using a Bayesian structural time-series model.
    """
    def __init__(self, data, inter_date, n_seasons=7):
        """Main constructor.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of the same
            type as the data index elements; usually an int or a datetime.date
        :param int n_seasons: number of seasons in the seasonal component of the BSTS model

        """
        # Constructor arguments
        self.data = data.reset_index(drop=True)  # Input data, with a reset index
        self.inter_date = inter_date  # Date of intervention as passed in input
        self.n_seasons = n_seasons  # Number of seasons in the seasonal component of the BSTS model
        # DataFrame holding the results of the BSTS model predictions.
        self.result = None
        # Private attributes for modeling purposes only
        self._input_index = data.index  # Input data index
        self._inter_index = None  # Data intervention date, relative to the reset index
        self._model = None  # statsmodels BSTS model
        self._fit = None  # statsmodels BSTS fitted model
        # Checking input arguments
        self._check_input()
        self._check_model_args()

    def _check_input(self):
        """Check input data.
        """
        try:
            self._inter_index = self._input_index.tolist().index(
                self.inter_date)
        except ValueError:
            raise ValueError(
                'Input intervention date could not be found in data index.')
        self.result = self.data.copy()

    def _check_model_args(self):
        """Check if input arguments are compatible with the data.
        """
        if self.n_seasons < 2:
            raise ValueError(
                'Seasonal component must have a seasonal period of at least 2.'
            )
        if self._inter_index < self.n_seasons:
            raise ValueError(
                'Training data contains fewer samples than the number of '
                'seasons in the BSTS model.'
            )

    def run(self, max_iter=1000, return_df=False):
        """Fit the BSTS model to the data.

        :param int max_iter: max number of iterations in UnobservedComponents.fit (maximum likelihood estimator)
        :param bool return_df: set to `True` if you want this method to return the dataframe of model results

        :return: None or pandas.DataFrame of results
        """
        self._model = UnobservedComponents(
            self.data.loc[:self._inter_index - 1,
                          self._obs_col()].values,
            exog=self.data.loc[:self._inter_index - 1,
                               self._reg_cols()].values,
            level='local linear trend',
            seasonal=self.n_seasons,
        )
        self._fit = self._model.fit(maxiter=max_iter)
        self._get_estimates()
        self._get_difference_estimates()
        self._get_cumulative_estimates()

        if return_df:
            return self.result

    def _get_estimates(self):
        """Extracting model estimate (before and after intervention) as well as 95% confidence interval.
        """
        # Left: model before the intervention date (to evaluate fit quality)
        lpred = self._fit.get_prediction()
        # Right: best prediction of y absent any intervention
        rpred = self._fit.get_forecast(
            steps=self.data.shape[0] - self._inter_index,
            exog=self.data.loc[self._inter_index:, self._reg_cols()])
        # Model prediction
        self.result = self.result.assign(
            pred=np.concatenate([lpred.predicted_mean, rpred.predicted_mean]))

        # 95% confidence interval
        lower_conf_ints = []
        upper_conf_ints = []
        for pred in [lpred, rpred]:
            conf_int = pred.conf_int()
            # As of statsmodels 0.9.0, conf_int() may return a np.ndarray
            # instead of a DataFrame with "lower y" / "upper y" columns
            if isinstance(conf_int, np.ndarray):
                lower_conf_ints.append(conf_int[:, 0])
                upper_conf_ints.append(conf_int[:, 1])
            else:
                lower_conf_ints.append(conf_int.loc[:, 'lower y'].values)
                upper_conf_ints.append(conf_int.loc[:, 'upper y'].values)

        self.result = self.result.assign(
            pred_conf_int_lower=np.concatenate(lower_conf_ints))
        self.result = self.result.assign(
            pred_conf_int_upper=np.concatenate(upper_conf_ints))

    def _get_difference_estimates(self):
        """Extracting the difference between the model prediction and the actuals, as well as the related 95%
        confidence interval.
        """
        # Difference between actuals and model
        self.result = self.result.assign(
            pred_diff=self.data[self._obs_col()].values - self.result['pred'])
        # Confidence interval of the difference
        self.result = self.result.assign(
            pred_diff_conf_int_lower=self.data[self._obs_col()] -
            self.result['pred_conf_int_upper'])
        self.result = self.result.assign(
            pred_diff_conf_int_upper=self.data[self._obs_col()] -
            self.result['pred_conf_int_lower'])

    def _get_cumulative_estimates(self):
        """Extracting estimate of the cumulative impact of the intervention, and its 95% confidence interval.
        """
        # Cumulative sum of modeled impact
        self.result = self.result.assign(cum_impact=0)
        self.result.loc[self._inter_index:, 'cum_impact'] = (
            self.data[self._obs_col()] -
            self.result['pred']).loc[self._inter_index:].cumsum()

        # Confidence interval of the cumulative sum
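        # (Assuming forecast errors are independent across time, variances
        # add, so the half-width of the cumulative-sum CI is the square root
        # of the cumulative sum of squared per-period half-widths.)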
        radius_cumsum = np.sqrt(
            ((self.result['pred'] - self.result['pred_conf_int_lower']
              ).loc[self._inter_index:]**2).cumsum())
        self.result = self.result.assign(cum_impact_conf_int_lower=0,
                                         cum_impact_conf_int_upper=0)
        self.result.loc[self._inter_index:, 'cum_impact_conf_int_lower'] = \
            self.result['cum_impact'].loc[self._inter_index:] - radius_cumsum
        self.result.loc[self._inter_index:, 'cum_impact_conf_int_upper'] = \
            self.result['cum_impact'].loc[self._inter_index:] + radius_cumsum

    def _obs_col(self):
        """Get name of column to be modeled in input data.

        :return: column name
        :rtype: str
        """
        return 'y'

    def _reg_cols(self):
        """Get names of columns used in the regression component of the model.

        :return: the column names
        :rtype: pandas.indexes.base.Index
        """
        return self.data.columns.difference([self._obs_col()])

    def plot_components(self):
        """Plot the estimated components of the model.
        """
        self._fit.plot_components(figsize=(15, 9), legend_loc='lower right')
        plt.show()

    def plot(self, split=False):
        """Produce final impact plots.
        Note: the first few observations are not shown due to approximate diffuse initialization.

        :param bool split: set to `True` to split the plot of the input data into multiple charts. Default: `False`.
        """
        min_t = 2 if self.n_seasons is None else self.n_seasons + 1

        n_plots = 3 + split * len(self._reg_cols())
        grid = gs.GridSpec(n_plots, 1)
        plt.figure(figsize=(15, 4 * n_plots))

        # Observation and regression components
        ax1 = plt.subplot(grid[0, :])
        # Regression components
        for i, col in enumerate(self._reg_cols()):
            plt.plot(self.data[col], label=col)
            if split:  # Creating new subplot if charts should be split
                plt.axvline(self._inter_index, c='k', linestyle='--')
                plt.title(col)
                ax = plt.subplot(grid[i + 1, :], sharex=ax1)
                plt.setp(ax.get_xticklabels(), visible=False)
        # Model and confidence intervals
        plt.plot(self.result['pred'].iloc[min_t:],
                 'r--',
                 linewidth=2,
                 label='model')
        plt.plot(self.data[self._obs_col()],
                 'k',
                 linewidth=2,
                 label=self._obs_col())
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_conf_int_lower'].iloc[min_t:],
            self.result['pred_conf_int_upper'].iloc[min_t:],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.legend(loc='upper left')
        plt.title('Observation vs prediction')

        # Pointwise difference
        ax2 = plt.subplot(grid[-2, :], sharex=ax1)
        plt.plot(self.result['pred_diff'].iloc[min_t:], 'r--', linewidth=2)
        plt.plot(self.data.index,
                 np.zeros(self.data.shape[0]),
                 'g-',
                 linewidth=2)
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_diff_conf_int_lower'].iloc[min_t:],
            self.result['pred_diff_conf_int_upper'].iloc[min_t:],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.setp(ax2.get_xticklabels(), visible=False)
        plt.title('Difference')

        # Cumulative impact
        ax3 = plt.subplot(grid[-1, :], sharex=ax1)
        plt.plot(self.data.index,
                 self.result['cum_impact'],
                 'r--',
                 linewidth=2)
        plt.plot(self.data.index,
                 np.zeros(self.data.shape[0]),
                 'g-',
                 linewidth=2)
        plt.axvline(self._inter_index, c='k', linestyle='--')
        plt.fill_between(
            self.data.index,
            self.result['cum_impact_conf_int_lower'],
            self.result['cum_impact_conf_int_upper'],
            facecolor='gray',
            interpolate=True,
            alpha=0.25,
        )
        plt.axis([self.data.index[0], self.data.index[-1], None, None])
        ax3.set_xticklabels(self._input_index, rotation=45)
        plt.locator_params(axis='x', nbins=min(12, self.data.shape[0]))
        plt.title('Cumulative Impact')
        plt.xlabel('$T$')
        plt.show()
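
# A minimal usage sketch for the class above, on synthetic data (the column
# names, intervention index, and effect size are illustrative assumptions):
def example_causal_impact():
    rng = np.random.default_rng(0)
    x1 = rng.normal(size=100).cumsum()
    y = 1.5 * x1 + rng.normal(size=100)
    y[70:] += 5.0  # artificial intervention effect
    df = pd.DataFrame({'y': y, 'x1': x1})
    ci = CausalImpact(df, inter_date=70, n_seasons=7)
    ci.run(max_iter=1000)
    ci.plot()
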
Example 15
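
# The CausalImpact variants below reference a module-level DEFAULT_ARGS dict
# that this listing omits. A sketch consistent with how it is used (the
# values are assumptions):
DEFAULT_ARGS = {
    'max_iter': 1000,  # assumed: maxiter passed to UnobservedComponents.fit
    'n_seasons': 7,    # assumed: seasonal period of the BSTS model
}
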
class CausalImpact:
    """
    Causal inference through counterfactual predictions using a Bayesian structural time-series model.
    """

    def __init__(self, data, inter_date, model_args=None):
        """Main constructor.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of the same
            type as the data index elements; usually an int or a datetime.date
        :param {str: object} model_args: parameters of the model
            > max_iter: maximum number of iterations of the maximum
              likelihood fit (UnobservedComponents.fit)
            > n_seasons: number of seasons in the seasonal component of the BSTS model

        """
        self.data = None            # Input data, with a reset index
        self.data_index = None      # Data initial index
        self.data_inter = None      # Data intervention date, relative to the reset index
        self.model = None           # statsmodels BSTS model
        self.fit = None             # statsmodels BSTS fitted model
        self.model_args = None      # BSTS model arguments
        # Checking input arguments
        self._check_input(data, inter_date)
        self._check_model_args(model_args)

    def run(self):
        """Fit the BSTS model to the data.
        """
        self.model = UnobservedComponents(
            self.data.loc[:self.data_inter - 1, self._obs_col()].values,
            exog=self.data.loc[:self.data_inter - 1, self._reg_cols()].values,
            level='local linear trend',
            seasonal=self.model_args['n_seasons'],
        )
        self.fit = self.model.fit(
            maxiter=self.model_args['max_iter'],
        )

    def _check_input(self, data, inter_date):
        """Check input data.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of the same
            type as the data index elements; usually an int or a datetime.date
        """
        self.data_index = data.index
        self.data = data.reset_index(drop=True)
        try:
            self.data_inter = self.data_index.tolist().index(inter_date)
        except ValueError:
            raise ValueError('Input intervention date could not be found in data index.')

    def _check_model_args(self, model_args):
        """Check input arguments, and add missing ones if needed.

        :return: the valid dict of arguments
        :rtype: {str: object}
        """
        if model_args is None:
            model_args = {}

        for key, val in DEFAULT_ARGS.items():
            if key not in model_args:
                model_args[key] = val

        self.model_args = model_args

    def _obs_col(self):
        """Get name of column to be modeled in input data.

        :return: column name
        :rtype: str
        """
        return 'y'

    def _reg_cols(self):
        """Get names of columns used in the regression component of the model.

        :return: the column names
        :rtype: pandas.indexes.base.Index
        """
        return self.data.columns.difference([self._obs_col()])

    def plot_components(self):
        """Plot the estimated components of the model.
        """
        self.fit.plot_components(figsize=(15, 9), legend_loc='lower right')
        plt.show()

    def plot(self):
        """Produce final impact plots.
        """
        min_t = 2 if self.model_args['n_seasons'] is None else self.model_args['n_seasons'] + 1
        # Model of the data before the intervention date - allows evaluating fit quality
        pred = self.fit.get_prediction()
        pre_model = pred.predicted_mean
        pre_conf_int = pred.conf_int()
        pre_lower = pre_conf_int['lower y'].values
        pre_upper = pre_conf_int['upper y'].values
        pre_model[:min_t] = np.nan
        pre_lower[:min_t] = np.nan
        pre_upper[:min_t] = np.nan
        # Best prediction of y without any intervention
        post_pred = self.fit.get_forecast(
            steps=self.data.shape[0] - self.data_inter,
            exog=self.data.loc[self.data_inter:, self._reg_cols()]
        )
        post_model = post_pred.predicted_mean
        post_conf_int = post_pred.conf_int()
        post_lower = post_conf_int['lower y'].values
        post_upper = post_conf_int['upper y'].values

        plt.figure(figsize=(15, 12))

        # Observation and regression components
        ax1 = plt.subplot(3, 1, 1)
        for col in self._reg_cols():
            plt.plot(self.data[col], label=col)
        plt.plot(np.concatenate([pre_model, post_model]), 'r--', linewidth=2, label='model')
        plt.plot(self.data[self._obs_col()], 'k', linewidth=2, label=self._obs_col())
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.loc[:self.data_inter - 1].index,
            pre_lower,
            pre_upper,
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.fill_between(
            self.data.loc[self.data_inter:].index,
            post_lower,
            post_upper,
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.legend(loc='upper left')
        plt.title('Observation vs prediction')

        # Pointwise difference
        ax2 = plt.subplot(312, sharex=ax1)
        plt.plot(self.data[self._obs_col()] - np.concatenate([pre_model, post_model]), 'r--', linewidth=2)
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.loc[:self.data_inter - 1].index,
            self.data.loc[:self.data_inter - 1, self._obs_col()] - pre_lower,
            self.data.loc[:self.data_inter - 1, self._obs_col()] - pre_upper,
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.fill_between(
            self.data.loc[self.data_inter:].index,
            self.data.loc[self.data_inter:, self._obs_col()] - post_lower,
            self.data.loc[self.data_inter:, self._obs_col()] - post_upper,
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax2.get_xticklabels(), visible=False)
        plt.title('Difference')

        # Cumulative impact
        ax3 = plt.subplot(313, sharex=ax1)
        plt.plot(
            self.data.loc[self.data_inter:].index,
            (self.data.loc[self.data_inter:, self._obs_col()] - post_model).cumsum(),
            'r--', linewidth=2,
        )
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.loc[self.data_inter:].index,
            (self.data.loc[self.data_inter:, self._obs_col()] - post_lower).cumsum(),
            (self.data.loc[self.data_inter:, self._obs_col()] - post_upper).cumsum(),
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.axis([self.data.index[0], self.data.index[-1], None, None])
        ax3.set_xticklabels(self.data_index)
        plt.title('Cumulative Impact')
        plt.xlabel('$T$')
        plt.show()

        print('Note: the first {} observations are not shown, due to approximate diffuse initialization'.format(min_t))
        
Example 16
def run_ucm(name):
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01', periods=len(dta),
                                         freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()
            
            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        roundtrip = mod.transform_params(
            mod.untransform_params(mod.start_params))
        assert_allclose(mod.start_params, roundtrip)

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5*4, 12*4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5*12, 12*12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        assert_equal(mod.cycle_frequency_bound,
                     (2*np.pi / cycle_period_bounds[1],
                      2*np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)
        assert_allclose(res_true.llf, true['llf'], rtol=rtol, atol=atol)

        # Smoke test for plot_components
        if have_matplotlib:
            fig = res_true.plot_components()
            plt.close(fig)

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            res = mod.fit(disp=-1)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R
            if res.llf <= true['llf']:
                assert_allclose(res.llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
Example 17
def test_compile_posterior_inferences_w_data(data):
    pre_period = [0, 70]
    post_period = [71, 100]

    df_pre = data.loc[pre_period[0]:pre_period[1], :]
    df_post = data.loc[post_period[0]:post_period[1], :]

    post_period_response = None
    alpha = 0.05
    orig_std_params = (0., 1.)

    model = UnobservedComponents(endog=df_pre.iloc[:, 0].values,
                                 level='llevel',
                                 exog=df_pre.iloc[:, 1:].values)

    trained_model = model.fit()

    inferences = compile_posterior(trained_model, data, df_pre, df_post,
                                   post_period_response, alpha,
                                   orig_std_params)

    expected_response = pd.Series(data.iloc[:, 0], name='response')
    assert_series_equal(expected_response, inferences['series']['response'])

    expected_cumsum = pd.Series(np.cumsum(expected_response),
                                name='cum_response')

    assert_series_equal(expected_cumsum, inferences['series']['cum_response'])

    predictor = trained_model.get_prediction()
    forecaster = trained_model.get_forecast(
        steps=len(df_post),
        exog=df_post.iloc[:, 1].values.reshape(-1, 1),
        alpha=alpha)

    pre_pred = predictor.predicted_mean
    post_pred = forecaster.predicted_mean

    point_pred = np.concatenate([pre_pred, post_pred])

    expected_point_pred = pd.Series(point_pred, name='point_pred')
    assert_series_equal(expected_point_pred,
                        inferences['series']['point_pred'])

    pre_ci = pd.DataFrame(predictor.conf_int(alpha=alpha))
    pre_ci.index = df_pre.index
    post_ci = pd.DataFrame(forecaster.conf_int(alpha=alpha))
    post_ci.index = df_post.index

    ci = pd.concat([pre_ci, post_ci])

    expected_pred_upper = ci.iloc[:, 1]
    expected_pred_upper = expected_pred_upper.rename('point_pred_upper')
    expected_pred_lower = ci.iloc[:, 0]
    expected_pred_lower = expected_pred_lower.rename('point_pred_lower')

    assert_series_equal(expected_pred_upper,
                        inferences['series']['point_pred_upper'])
    assert_series_equal(expected_pred_lower,
                        inferences['series']['point_pred_lower'])

    expected_cum_pred = pd.Series(np.cumsum(point_pred), name='cum_pred')
    assert_series_equal(expected_cum_pred, inferences['series']['cum_pred'])

    expected_cum_pred_lower = pd.Series(np.cumsum(expected_pred_lower),
                                        name='cum_pred_lower')
    assert_series_equal(expected_cum_pred_lower,
                        inferences['series']['cum_pred_lower'])

    expected_cum_pred_upper = pd.Series(np.cumsum(expected_pred_upper),
                                        name='cum_pred_upper')
    assert_series_equal(expected_cum_pred_upper,
                        inferences['series']['cum_pred_upper'])

    expected_point_effect = pd.Series(expected_response - expected_point_pred,
                                      name='point_effect')
    assert_series_equal(expected_point_effect,
                        inferences['series']['point_effect'])

    expected_point_effect_lower = pd.Series(expected_response -
                                            expected_pred_lower,
                                            name='point_effect_lower')
    assert_series_equal(expected_point_effect_lower,
                        inferences['series']['point_effect_lower'])

    expected_point_effect_upper = pd.Series(expected_response -
                                            expected_pred_upper,
                                            name='point_effect_upper')
    assert_series_equal(expected_point_effect_upper,
                        inferences['series']['point_effect_upper'])

    expected_cum_effect = pd.Series(np.concatenate(
        (np.zeros(len(df_pre)),
         np.cumsum(expected_point_effect.iloc[len(df_pre):]))),
                                    name='cum_effect')
    assert_series_equal(expected_cum_effect,
                        inferences['series']['cum_effect'])

    expected_cum_effect_lower = pd.Series(np.concatenate(
        (np.zeros(len(df_pre)),
         np.cumsum(expected_point_effect_lower.iloc[len(df_pre):]))),
                                          name='cum_effect_lower')
    assert_series_equal(expected_cum_effect_lower,
                        inferences['series']['cum_effect_lower'])

    expected_cum_effect_upper = pd.Series(np.concatenate(
        (np.zeros(len(df_pre)),
         np.cumsum(expected_point_effect_upper.iloc[len(df_pre):]))),
                                          name='cum_effect_upper')
    assert_series_equal(expected_cum_effect_upper,
                        inferences['series']['cum_effect_upper'])
Example 18
def construct_model(data, model_args=None):
    """Specifies the model and performs inference. Inference means using the data
    to pass from a prior distribution over parameters and states to a posterior
    distribution. In a Bayesian framework, estimating a model means to obtain
    p(parameters | data) from p(data | parameters) and p(parameters). This
    involves multiplying the prior with the likelihood and normalising the
    resulting distribution using the marginal likelihood or model evidence,
    p(data). Computing the evidence poses a virtually intractable
    high-dimensional integration problem which can be turned into an easier
    optimization problem using, for instance, an approximate stochastic
    inference strategy. Here, however, the model is estimated by maximum
    likelihood via statsmodels' UnobservedComponents rather than by MCMC.

    Args:
      data: time series of response variable and optional covariates
      model_args: optional dict of additional model arguments

    Returns:
      an (unfitted) UnobservedComponents model

    """
    from statsmodels.tsa.statespace.structural import UnobservedComponents

    # extract y variable
    y = data.iloc[:, 0]

    # If the series is ill-conditioned, abort inference
    observations_ill_conditioned(y)

    # specification params
    ss = {}
    # Local level
    ss["endog"] = y
    ss["level"] = "llevel"

    # Add seasonal component? Note that UnobservedComponents takes the
    # seasonal period via the `seasonal` keyword (it has no `seasonal_period`
    # argument); one full cycle spans nseasons * season_duration points.
    if model_args["nseasons"] > 1:
        ss["seasonal"] = model_args["nseasons"] * model_args["season_duration"]

    # No regression?
    if len(data.columns) == 1:
        mod = UnobservedComponents(**ss)
        return mod
    else:
        # Static regression?
        if not model_args["dynamic_regression"]:
            ss["exog"] = data.iloc[:, 1:]
            mod = UnobservedComponents(**ss)
            return mod
        # Dynamic regression?
        else:
            """Since we have predictor variables in the model, we need to
            explicitly make their coefficients time-varying using
            AddDynamicRegression(). In Bsts(), we are therefore not giving a
            formula but just the response variable. We are then using SdPrior
            to only specify the prior on the residual standard deviation.

            prior_mean: precision of random walk of coefficients
            sigma_mean_prior = gamma_prior(prior_mean=1, a=4)
            ss = add_dynamic_regression(ss, formula, data=data,
                                        sigma_mean_prior=sigma_mean_prior)
            sd_prior = sd_prior(sigma_guess=model_args["prior_level_sd"] * sdy,
                                upper_limit=0.1 * sdy,
                                sample_size=kDynamicRegressionPriorSampleSize)

            bsts_model = Bsts(y, state_specification=ss,
                              niter=model_args["niter"],
                              expected_model_size=3, ping=0, seed=1,
                              prior=sd_prior)
            """
            raise NotImplementedError()
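
# A minimal usage sketch for construct_model (the data layout and argument
# values are illustrative; observations_ill_conditioned is a validation
# helper from the original module, not shown in this listing):
#
#   model_args = {'nseasons': 1, 'season_duration': 1,
#                 'dynamic_regression': False}
#   mod = construct_model(df, model_args)
#   res = mod.fit()
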
Example 19
def run_ucm(name, use_exact_diffuse=False):
    true = getattr(results_structural, name)

    for model in true['models']:
        kwargs = model.copy()
        kwargs.update(true['kwargs'])
        kwargs['use_exact_diffuse'] = use_exact_diffuse

        # Make a copy of the data
        values = dta.copy()

        freq = kwargs.pop('freq', None)
        if freq is not None:
            values.index = pd.date_range(start='1959-01-01',
                                         periods=len(dta),
                                         freq=freq)

        # Test pandas exog
        if 'exog' in kwargs:
            # Default value here is pd.Series object
            exog = np.log(values['realgdp'])

            # Also allow a check with a 1-dim numpy array
            if kwargs['exog'] == 'numpy':
                exog = exog.values.squeeze()

            kwargs['exog'] = exog

        # Create the model
        mod = UnobservedComponents(values['unemp'], **kwargs)

        # Smoke test for starting parameters, untransform, transform
        # Also test that transform and untransform are inverses
        mod.start_params
        roundtrip = mod.transform_params(
            mod.untransform_params(mod.start_params))
        assert_allclose(mod.start_params, roundtrip)

        # Fit the model at the true parameters
        res_true = mod.filter(true['params'])

        # Check that the cycle bounds were computed correctly
        freqstr = freq[0] if freq is not None else values.index.freqstr[0]
        if 'cycle_period_bounds' in kwargs:
            cycle_period_bounds = kwargs['cycle_period_bounds']
        elif freqstr == 'A':
            cycle_period_bounds = (1.5, 12)
        elif freqstr == 'Q':
            cycle_period_bounds = (1.5 * 4, 12 * 4)
        elif freqstr == 'M':
            cycle_period_bounds = (1.5 * 12, 12 * 12)
        else:
            # If we have no information on data frequency, require the
            # cycle frequency to be between 0 and pi
            cycle_period_bounds = (2, np.inf)

        # Test that the cycle frequency bound is correct
        assert_equal(mod.cycle_frequency_bound,
                     (2 * np.pi / cycle_period_bounds[1],
                      2 * np.pi / cycle_period_bounds[0]))

        # Test that the likelihood is correct
        rtol = true.get('rtol', 1e-7)
        atol = true.get('atol', 0)

        if use_exact_diffuse:
            # If we are using exact diffuse initialization, then we need to
            # adjust for the fact that KFAS does not include the constant in
            # the likelihood function for the diffuse periods
            # (see note to test_exact_diffuse_filtering.py for details).
            res_llf = (res_true.llf_obs.sum() +
                       res_true.nobs_diffuse * 0.5 * np.log(2 * np.pi))
        else:
            # If we are using approximate diffuse initialization, then we need
            # to ignore the first period, and this will agree with KFAS (since
            # it does not include the constant in the likelihood function for
            # diffuse periods).
            res_llf = res_true.llf_obs[res_true.loglikelihood_burn:].sum()

        assert_allclose(res_llf, true['llf'], rtol=rtol, atol=atol)

        # Optional smoke test for plot_components
        try:
            import matplotlib.pyplot as plt
            try:
                from pandas.plotting import register_matplotlib_converters
                register_matplotlib_converters()
            except ImportError:
                pass
            fig = plt.figure()
            res_true.plot_components(fig=fig)
        except ImportError:
            pass

        # Now fit the model via MLE
        with warnings.catch_warnings(record=True):
            fit_kwargs = {}
            if 'maxiter' in true:
                fit_kwargs['maxiter'] = true['maxiter']
            res = mod.fit(start_params=true.get('start_params', None),
                          disp=-1,
                          **fit_kwargs)
            # If we found a higher likelihood, no problem; otherwise check
            # that we're very close to that found by R

            # See the note above about these computations
            if use_exact_diffuse:
                res_llf = (res.llf_obs.sum() +
                           res.nobs_diffuse * 0.5 * np.log(2 * np.pi))
            else:
                res_llf = res.llf_obs[res_true.loglikelihood_burn:].sum()

            if res_llf <= true['llf']:
                assert_allclose(res_llf, true['llf'], rtol=1e-4)

            # Smoke test for summary
            res.summary()
Example 20
class CausalImpact:
    """
    Causal inference through counterfactual predictions using a Bayesian structural time-series model.
    """

    def __init__(self, data, inter_date, model_args=None):
        """Main constructor.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of the same
            type as the data index elements; usually an int or a datetime.date
        :param {str: object} model_args: parameters of the model
            > max_iter: maximum number of iterations of the maximum
              likelihood fit (UnobservedComponents.fit)
            > n_seasons: number of seasons in the seasonal component of the BSTS model

        """
        # Publicly exposed attributes
        self.data = None            # Input data, with a reset index
        self.data_index = None      # Data initial index
        self.data_inter = None      # Data intervention date, relative to the reset index
        self.model_args = None      # BSTS model arguments
        self.result = None          # Impact analysis results (pandas.DataFrame)
        # Private attributes for modeling purposes only
        self._model = None          # statsmodels BSTS model
        self._fit = None            # statsmodels BSTS fitted model
        # Checking input arguments
        self._check_input(data, inter_date)
        self._check_model_args(data, model_args)

    def _check_input(self, data, inter_date):
        """Check input data.

        :param pandas.DataFrame data: input data. Must contain at least 2 columns, one being named 'y'.
            See the README for more details.
        :param object inter_date: date of intervention. Must be of the same type
            as the data index elements, usually an int or a datetime.date.
        """
        self.data_index = data.index
        self.data = data.reset_index(drop=True)
        try:
            self.data_inter = self.data_index.tolist().index(inter_date)
        except ValueError:
            raise ValueError('Input intervention date could not be found in data index.')
        self.result = data.reset_index(drop=False)

    def _check_model_args(self, data, model_args):
        """Check input arguments, and add missing ones if needed.

        :return: the valid dict of arguments
        :rtype: {str: object}
        """
        if model_args is None:
            model_args = {}

        for key, val in DEFAULT_ARGS.items():
            if key not in model_args:
                model_args[key] = val

        if self.data_inter < model_args['n_seasons']:
            raise ValueError('Training data contains fewer samples than the number of seasons in the BSTS model.')

        self.model_args = model_args

    def run(self, return_df=False):
        """Fit the BSTS model to the pre-intervention data and compute the impact estimates.

        :param bool return_df: if True, return the result DataFrame (self.result).
        """
        self._model = UnobservedComponents(
            self.data.loc[:self.data_inter - 1, self._obs_col()].values,
            exog=self.data.loc[:self.data_inter - 1, self._reg_cols()].values,
            level='local linear trend',
            seasonal=self.model_args['n_seasons'],
        )
        self._fit = self._model.fit(
            maxiter=self.model_args['max_iter'],
        )
        self._get_estimates()
        self._get_difference_estimates()
        self._get_cumulative_estimates()

        if return_df:
            return self.result

    def _get_estimates(self):
        """Extracting model estimate (before and after intervention) as well as 95% confidence interval.
        """
        lpred = self._fit.get_prediction()   # Left: in-sample prediction before the intervention (to assess fit quality)
        rpred = self._fit.get_forecast(      # Right: best prediction of y without any intervention
            steps=self.data.shape[0] - self.data_inter,
            exog=self.data.loc[self.data_inter:, self._reg_cols()]
        )
        # Model prediction
        self.result = self.result.assign(pred=np.concatenate([lpred.predicted_mean, rpred.predicted_mean]))

        # 95% confidence interval
        lower_conf_ints = []
        upper_conf_ints = []
        for pred in [lpred, rpred]:
            conf_int = pred.conf_int()
            if isinstance(conf_int, np.ndarray):    # As of 0.9.0, statsmodels returns a np.ndarray here
                lower_conf_ints.append(conf_int[:, 0])
                upper_conf_ints.append(conf_int[:, 1])
            else:                                   # instead of a dataframe with "lower y" and "upper y" columns
                lower_conf_ints.append(conf_int.loc[:, 'lower y'].values)
                upper_conf_ints.append(conf_int.loc[:, 'upper y'].values)

        self.result = self.result.assign(pred_conf_int_lower=np.concatenate(lower_conf_ints))
        self.result = self.result.assign(pred_conf_int_upper=np.concatenate(upper_conf_ints))

    def _get_difference_estimates(self):
        """Extracting the difference between the model prediction and the actuals, as well as the related 95%
        confidence interval.
        """
        # Difference between actuals and model
        self.result = self.result.assign(pred_diff=self.data[self._obs_col()].values - self.result['pred'])
        # Confidence interval of the difference
        self.result = self.result.assign(
            pred_diff_conf_int_lower=self.data[self._obs_col()] - self.result['pred_conf_int_upper']
        )
        self.result = self.result.assign(
            pred_diff_conf_int_upper=self.data[self._obs_col()] - self.result['pred_conf_int_lower']
        )

    def _get_cumulative_estimates(self):
        """Extracting estimate of the cumulative impact of the intervention, and its 95% confidence interval.
        """
        # Cumulative sum of modeled impact
        self.result = self.result.assign(cum_impact=0)
        self.result.loc[self.data_inter:, 'cum_impact'] = (
            self.data[self._obs_col()] - self.result['pred']
        ).loc[self.data_inter:].cumsum()

        # Confidence interval of the cumulative sum
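        # Note: summing squared pointwise radii below assumes the forecast
        # errors are independent across time (so variances add); positively
        # autocorrelated errors would make the true interval wider.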
        radius_cumsum = np.sqrt(
            ((self.result['pred'] - self.result['pred_conf_int_lower']).loc[self.data_inter:] ** 2).cumsum()
        )
        self.result = self.result.assign(cum_impact_conf_int_lower=0, cum_impact_conf_int_upper=0)
        self.result.loc[self.data_inter:, 'cum_impact_conf_int_lower'] = \
            self.result['cum_impact'].loc[self.data_inter:] - radius_cumsum
        self.result.loc[self.data_inter:, 'cum_impact_conf_int_upper'] = \
            self.result['cum_impact'].loc[self.data_inter:] + radius_cumsum

    def _obs_col(self):
        """Get name of column to be modeled in input data.

        :return: column name
        :rtype: str
        """
        return 'y'

    def _reg_cols(self):
        """Get names of columns used in the regression component of the model.

        :return: the column names
        :rtype: pandas.indexes.base.Index
        """
        return self.data.columns.difference([self._obs_col()])

    def plot_components(self):
        """Plot the estimated components of the model.
        """
        self._fit.plot_components(figsize=(15, 9), legend_loc='lower right')
        plt.show()

    def plot(self):
        """Produce final impact plots.
        Note: the first few observations are not shown due to approximate diffuse initialization.
        """
        min_t = 2 if self.model_args['n_seasons'] is None else self.model_args['n_seasons'] + 1

        plt.figure(figsize=(15, 12))

        # Observation and regression components
        ax1 = plt.subplot(3, 1, 1)
        for col in self._reg_cols():
            plt.plot(self.data[col], label=col)
        plt.plot(self.result['pred'].iloc[min_t:], 'r--', linewidth=2, label='model')
        plt.plot(self.data[self._obs_col()], 'k', linewidth=2, label=self._obs_col())
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_conf_int_lower'].iloc[min_t:],
            self.result['pred_conf_int_upper'].iloc[min_t:],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax1.get_xticklabels(), visible=False)
        plt.legend(loc='upper left')
        plt.title('Observation vs prediction')

        # Pointwise difference
        ax2 = plt.subplot(312, sharex=ax1)
        plt.plot(self.result['pred_diff'].iloc[min_t:], 'r--', linewidth=2)
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index[min_t:],
            self.result['pred_diff_conf_int_lower'].iloc[min_t:],
            self.result['pred_diff_conf_int_upper'].iloc[min_t:],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.setp(ax2.get_xticklabels(), visible=False)
        plt.title('Difference')

        # Cumulative impact
        ax3 = plt.subplot(313, sharex=ax1)
        plt.plot(self.data.index, self.result['cum_impact'], 'r--', linewidth=2)
        plt.plot(self.data.index, np.zeros(self.data.shape[0]), 'g-', linewidth=2)
        plt.axvline(self.data_inter, c='k', linestyle='--')
        plt.fill_between(
            self.data.index,
            self.result['cum_impact_conf_int_lower'],
            self.result['cum_impact_conf_int_upper'],
            facecolor='gray', interpolate=True, alpha=0.25,
        )
        plt.axis([self.data.index[0], self.data.index[-1], None, None])
        ax3.set_xticklabels(self.data_index, rotation=45)
        plt.locator_params(axis='x', nbins=min(12, self.data.shape[0]))
        plt.title('Cumulative Impact')
        plt.xlabel('$T$')
        plt.show()
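
A minimal usage sketch for the class above, on simulated data (hypothetical values; DEFAULT_ARGS is assumed to be defined in the same module and to supply a max_iter default):

import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
x = rng.normal(size=100).cumsum()
y = 1.5 * x + rng.normal(scale=0.5, size=100)
y[70:] += 5  # simulated intervention effect starting at t=70
frame = pd.DataFrame({'y': y, 'x': x})

ci = CausalImpact(frame, inter_date=70, model_args={'n_seasons': 7})
result = ci.run(return_df=True)  # pred / pred_diff / cum_impact columns, with CIs
ci.plot()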
Example n. 21
def test_matrices_somewhat_complicated_model():
    values = dta.copy()

    model = UnobservedComponents(values['unemp'],
                                 level='lltrend',
                                 freq_seasonal=[{'period': 4},
                                                {'period': 9, 'harmonics': 3}],
                                 cycle=True,
                                 cycle_period_bounds=[2, 30],
                                 damped_cycle=True,
                                 stochastic_freq_seasonal=[True, False],
                                 stochastic_cycle=True
                                 )
    # Selected parameters
    params = [1,  # irregular_var
              3, 4,  # lltrend parameters:  level_var, trend_var
              5,   # freq_seasonal parameters: freq_seasonal_var_0
              # cycle parameters: cycle_var, cycle_freq, cycle_damp
              6, 2*np.pi/30., .9
              ]
    model.update(params)

    # Check scalar properties
    assert_equal(model.k_states, 2 + 4 + 6 + 2)
    assert_equal(model.k_state_cov, 2 + 1 + 0 + 1)
    assert_equal(model.loglikelihood_burn, 2 + 4 + 6 + 2)
    assert_allclose(model.ssm.k_posdef, 2 + 4 + 0 + 2)
    assert_equal(model.k_params, len(params))

    # Check the statespace model matrices against hand-constructed answers
    # We group the terms by the component
    expected_design = np.r_[[1, 0],
                            [1, 0, 1, 0],
                            [1, 0, 1, 0, 1, 0],
                            [1, 0]].reshape(1, 14)
    assert_allclose(model.ssm.design[:, :, 0], expected_design)

    expected_transition = __direct_sum([
        np.array([[1, 1],
                  [0, 1]]),
        np.array([[0, 1, 0, 0],
                  [-1, 0, 0, 0],
                  [0, 0, -1,  0],
                  [0, 0,  0, -1]]),
        np.array([[np.cos(2*np.pi*1/9.), np.sin(2*np.pi*1/9.), 0, 0, 0, 0],
                  [-np.sin(2*np.pi*1/9.), np.cos(2*np.pi*1/9.), 0, 0, 0, 0],
                  [0, 0,  np.cos(2*np.pi*2/9.), np.sin(2*np.pi*2/9.), 0, 0],
                  [0, 0, -np.sin(2*np.pi*2/9.), np.cos(2*np.pi*2/9.), 0, 0],
                  [0, 0, 0, 0,  np.cos(2*np.pi/3.), np.sin(2*np.pi/3.)],
                  [0, 0, 0, 0, -np.sin(2*np.pi/3.), np.cos(2*np.pi/3.)]]),
        np.array([[.9*np.cos(2*np.pi/30.), .9*np.sin(2*np.pi/30.)],
                 [-.9*np.sin(2*np.pi/30.), .9*np.cos(2*np.pi/30.)]])
    ])
    assert_allclose(
        model.ssm.transition[:, :, 0], expected_transition, atol=1e-7)

    # Since the second seasonal term is not stochastic,
    # the dimensionality of the state disturbance is 14 - 6 = 8
    expected_selection = np.zeros((14, 14 - 6))
    expected_selection[0:2, 0:2] = np.eye(2)
    expected_selection[2:6, 2:6] = np.eye(4)
    expected_selection[-2:, -2:] = np.eye(2)
    assert_allclose(model.ssm.selection[:, :, 0], expected_selection)

    expected_state_cov = __direct_sum([
        np.diag(params[1:3]),
        np.eye(4)*params[3],
        np.eye(2)*params[4]
    ])
    assert_allclose(model.ssm.state_cov[:, :, 0], expected_state_cov)
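
The test above calls a __direct_sum helper that is not shown in this snippet. A minimal sketch of what it is assumed to do, namely block-diagonal concatenation of the component matrices:

from scipy.linalg import block_diag

def __direct_sum(matrices):
    # The direct sum of square matrices is their block-diagonal stacking.
    return block_diag(*matrices)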
Example n. 22
    def fit(self, X, y=None):
        # Cap values at the 75th percentile if requested via the '*f' suffix
        self.X = X
        mode = self.mode
        if '*f' in self.mode:
            self.X = np.minimum(X, np.percentile(X, 75))
            mode = self.mode.partition('*f')[0]
        # Apply a transformation if requested via a '*ln' (log1p) or '*bc' (Box-Cox) suffix
        if '*ln' in self.mode:
            self.X = np.log(np.array(X) + 1)
            mode = self.mode.partition('*ln')[0]
        elif '*bc' in self.mode:
            transformer = pm.preprocessing.BoxCoxEndogTransformer()
            # pmdarima endog transformers return a (y, exog) tuple
            self.X, _ = transformer.fit_transform(y=X)
            self.transformer = transformer
            mode = self.mode.partition('*bc')[0]

        try:
            if mode == 'll':
                # Local Level
                model = LocalLevel(self.X)
                self.res_ = model.fit(disp=False)
                self.k_exog = None
            elif mode == 'lla':
                # Local level with the two previous observations as regressors
                endog = X[2:]
                exog = np.column_stack((X[1:-1], X[:-2]))
                self.k_exog = exog.shape[1]
                model = UnobservedComponents(endog=endog,
                                             exog=exog,
                                             level='local level')
                self.res_ = model.fit(disp=False)
            elif mode == 'lls':
                # AR(2) with constant and measurement error, in state space form
                self.k_exog = None
                model = SARIMAX(endog=self.X,
                                order=(2, 0, 0),
                                trend='c',
                                measurement_error=True)
                self.res_ = model.fit(disp=False)
            elif mode == 'llt':
                # Local Linear Trend
                model = UnobservedComponents(endog=self.X,
                                             level='local linear trend')
                self.res_ = model.fit(disp=False)
            elif mode == 'llc':
                # Local Level Cycle
                model = UnobservedComponents(endog=self.X,
                                             level='local level',
                                             cycle=True,
                                             stochastic_cycle=True)
                self.res_ = model.fit(disp=False)
            elif mode == 'arima':
                self.res_ = pm.auto_arima(self.X,
                                          start_p=1,
                                          start_q=1,
                                          start_P=1,
                                          start_Q=1,
                                          max_p=5,
                                          max_q=5,
                                          max_P=5,
                                          max_Q=5,
                                          seasonal=True,
                                          stepwise=True,
                                          suppress_warnings=True,
                                          D=10,
                                          max_D=10,
                                          error_action='ignore')
            elif mode == 'rw1':
                # For RW model
                self.res_ = None
                self.converged = False
        except np.linalg.LinAlgError:
            # Kalman filter error ==> fall back to the random walk model
            print(f'Convergence failed for {mode}')
            self.converged = False
            return self
        try:
            self.converged = self.res_.mle_retvals['converged']
        except AttributeError:
            if mode == 'arima':
                self.converged = True  # auto ARIMA from pmdarima should always converge
        return self
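
Hypothetical usage of the fit method above (the enclosing estimator class is not shown, so its name and constructor here are assumptions): the mode string selects a model ('ll', 'lla', 'lls', 'llt', 'llc', 'arima', 'rw1'), optionally followed by a preprocessing suffix ('*f', '*ln', '*bc').

import numpy as np

series = np.random.rand(100).cumsum()
fitter = StateSpaceFitter(mode='llt*ln')  # hypothetical class name
fitter.fit(series)
if fitter.converged:
    forecast = fitter.res_.forecast(steps=12)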
Example n. 23
def test_default_model_fit(rand_data, pre_int_period, post_int_period,
                           monkeypatch):
    pre_data = rand_data.loc[pre_int_period[0]:pre_int_period[1], :]
    fit_mock = mock.Mock()
    model = UnobservedComponents(endog=pre_data.iloc[:, 0],
                                 level='llevel',
                                 exog=pre_data.iloc[:, 1:])

    model.fit = fit_mock

    construct_mock = mock.Mock(return_value=model)

    monkeypatch.setattr('causalimpact.main.CausalImpact._get_default_model',
                        construct_mock)
    monkeypatch.setattr(
        'causalimpact.main.CausalImpact._process_posterior_inferences',
        mock.Mock())

    CausalImpact(rand_data, pre_int_period, post_int_period)
    model.fit.assert_called_with(bounds=[(None, None), (0.01 / 1.2, 0.01 * 1.2),
                                         (None, None), (None, None)],
                                 disp=False,
                                 nseasons=[],
                                 standardize=True)

    CausalImpact(rand_data, pre_int_period, post_int_period, disp=True)
    model.fit.assert_called_with(bounds=[(None, None), (0.01 / 1.2, 0.01 * 1.2),
                                         (None, None), (None, None)],
                                 disp=True,
                                 nseasons=[],
                                 standardize=True)

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 disp=True,
                 prior_level_sd=0.1)
    model.fit.assert_called_with(bounds=[(None, None), (0.1 / 1.2, 0.1 * 1.2),
                                         (None, None), (None, None)],
                                 disp=True,
                                 prior_level_sd=0.1,
                                 nseasons=[],
                                 standardize=True)

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 disp=True,
                 prior_level_sd=None)
    model.fit.assert_called_with(bounds=[(None, None), (None, None),
                                         (None, None), (None, None)],
                                 disp=True,
                                 prior_level_sd=None,
                                 nseasons=[],
                                 standardize=True)

    model = UnobservedComponents(endog=pre_data.iloc[:, 0],
                                 level='llevel',
                                 exog=pre_data.iloc[:, 1:],
                                 freq_seasonal=[{
                                     'period': 3
                                 }])

    model.fit = fit_mock

    construct_mock = mock.Mock(return_value=model)

    monkeypatch.setattr('causalimpact.main.CausalImpact._get_default_model',
                        construct_mock)

    CausalImpact(rand_data,
                 pre_int_period,
                 post_int_period,
                 disp=True,
                 prior_level_sd=0.001,
                 nseasons=[{
                     'period': 3
                 }])
    model.fit.assert_called_with(bounds=[(None, None),
                                         (0.001 / 1.2, 0.001 * 1.2),
                                         (None, None), (None, None),
                                         (None, None)],
                                 disp=True,
                                 prior_level_sd=0.001,
                                 nseasons=[{
                                     'period': 3
                                 }],
                                 standardize=True)

    model = UnobservedComponents(endog=pre_data.iloc[:, 0], level='llevel')

    model.fit = fit_mock

    construct_mock = mock.Mock(return_value=model)

    monkeypatch.setattr('causalimpact.main.CausalImpact._get_default_model',
                        construct_mock)

    new_data = pd.DataFrame(np.random.randn(200, 1), columns=['y'])
    CausalImpact(new_data, pre_int_period, post_int_period, disp=False)
    model.fit.assert_called_with(bounds=[(None, None),
                                         (0.01 / 1.2, 0.01 * 1.2)],
                                 disp=False,
                                 nseasons=[],
                                 standardize=True)
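
The test above relies on pytest fixtures that are not shown in this snippet. A sketch of what they are assumed to provide (the real fixtures live in the project's conftest.py and may differ):

import numpy as np
import pandas as pd
import pytest

@pytest.fixture
def rand_data():
    # One response column 'y' plus two covariates.
    return pd.DataFrame(np.random.randn(200, 3), columns=['y', 'x1', 'x2'])

@pytest.fixture
def pre_int_period():
    return [0, 99]

@pytest.fixture
def post_int_period():
    return [100, 199]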