def summary(summary_data: pd.DataFrame,
            p_value: float,
            alpha: float = 0.05,
            output: str = 'summary',
            digits: int = 2) -> str:
    """
    Returns final results from the causal impact analysis, such as the absolute
    observed effect, the relative effect between prediction and observed variable,
    and cumulative performance in the post-intervention period, among other metrics.

    Args
    ----
      summary_data: pd.DataFrame
          Contains information such as means and cumulative averages.
      p_value: float
          p-value of the test for the presence of signal in the data.
      alpha: float
          Sets credible interval width.
      output: str
          Can be either "summary" or "report". The former is a simpler output that
          just reports general metrics such as the expected absolute or relative
          effect.
      digits: int
          Defines the number of digits after the decimal point to round to.
          For `digits=2`, the value 1.566 becomes 1.57.

    Returns
    -------
      summary: str
          Contains results of the causal impact analysis.

    Raises
    ------
      ValueError: If input `output` is not either 'summary' or 'report'.
    """
    if output not in {'summary', 'report'}:
        raise ValueError('Please choose either summary or report for output.')
    if output == 'summary':
        summary = SUMMARY_TMPL.render(
            summary=summary_data.to_dict(),
            alpha=alpha,
            z_score=get_z_score(1 - alpha / 2.),
            p_value=p_value,
            digits=digits)
    else:
        summary = REPORT_TMPL.render(
            summary=summary_data.to_dict(),
            alpha=alpha,
            p_value=p_value,
            digits=digits)
    return summary

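# A small, hedged usage sketch for `summary` above (not part of the library): it only
# exercises the `output` validation, which fires before any template rendering, so the
# placeholder DataFrame contents never matter. Assumes pytest is the test runner.
import pandas as pd
import pytest


def test_summary_rejects_unknown_output():
    placeholder_summary_data = pd.DataFrame({'average': [0.], 'cumulative': [0.]})
    with pytest.raises(ValueError):
        summary(placeholder_summary_data, p_value=0.5, output='table')
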
def summary(self, output='summary', digits=2):
    """
    Returns final results from the causal impact analysis, such as the absolute
    observed effect, the relative effect between prediction and observed variable,
    and cumulative performance in the post-intervention period, among other metrics.

    Args
    ----
      output: str.
          Can be either "summary" or "report". The former is a simpler output that
          just reports general metrics such as the expected absolute or relative
          effect.
      digits: int.
          Defines the number of digits after the decimal point to round to. For
          digits=2, the value 1.566 becomes 1.57.

    Returns
    -------
      summary: str.
          Contains results of the causal impact analysis.

    Raises
    ------
      RuntimeError: if `self.summary_data` is None, meaning the post-inference
          compilation has not been performed yet.
      ValueError: if `output` is not either 'summary' or 'report'.
    """
    if self.summary_data is None:
        raise RuntimeError('Posterior inferences must be first computed before '
                           'running summary.')
    if output not in {'summary', 'report'}:
        raise ValueError('Please choose either summary or report for output.')
    if output == 'summary':
        summary = SUMMARY_TMPL.render(
            summary=self.summary_data.to_dict(),
            alpha=self.alpha,
            z_score=get_z_score(1 - self.alpha / 2.),
            p_value=self.p_value,
            digits=digits)
    else:
        summary = REPORT_TMPL.render(
            summary=self.summary_data.to_dict(),
            alpha=self.alpha,
            p_value=self.p_value,
            digits=digits)
    return summary

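# Hedged end-to-end sketch of how this method is typically reached. It assumes the
# enclosing class is the package's `CausalImpact`, constructed as
# CausalImpact(data, pre_period, post_period), which runs the posterior inference on
# instantiation; the column name 'y', the noise, and the period boundaries below are
# made up for illustration.
import numpy as np
import pandas as pd

np.random.seed(0)
data = pd.DataFrame({'y': np.random.randn(100).cumsum() + 100.})
data.iloc[70:, 0] += 5.  # hypothetical lift after the intervention at point 70
ci = CausalImpact(data, pre_period=[0, 69], post_period=[70, 99])
print(ci.summary())           # compact table of average/cumulative effects
print(ci.summary('report'))   # full natural-language report
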
def test_get_z_score():
    assert get_z_score(0.5) == 0.
    assert round(get_z_score(0.9177), 2) == 1.39

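# The expected values in `test_get_z_score` are consistent with `get_z_score` being
# the standard normal quantile (inverse CDF). The scipy cross-check below is a sketch
# under that assumption and is not part of the library's test suite.
from scipy import stats

assert abs(stats.norm.ppf(0.5)) < 1e-12          # median of N(0, 1) is 0
assert round(stats.norm.ppf(0.9177), 2) == 1.39  # matches the library expectation
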
def test_compile_posterior_inferences():
    data = pd.DataFrame(np.arange(10))
    pre_data = data.iloc[:3]
    post_data = data.iloc[7:]
    one_step_mean = 3
    one_step_stddev = 1.5
    posterior_mean = 7.5
    posterior_stddev = 1.5
    alpha = 0.05
    mu = 1
    sig = 2
    mu_sig = (mu, sig)
    niter = 10

    class OneStepDist:
        def mean(self):
            return np.ones((len(pre_data), 1)) * one_step_mean

        def stddev(self):
            return np.ones((len(pre_data), 1)) * one_step_stddev

    class PosteriorDist:
        def sample(self, niter):
            tmp = tf.convert_to_tensor(
                np.tile(np.arange(start=7.1, stop=10.1, step=1), (niter, 1)) +
                np.arange(niter).reshape(-1, 1),
                dtype=np.float32)
            tmp = tmp[..., tf.newaxis]
            return tmp

        def mean(self):
            return np.ones((len(post_data), 1)) * posterior_mean

        def stddev(self):
            return np.ones((len(post_data), 1)) * posterior_stddev

    one_step_dist = OneStepDist()
    posterior_dist = PosteriorDist()
    inferences = inferrer.compile_posterior_inferences(pre_data, post_data,
                                                       one_step_dist,
                                                       posterior_dist, mu_sig,
                                                       alpha=alpha, niter=niter)
    expected_index = np.array([0, 1, 2, 7, 8, 9])

    # test complete_preds_means
    expec_complete_preds_means = pd.DataFrame(
        data=np.array([7, 7, 7, 16, 16, 16]),
        index=expected_index,
        dtype=np.float64,
        columns=['complete_preds_means'])
    pd.testing.assert_series_equal(
        expec_complete_preds_means['complete_preds_means'],
        inferences['complete_preds_means'])

    # test complete_preds_lower
    pre_preds_lower = (np.array([1, 1, 1]) * one_step_mean -
                       get_z_score(1 - alpha / 2) * one_step_stddev) * sig + mu
    pre_preds_lower[
        np.abs(pre_preds_lower) > np.quantile(pre_preds_lower, 0.5) +
        3 * np.std(pre_preds_lower)] = np.nan
    post_preds_lower = (
        np.array([1, 1, 1]) * posterior_mean -
        get_z_score(1 - alpha / 2) * posterior_stddev) * sig + mu
    expec_complete_preds_lower = np.concatenate(
        [pre_preds_lower, post_preds_lower])
    expec_complete_preds_lower = pd.DataFrame(
        data=expec_complete_preds_lower,
        index=expected_index,
        dtype=np.float64,
        columns=['complete_preds_lower'])
    pd.testing.assert_series_equal(
        expec_complete_preds_lower['complete_preds_lower'],
        inferences['complete_preds_lower'])

    # test complete_preds_upper
    pre_preds_upper = (np.array([1, 1, 1]) * one_step_mean +
                       get_z_score(1 - alpha / 2) * one_step_stddev) * sig + mu
    pre_preds_upper[
        np.abs(pre_preds_upper) > np.quantile(pre_preds_upper, 0.5) +
        3 * np.std(pre_preds_upper)] = np.nan
    post_preds_upper = (
        np.array([1, 1, 1]) * posterior_mean +
        get_z_score(1 - alpha / 2) * posterior_stddev) * sig + mu
    expec_complete_preds_upper = np.concatenate(
        [pre_preds_upper, post_preds_upper])
    expec_complete_preds_upper = pd.DataFrame(
        data=expec_complete_preds_upper,
        index=expected_index,
        dtype=np.float64,
        columns=['complete_preds_upper'])
    pd.testing.assert_series_equal(
        expec_complete_preds_upper['complete_preds_upper'],
        inferences['complete_preds_upper'])

    # test post_preds_means
    expec_post_preds_means = pd.DataFrame(
        data=np.array([np.nan] * 3 +
                      [posterior_mean * sig + mu] * len(pre_data)),
        index=expected_index,
        dtype=np.float64,
        columns=['post_preds_means'])
    pd.testing.assert_series_equal(
        expec_post_preds_means['post_preds_means'],
        inferences['post_preds_means'])

    # test post_preds_lower
    post_preds_lower = (
        np.array([np.nan] * 3 + [1, 1, 1]) * posterior_mean -
        get_z_score(1 - alpha / 2) * posterior_stddev) * sig + mu
    expec_post_preds_lower = pd.DataFrame(
        data=post_preds_lower,
        index=expected_index,
        dtype=np.float64,
        columns=['post_preds_lower'])
    pd.testing.assert_series_equal(
        expec_post_preds_lower['post_preds_lower'],
        inferences['post_preds_lower'])

    # test post_preds_upper
    post_preds_upper = (
        np.array([np.nan] * 3 + [1, 1, 1]) * posterior_mean +
        get_z_score(1 - alpha / 2) * posterior_stddev) * sig + mu
    expec_post_preds_upper = pd.DataFrame(
        data=post_preds_upper,
        index=expected_index,
        dtype=np.float64,
        columns=['post_preds_upper'])
    pd.testing.assert_series_equal(
        expec_post_preds_upper['post_preds_upper'],
        inferences['post_preds_upper'])

    # test post_cum_y
    post_cum_y = np.concatenate([[np.nan] * (len(pre_data) - 1) + [0],
                                 np.cumsum(post_data.iloc[:, 0])])
    expec_post_cum_y = pd.DataFrame(
        data=post_cum_y,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_y'])
    pd.testing.assert_series_equal(
        expec_post_cum_y['post_cum_y'],
        inferences['post_cum_y'])

    # test post_cum_preds_means
    expec_post_cum_preds_means = np.cumsum(expec_post_preds_means)
    expec_post_cum_preds_means.rename(
        columns={'post_preds_means': 'post_cum_preds_means'}, inplace=True)
    expec_post_cum_preds_means['post_cum_preds_means'][len(pre_data) - 1] = 0
    pd.testing.assert_series_equal(
        expec_post_cum_preds_means['post_cum_preds_means'],
        inferences['post_cum_preds_means'])

    # test post_cum_preds_lower
    post_cum_preds_lower, post_cum_preds_upper = np.percentile(
        np.cumsum(
            maybe_unstandardize(np.squeeze(posterior_dist.sample(niter)),
                                mu_sig),
            axis=1),
        [100 * alpha / 2, 100 - 100 * alpha / 2],
        axis=0)
    post_cum_preds_lower = np.concatenate(
        [np.array([np.nan] * (len(pre_data) - 1) + [0]), post_cum_preds_lower])
    expec_post_cum_preds_lower = pd.DataFrame(
        data=post_cum_preds_lower,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_preds_lower'])
    pd.testing.assert_series_equal(
        expec_post_cum_preds_lower['post_cum_preds_lower'],
        inferences['post_cum_preds_lower'])

    # test post_cum_preds_upper
    post_cum_preds_upper = np.concatenate(
        [np.array([np.nan] * (len(pre_data) - 1) + [0]), post_cum_preds_upper])
    expec_post_cum_preds_upper = pd.DataFrame(
        data=post_cum_preds_upper,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_preds_upper'])
    pd.testing.assert_series_equal(
        expec_post_cum_preds_upper['post_cum_preds_upper'],
        inferences['post_cum_preds_upper'])

    # test point_effects_means
    net_data = pd.concat([pre_data, post_data])
    expec_point_effects_means = net_data.iloc[:, 0] - inferences[
        'complete_preds_means']
    expec_point_effects_means = pd.DataFrame(
        data=expec_point_effects_means,
        index=expected_index,
        dtype=np.float64,
        columns=['point_effects_means'])
    pd.testing.assert_series_equal(
        expec_point_effects_means['point_effects_means'],
        inferences['point_effects_means'])

    # test point_effects_lower
    expec_point_effects_lower = net_data.iloc[:, 0] - inferences[
        'complete_preds_upper']
    expec_point_effects_lower = pd.DataFrame(
        data=expec_point_effects_lower,
        index=expected_index,
        dtype=np.float64,
        columns=['point_effects_lower'])
    pd.testing.assert_series_equal(
        expec_point_effects_lower['point_effects_lower'],
        inferences['point_effects_lower'])

    # test point_effects_upper
    expec_point_effects_upper = net_data.iloc[:, 0] - inferences[
        'complete_preds_lower']
    expec_point_effects_upper = pd.DataFrame(
        data=expec_point_effects_upper,
        index=expected_index,
        dtype=np.float64,
        columns=['point_effects_upper'])
    pd.testing.assert_series_equal(
        expec_point_effects_upper['point_effects_upper'],
        inferences['point_effects_upper'])

    # test post_cum_effects_means
    post_effects_means = post_data.iloc[:, 0] - inferences['post_preds_means']
    post_effects_means.iloc[len(pre_data) - 1] = 0
    expec_post_cum_effects_means = np.cumsum(post_effects_means)
    expec_post_cum_effects_means = pd.DataFrame(
        data=expec_post_cum_effects_means,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_effects_means'])
    pd.testing.assert_series_equal(
        expec_post_cum_effects_means['post_cum_effects_means'],
        inferences['post_cum_effects_means'])

    # test post_cum_effects_lower
    post_cum_effects_lower, post_cum_effects_upper = np.percentile(
        np.cumsum(
            post_data.iloc[:, 0].values -
            maybe_unstandardize(np.squeeze(posterior_dist.sample(niter)),
                                mu_sig),
            axis=1),
        [100 * alpha / 2, 100 - 100 * alpha / 2],
        axis=0)
    post_cum_effects_lower = np.concatenate([
        np.array([np.nan] * (len(pre_data) - 1) + [0]), post_cum_effects_lower
    ])
    expec_post_cum_effects_lower = pd.DataFrame(
        data=post_cum_effects_lower,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_effects_lower'])
    pd.testing.assert_series_equal(
        expec_post_cum_effects_lower['post_cum_effects_lower'],
        inferences['post_cum_effects_lower'])

    # test post_cum_effects_upper
    post_cum_effects_upper = np.concatenate([
        np.array([np.nan] * (len(pre_data) - 1) + [0]), post_cum_effects_upper
    ])
    expec_post_cum_effects_upper = pd.DataFrame(
        data=post_cum_effects_upper,
        index=expected_index,
        dtype=np.float64,
        columns=['post_cum_effects_upper'])
    pd.testing.assert_series_equal(
        expec_post_cum_effects_upper['post_cum_effects_upper'],
        inferences['post_cum_effects_upper'])

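# Where the expected means 7 and 16 in the test above come from: the mocked
# distributions live in standardized space, and `compile_posterior_inferences` maps
# them back via `maybe_unstandardize`, which (per the expressions in this test)
# amounts to value * sig + mu with (mu, sig) = (1, 2).
assert 3 * 2 + 1 == 7      # pre-period one-step mean
assert 7.5 * 2 + 1 == 16   # post-period forecast mean
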
def compile_posterior_inferences(pre_data: pd.DataFrame,
                                 post_data: pd.DataFrame,
                                 one_step_dist: tfd.Distribution,
                                 posterior_dist: tfd.Distribution,
                                 mu_sig: Optional[Tuple[float, float]],
                                 alpha: float = 0.05,
                                 niter: int = 1000) -> pd.DataFrame:
    """
    Uses the posterior distribution of the structural time series probabilistic
    model to run predictions and forecasts for observed data. Results are stored
    for later use by the summary and plotting functionalities.

    Args
    ----
      pre_data: pd.DataFrame
          The original (non-standardized) input data for the pre-intervention
          period.
      post_data: pd.DataFrame
          Same as `pre_data`, the original non-standardized input data for the
          post-intervention period.
      one_step_dist: tfd.Distribution
          Uses posterior parameters to run one-step predictions on past observed
          data.
      posterior_dist: tfd.Distribution
          Uses posterior parameters to run forecasts on post-intervention data.
      mu_sig: Optional[Tuple[float, float]]
          First value is the mean used for standardization and the second value is
          the standard deviation.
      alpha: float
          Sets confidence interval size.
      niter: int
          Total number of MCMC samples to draw from the posterior structural model.

    Returns
    -------
      inferences: pd.DataFrame
          Final DataFrame with all data related to one-step predictions and
          forecasts.
    """
    lower_percen, upper_percen = get_lower_upper_percentiles(alpha)
    z_score = get_z_score(1 - alpha / 2)
    # Integrates pre and post index for cumulative index data.
    cum_index = build_cum_index(pre_data.index, post_data.index)
    # We create a pd.Series with a single 0 (zero) value to work as the initial
    # value when computing the cumulative inferences. Without this value the
    # plotting of cumulative data breaks at the initial point.
    zero_series = pd.Series([0])
    simulated_ys = posterior_dist.sample(niter)  # shape (niter, n_forecasts, 1)
    simulated_ys = maybe_unstandardize(
        np.squeeze(simulated_ys.numpy()), mu_sig)  # shape (niter, n_forecasts)
    # Pre inference
    pre_preds_means = one_step_dist.mean()
    pre_preds_stds = one_step_dist.stddev()
    # The first points in the pre-data predictions can be quite noisy due to the
    # lack of observed data preceding them. We try to remove those by applying a
    # filter that discards all points whose standard deviation falls above 3
    # standard deviations from the 50% quantile of the array of prediction standard
    # deviations, replacing them with `np.nan`.
    pre_preds_stds = tf.where(
        tf.math.greater(
            tf.abs(pre_preds_stds),
            np.quantile(pre_preds_stds, 0.5) +
            3 * tf.math.reduce_std(pre_preds_stds)),
        np.nan,
        pre_preds_stds)
    pre_preds_lower = pd.Series(
        np.squeeze(
            maybe_unstandardize(pre_preds_means - z_score * pre_preds_stds,
                                mu_sig)),
        index=pre_data.index)
    pre_preds_upper = pd.Series(
        np.squeeze(
            maybe_unstandardize(pre_preds_means + z_score * pre_preds_stds,
                                mu_sig)),
        index=pre_data.index)
    pre_preds_means = pd.Series(
        np.squeeze(maybe_unstandardize(pre_preds_means, mu_sig)),
        index=pre_data.index)
    # Post inference
    post_preds_means = posterior_dist.mean()
    post_preds_stds = posterior_dist.stddev()
    post_preds_lower = pd.Series(
        np.squeeze(
            maybe_unstandardize(post_preds_means - z_score * post_preds_stds,
                                mu_sig)),
        index=post_data.index)
    post_preds_upper = pd.Series(
        np.squeeze(
            maybe_unstandardize(post_preds_means + z_score * post_preds_stds,
                                mu_sig)),
        index=post_data.index)
    post_preds_means = pd.Series(
        np.squeeze(maybe_unstandardize(post_preds_means, mu_sig)),
        index=post_data.index)
    # Concatenations
    complete_preds_means = pd.concat([pre_preds_means, post_preds_means])
    complete_preds_lower = pd.concat([pre_preds_lower, post_preds_lower])
    complete_preds_upper = pd.concat([pre_preds_upper, post_preds_upper])
    # Cumulative
    post_cum_y = np.cumsum(post_data.iloc[:, 0])
    post_cum_y = pd.concat([zero_series, post_cum_y], axis=0)
    post_cum_y.index = cum_index
    post_cum_preds_means = np.cumsum(post_preds_means)
    post_cum_preds_means = pd.concat([zero_series, post_cum_preds_means])
    post_cum_preds_means.index = cum_index
    post_cum_preds_lower, post_cum_preds_upper = np.percentile(
        np.cumsum(simulated_ys, axis=1),
        [lower_percen, upper_percen],
        axis=0)
    # Sets index properly.
    post_cum_preds_lower = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_preds_lower])),
        index=cum_index)
    post_cum_preds_upper = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_preds_upper])),
        index=cum_index)
    # Using a net value of data to accommodate cases where there are gaps between
    # the pre and post intervention periods.
    net_data = pd.concat([pre_data, post_data])
    # Point effects
    point_effects_means = net_data.iloc[:, 0] - complete_preds_means
    point_effects_upper = net_data.iloc[:, 0] - complete_preds_lower
    point_effects_lower = net_data.iloc[:, 0] - complete_preds_upper
    post_point_effects_means = post_data.iloc[:, 0] - post_preds_means
    # Cumulative point effects analysis
    post_cum_effects_means = np.cumsum(post_point_effects_means)
    post_cum_effects_means = pd.concat([zero_series, post_cum_effects_means])
    post_cum_effects_means.index = cum_index
    post_cum_effects_lower, post_cum_effects_upper = np.percentile(
        np.cumsum(post_data.iloc[:, 0].values - simulated_ys, axis=1),
        [lower_percen, upper_percen],
        axis=0)
    # Sets index properly.
    post_cum_effects_lower = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_effects_lower])),
        index=cum_index)
    post_cum_effects_upper = pd.Series(
        np.squeeze(np.concatenate([[0], post_cum_effects_upper])),
        index=cum_index)

    inferences = pd.concat([
        complete_preds_means,
        complete_preds_lower,
        complete_preds_upper,
        post_preds_means,
        post_preds_lower,
        post_preds_upper,
        post_cum_y,
        post_cum_preds_means,
        post_cum_preds_lower,
        post_cum_preds_upper,
        point_effects_means,
        point_effects_lower,
        point_effects_upper,
        post_cum_effects_means,
        post_cum_effects_lower,
        post_cum_effects_upper
    ], axis=1)
    inferences.columns = [
        'complete_preds_means',
        'complete_preds_lower',
        'complete_preds_upper',
        'post_preds_means',
        'post_preds_lower',
        'post_preds_upper',
        'post_cum_y',
        'post_cum_preds_means',
        'post_cum_preds_lower',
        'post_cum_preds_upper',
        'point_effects_means',
        'point_effects_lower',
        'point_effects_upper',
        'post_cum_effects_means',
        'post_cum_effects_lower',
        'post_cum_effects_upper'
    ]
    return inferences

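# The per-step interval arithmetic used above, in isolation: bounds are mean -/+
# z * std in standardized space, then mapped back as value * sig + mu when `mu_sig`
# is given. A numeric sketch with alpha=0.05; `scipy.stats.norm.ppf` plays the role
# of `get_z_score` here and the numbers are illustrative only.
from scipy import stats

alpha = 0.05
z = stats.norm.ppf(1 - alpha / 2)      # ~1.96
mu, sig = 1., 2.
mean, std = 3.0, 1.5
lower = (mean - z * std) * sig + mu    # ~1.12
upper = (mean + z * std) * sig + mu    # ~12.88
assert lower < mean * sig + mu < upper
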
def _compile_posterior_inferences(self):
    """
    Runs the posterior causal impact inference computation using the already
    trained model.

    Args
    ----
      self:
        trained_model: `UnobservedComponentsResultsWrapper`.
        pre_data: pandas DataFrame.
        post_data: pandas DataFrame.
        alpha: float.
        mu_sig: tuple.
            First value is the mean used for standardization and the second value
            is the standard deviation.
    """
    lower, upper = self.lower_upper_percentile
    exog = self.post_data if self.mu_sig is None else self.normed_post_data
    zero_series = pd.Series([0])
    # We do exactly as in statsmodels for past predictions:
    # https://github.com/statsmodels/statsmodels/blob/v0.9.0/statsmodels/tsa/statespace/structural.py
    predict = self.trained_model.filter_results.forecasts[0]
    std_errors = np.sqrt(
        self.trained_model.filter_results.forecasts_error_cov[0, 0])
    critical_value = get_z_score(1 - self.alpha / 2.)
    pre_preds_lower = pd.Series(
        self._unstardardize(predict - critical_value * std_errors),
        index=self.pre_data.index)
    pre_preds_upper = pd.Series(
        self._unstardardize(predict + critical_value * std_errors),
        index=self.pre_data.index)
    post_predictor = self.trained_model.get_forecast(
        steps=len(self.post_data),
        exog=exog.iloc[:, 1:],
        alpha=self.alpha)
    pre_preds = pd.Series(self._unstardardize(predict),
                          index=self.pre_data.index)
    post_preds = self._unstardardize(post_predictor.predicted_mean)
    # Sets index properly.
    post_preds.index = self.post_data.index
    # Confidence intervals.
    post_ci = self._unstardardize(post_predictor.conf_int(alpha=self.alpha))
    post_preds_lower = post_ci.iloc[:, 0]
    post_preds_upper = post_ci.iloc[:, 1]
    # Sets index properly.
    post_preds_lower.index = self.post_data.index
    post_preds_upper.index = self.post_data.index
    # Concatenations.
    preds = pd.concat([pre_preds, post_preds])
    preds_lower = pd.concat([pre_preds_lower, post_preds_lower])
    preds_upper = pd.concat([pre_preds_upper, post_preds_upper])
    # Cumulative analysis.
    post_cum_y = np.cumsum(self.post_data.iloc[:, 0])
    post_cum_y = pd.concat([zero_series, post_cum_y], axis=0)
    post_cum_y.index = self.get_cum_index()
    post_cum_pred = np.cumsum(post_preds)
    post_cum_pred = pd.concat([zero_series, post_cum_pred])
    post_cum_pred.index = self.get_cum_index()
    post_cum_pred_lower, post_cum_pred_upper = np.percentile(
        np.cumsum(self.simulated_y, axis=1), [lower, upper], axis=0)
    # Sets index properly.
    post_cum_pred_lower = pd.Series(
        np.concatenate([[0], post_cum_pred_lower]),
        index=self.get_cum_index())
    post_cum_pred_upper = pd.Series(
        np.concatenate([[0], post_cum_pred_upper]),
        index=self.get_cum_index())
    # Using a net value of data to accommodate cases where there are gaps between
    # the pre and post intervention periods.
    net_data = pd.concat([self.pre_data, self.post_data])
    # Effects analysis.
    point_effects = net_data.iloc[:, 0] - preds
    point_effects_lower = net_data.iloc[:, 0] - preds_upper
    point_effects_upper = net_data.iloc[:, 0] - preds_lower
    post_point_effects = self.post_data.iloc[:, 0] - post_preds
    # Cumulative effects analysis.
    post_cum_effects = np.cumsum(post_point_effects)
    post_cum_effects = pd.concat([zero_series, post_cum_effects])
    post_cum_effects.index = self.get_cum_index()
    post_cum_effects_lower, post_cum_effects_upper = np.percentile(
        np.cumsum(self.post_data.iloc[:, 0].values - self.simulated_y, axis=1),
        [lower, upper],
        axis=0)
    # Sets index properly.
    post_cum_effects_lower = pd.Series(
        np.concatenate([[0], post_cum_effects_lower]),
        index=self.get_cum_index())
    post_cum_effects_upper = pd.Series(
        np.concatenate([[0], post_cum_effects_upper]),
        index=self.get_cum_index())

    self.inferences = pd.concat([
        post_cum_y,
        preds,
        post_preds,
        post_preds_lower,
        post_preds_upper,
        preds_lower,
        preds_upper,
        post_cum_pred,
        post_cum_pred_lower,
        post_cum_pred_upper,
        point_effects,
        point_effects_lower,
        point_effects_upper,
        post_cum_effects,
        post_cum_effects_lower,
        post_cum_effects_upper
    ], axis=1)
    self.inferences.columns = [
        'post_cum_y',
        'preds',
        'post_preds',
        'post_preds_lower',
        'post_preds_upper',
        'preds_lower',
        'preds_upper',
        'post_cum_pred',
        'post_cum_pred_lower',
        'post_cum_pred_upper',
        'point_effects',
        'point_effects_lower',
        'point_effects_upper',
        'post_cum_effects',
        'post_cum_effects_lower',
        'post_cum_effects_upper'
    ]

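# The effect-bound bookkeeping above, in isolation: point effects are observed minus
# predicted, and the bounds swap because subtracting the *upper* prediction bound
# gives the *lower* effect bound (and vice versa). Toy numbers, not library output.
import pandas as pd

y = pd.Series([10., 12.])
preds = pd.Series([9., 9.5])
preds_lower = pd.Series([8., 8.5])
preds_upper = pd.Series([10., 10.5])
point_effects = y - preds                # [1.0, 2.5]
point_effects_lower = y - preds_upper    # [0.0, 1.5]
point_effects_upper = y - preds_lower    # [2.0, 3.5]
assert (point_effects_lower <= point_effects).all()
assert (point_effects <= point_effects_upper).all()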