def boom(self):
    """Return the boom holiday object for this holiday.

    If the instance already carries a ``_boom_holiday`` attribute, that
    object is returned unchanged.  Otherwise a ``boom.DateRangeHoliday`` is
    built from ``self._start`` and ``self._end``, with every entry converted
    by ``R.to_boom_date``.  The newly built object is not stored on the
    instance.
    """
    try:
        return self._boom_holiday
    except AttributeError:
        pass

    first_days = list(map(R.to_boom_date, self._start))
    last_days = list(map(R.to_boom_date, self._end))
    return boom.DateRangeHoliday(first_days, last_days)
def plot_state(self, burn=None, time=None, show_actuals=True, style=None,
               scale=None, ylim=None, ax=None, **kwargs):
    """Plot the distribution of the summed state contributions over time.

    Args:
      burn: Number of MCMC iterations to treat as burn-in.  If None,
        self.suggest_burn() is called to choose one.
      time: Time stamps for the horizontal axis.  If None, the index of
        self.original_series is used.
      show_actuals: Accepted for interface compatibility.
      style: One of "dynamic" or "boxplot" (resolved through
        R.unique_match).  None means "dynamic".
      scale: One of "linear" or "mean" (resolved through R.unique_match).
        None means "linear".
      ylim: Passed through to R.plot_dynamic_distribution.
      ax: Passed through to R.plot_dynamic_distribution.
      **kwargs: Passed through to R.plot_dynamic_distribution.

    NOTE(review): 'burn', 'style', 'scale' and 'show_actuals' are resolved
    here but do not influence what is drawn.  Confirm whether that is
    intentional.
    """
    style = R.unique_match("dynamic" if style is None else style,
                           ["dynamic", "boxplot"])
    scale = R.unique_match("linear" if scale is None else scale,
                           ["linear", "mean"])

    num_draws = self._niter
    if burn is None:
        burn = self.suggest_burn()
    if time is None:
        time = self.original_series.index

    # Accumulate the contribution of every state model into one array with
    # one row per draw and one column per time point.
    total_contribution = np.zeros((num_draws, len(time)))
    for component in self._state_models:
        total_contribution += component.state_contribution

    R.plot_dynamic_distribution(
        curves=total_contribution,
        timestamps=time,
        ax=ax,
        ylim=ylim,
        **kwargs)
def test_numerics(self):
    """R.is_all_numeric accepts an all-numeric frame and rejects one that
    has a text column added."""
    all_numbers = pd.DataFrame(np.random.randn(10, 3))
    self.assertTrue(R.is_all_numeric(all_numbers))

    with_text = all_numbers.copy()
    with_text["text"] = "foo"
    self.assertFalse(R.is_all_numeric(with_text))
def _set_posterior_sampler(self, y, level_sigma_prior, slope_sigma_prior,
                           sdy):
    """
    A utility called by the constructor.  See the __init__ method for
    argument documentation.

    Fills in default priors for any that were passed as None, checks that
    both priors are R.SdPrior objects, and installs a posterior sampler on
    self._state_model.

    Args:
      y: The data, passed to self._compute_sdy when a default is needed.
      level_sigma_prior: R.SdPrior for the level innovation SD, or None.
      slope_sigma_prior: R.SdPrior for the slope innovation SD, or None.
      sdy: Scale of the data, passed to self._compute_sdy when a default is
        needed.

    Raises:
      Exception: If either prior is not an R.SdPrior.
    """
    if level_sigma_prior is None:
        sdy = self._compute_sdy(sdy, y, "level_sigma_prior")
        level_sigma_prior = R.SdPrior(sigma_guess=.01 * sdy,
                                      upper_limit=sdy)
    if not isinstance(level_sigma_prior, R.SdPrior):
        raise Exception("Unexpected type for level_sigma_prior.")

    if slope_sigma_prior is None:
        sdy = self._compute_sdy(sdy, y, "slope_sigma_prior")
        # Use the same .01 * sdy default guess as the level prior above, so
        # the two default priors agree.
        slope_sigma_prior = R.SdPrior(sigma_guess=.01 * sdy,
                                      upper_limit=sdy)
    if not isinstance(slope_sigma_prior, R.SdPrior):
        raise Exception("Unexpected type for slope_sigma_prior.")

    self._state_model.set_posterior_sampler(
        level_sigma_prior.create_chisq_model(),
        level_sigma_prior.upper_limit,
        slope_sigma_prior.create_chisq_model(),
        slope_sigma_prior.upper_limit,
        boom.GlobalRng.rng)
def _validate_priors(self, level_sigma_prior, level_nu_prior,
                     slope_sigma_prior, slope_nu_prior, y, sdy):
    """Supply defaults for missing priors, type check them, and store them.

    Args:
      level_sigma_prior: R.SdPrior or None.  The default is
        R.SdPrior(.01 * sdy, upper_limit=sdy).
      level_nu_prior: R.DoubleModel or None.  The default is
        R.UniformPrior(0.1, 100).
      slope_sigma_prior: R.SdPrior or None.  Same default as the level.
      slope_nu_prior: R.DoubleModel or None.  Same default as the level.
      y: The data, passed to self._compute_sdy when a sigma default is
        needed.
      sdy: Scale of the data, passed to self._compute_sdy when a sigma
        default is needed.

    Effects:
      On success the four priors are stored as self._level_sigma_prior,
      self._slope_sigma_prior, self._level_nu_prior and
      self._slope_nu_prior.

    Raises:
      Exception: If any prior has an unexpected type.
    """
    # Innovation standard deviation priors.
    if level_sigma_prior is None:
        sdy = self._compute_sdy(sdy, y, "level_sigma_prior")
        level_sigma_prior = R.SdPrior(sigma_guess=.01 * sdy, upper_limit=sdy)
    if not isinstance(level_sigma_prior, R.SdPrior):
        raise Exception("Unexpected type for level_sigma_prior.")

    if slope_sigma_prior is None:
        sdy = self._compute_sdy(sdy, y, "slope_sigma_prior")
        slope_sigma_prior = R.SdPrior(sigma_guess=.01 * sdy, upper_limit=sdy)
    if not isinstance(slope_sigma_prior, R.SdPrior):
        raise Exception("Unexpected type for slope_sigma_prior.")

    # 'nu' priors.
    level_nu_prior = (R.UniformPrior(0.1, 100)
                      if level_nu_prior is None else level_nu_prior)
    if not isinstance(level_nu_prior, R.DoubleModel):
        raise Exception("Unexpected type for level_nu_prior.")

    slope_nu_prior = (R.UniformPrior(0.1, 100)
                      if slope_nu_prior is None else slope_nu_prior)
    if not isinstance(slope_nu_prior, R.DoubleModel):
        raise Exception("Unexpected type for slope_nu_prior.")

    # Nothing is stored unless every check above passed.
    self._level_sigma_prior = level_sigma_prior
    self._slope_sigma_prior = slope_sigma_prior
    self._level_nu_prior = level_nu_prior
    self._slope_nu_prior = slope_nu_prior
def _verify_initial_state_prior(self, initial_state_prior, xtx, xty, sdy):
    """Normalize the initial state prior to an R.MvnPrior and store it.

    Args:
      initial_state_prior: One of
        * None: a default is built from the least squares solution of
          xtx @ beta = xty, falling back to a zero mean prior with a
          diagonal variance if that fails.
        * R.NormalPrior: the same mean and variance is used for each of the
          self.xdim coordinates.
        * list of R.NormalPrior: one independent prior per coordinate.
        * R.MvnPrior: used as is.
      xtx: The cross product matrix of the predictors.
      xty: The cross product of the predictors with the response.
      sdy: Scale factor; sdy**2 multiplies the default variance matrices.

    Returns:
      The R.MvnPrior stored in self._initial_state_prior.

    Raises:
      Exception: If initial_state_prior is of an unrecognized type.
    """
    if initial_state_prior is None:
        try:
            beta_hat = np.linalg.solve(xtx, xty)
            # np.isfinite (np.finite does not exist) guards against a
            # numerically useless least squares solution.
            if not np.all(np.isfinite(beta_hat)):
                raise Exception("Least squares initializer failed.")
            self._initial_state_prior = R.MvnPrior(
                beta_hat, sdy * sdy * np.linalg.inv(xtx))
        except Exception:
            # Deliberate best-effort fallback: any failure above (singular
            # xtx, non-finite solution, prior construction error) yields a
            # zero mean prior with a diagonal variance.
            self._initial_state_prior = R.MvnPrior(
                np.zeros(self.xdim),
                sdy * sdy * np.diag(1.0 / np.diagonal(xtx)))
    elif isinstance(initial_state_prior, R.NormalPrior):
        # np.full takes (shape, fill_value), in that order.
        mean = np.full(self.xdim, initial_state_prior.mean)
        var = np.full(self.xdim, initial_state_prior.sd**2)
        self._initial_state_prior = R.MvnPrior(mean, np.diag(var))
    elif isinstance(initial_state_prior, list) and all(
            isinstance(x, R.NormalPrior) for x in initial_state_prior):
        mean = np.array([x.mean for x in initial_state_prior])
        var = np.array([x.sd**2 for x in initial_state_prior])
        self._initial_state_prior = R.MvnPrior(mean, np.diag(var))
    else:
        if not isinstance(initial_state_prior, R.MvnPrior):
            raise Exception("Unrecognized type for initial_state_prior.")
        self._initial_state_prior = initial_state_prior
    return self._initial_state_prior
def plot_single_coefficient(self, beta, ylim=None, ax=None,
                            highlight_median="green"):
    """
    Plot the dynamic distribution of a single model coefficient.

    Args:
      beta: The coefficient to be plotted.  A matrix.  Rows are Monte Carlo
        draws, and columns are time points.
      ylim: A pair of numbers giving the lower and upper limits of the Y
        axis.  If 'None' then 'ylim' will be inferred from the range of the
        data.
      ax: A plt.Axes object on which to draw.  If None then a new
        plt.Figure and Axes will be created and drawn on function exit.
      highlight_median: The name of a color used to draw the median of the
        curves at each time point.  The empty string signals not to add the
        extra highlighting.

    Returns:
      The axes object containing the plot.
    """
    # Only a figure created here is shown on exit; a caller-supplied axes
    # is left for the caller to display.
    new_figure = None
    if ax is None:
        new_figure, ax = plt.subplots(1, 1)

    R.plot_dynamic_distribution(
        beta,
        timestamps=self._unique_timestamps,
        ax=ax,
        ylim=ylim,
        highlight_median=highlight_median)

    if new_figure is not None:
        new_figure.show()
    return ax
def create_base_dataset(self, sample_size, num_numeric, num_cat, num_levels):
    """Simulate a mixed numeric/categorical data set and store it on self.

    Args:
      sample_size: The number of rows to simulate.
      num_numeric: The number of numeric columns (named X1, X2, ...).
      num_cat: The number of categorical columns (named cat1, cat2, ...).
      num_levels: The number of levels for each categorical column.

    Effects:
      Sets self._beta (coefficients), self._Sigma (residual variance),
      self._data (the simulated data frame), and the dimensions
      self._ydim, self._xdim and self._ncat.
    """
    # presumably one intercept plus (num_levels - 1) effect columns per
    # categorical variable -- verify against boom.DatasetEncoder.
    xdim = 1 + num_cat * (num_levels - 1)
    ydim = num_numeric

    categorical = {}
    levels_by_name = {}
    effect_encoders = []
    for position in range(num_cat):
        # The levels this particular variable can assume.
        local_levels = random_words(num_levels)
        name = f"cat{position + 1}"
        categorical[name] = np.random.choice(local_levels, sample_size)
        levels_by_name[name] = local_levels
        effect_encoders.append(boom.EffectsEncoder(position, local_levels))

    self._beta = np.random.randn(xdim, ydim)

    # Residual variance: a random correlation matrix scaled on both sides
    # by a diagonal matrix of gamma draws.
    correlation = boom.random_correlation_matrix(ydim).to_numpy()
    scale = np.diag(R.rgamma(ydim, 1, 1))
    self._Sigma = scale @ correlation @ scale
    cholesky_root = np.linalg.cholesky(self._Sigma)
    noise = (cholesky_root @ np.random.randn(ydim, sample_size)).T

    dataset_encoder = boom.DatasetEncoder(effect_encoders)
    design = dataset_encoder.encode_dataset(
        R.to_data_table(pd.DataFrame(categorical)))
    fitted = design.to_numpy() @ self._beta

    numeric_names = ["X" + str(i + 1) for i in range(ydim)]
    self._data = pd.DataFrame(fitted + noise, columns=numeric_names)
    for name, column in categorical.items():
        self._data[name] = column

    self._ydim = ydim
    self._xdim = xdim
    self._ncat = num_cat
def test_data_table(self):
    """Round trip self._data through R.to_data_table / R.to_data_frame and
    check the dimensions and the first five columns."""
    table = R.to_data_table(self._data)
    expected_rows, expected_cols = self._data.shape
    self.assertEqual(table.nrow, expected_rows)
    self.assertEqual(table.ncol, expected_cols)

    restored = R.to_data_frame(table)
    for column in range(5):
        original_values = self._data.iloc[:, column]
        restored_values = restored.iloc[:, column]
        self.assertTrue(np.all(original_values == restored_values))
def test_conversions(self):
    """R.to_boom_vector returns a boom.Vector for both a list of ints and
    an integer pandas Series."""
    values = [1, 2, 3]
    from_list = R.to_boom_vector(values)
    self.assertIsInstance(from_list, boom.Vector)

    series = pd.Series(values, dtype="int")
    from_series = R.to_boom_vector(series)
    self.assertIsInstance(from_series, boom.Vector)
def plot_residual_sd(self, burn: int = None, type: str = "density",
                     ax=None, **kwargs):
    """Plot the MCMC draws of the residual standard deviation.

    Args:
      burn: The number of MCMC iterations to discard as burn-in.  "None"
        indicates that self.suggest_burn() should choose the number.
        Negative values are treated as zero.
      type: The type of plot.  "density" shows a kernel density estimate of
        the residual SD draws.  "ts" shows a time series plot of the draws.
      ax: A plt.Axes object on which to draw the plot.  If None new Figure
        and Axes objects are created and drawn on function exit.
      kwargs: Further keyword arguments are ignored.

    Effects:
      A plot is added to the relevant Axes object.

    Returns:
      The Axes object on which the plot is drawn.
    """
    type = R.unique_match(type, ["density", "ts"])

    if burn is None:
        burn = self.suggest_burn()
    if burn < 0:
        burn = 0
    kept_draws = self._residual_sd_draws[burn:]

    # A figure is only created (and later shown) when no axes was supplied.
    new_figure = None
    if ax is None:
        new_figure, ax = plt.subplots(1, 1)

    if type == "density":
        R.Density(kept_draws).plot(
            ax=ax, xlab="Residual SD", ylab="Density")
    elif type == "ts":
        # Keep the original iteration numbers on the horizontal axis.
        iteration = np.arange(len(self._residual_sd_draws))[burn:]
        ax.plot(iteration, kept_draws)
        ax.set_xlabel("Iteration")
        ax.set_ylabel("Residual SD")

    if new_figure is not None:
        new_figure.show()
    return ax
def _verify_prior(self, sigma_prior, sdy, sdx):
    """Normalize 'sigma_prior' to a list-like of R.SdPrior and store it.

    Args:
      sigma_prior: One of
        * None: a default R.SdPrior(.01 * sdy / sdx[i], 1) is created for
          each element of 'sdx'.
        * R.SdPrior: the same prior is repeated len(sdx) times.
        * A list-like of R.SdPrior objects: used as supplied.
      sdy: Scale of the response, used for the default priors.
      sdx: Iterable of predictor scales, one per coefficient.

    Returns:
      The validated priors, which are also stored in self._sigma_prior.

    Raises:
      Exception: If the result is not an iterable of R.SdPrior objects.
    """
    if sigma_prior is None:
        priors = [R.SdPrior(.01 * sdy / sdxi, 1) for sdxi in sdx]
    elif isinstance(sigma_prior, R.SdPrior):
        priors = [sigma_prior] * len(sdx)
    else:
        # A caller-supplied collection is validated below and then stored.
        priors = sigma_prior

    # The parentheses matter: the error must fire when the priors are not
    # iterable OR when any element has the wrong type.  The element check
    # only runs if the iterable check passes.
    if not (R.is_iterable(priors)
            and all(isinstance(x, R.SdPrior) for x in priors)):
        raise Exception(
            "sigma_prior must be a list-like of R.SdPrior objects.")

    self._sigma_prior = priors
    return self._sigma_prior
def test_paste(self):
    """Check R.paste with a scalar and a list, two lists, a pandas Series,
    a custom separator, and the 'collapse' option."""
    cases = [
        (("X", [1, 2, 3]), {}, ["X 1", "X 2", "X 3"]),
        (("X", [1, 2, 3]), {"sep": ""}, ["X1", "X2", "X3"]),
        (([1, 2, "X"], [4, 5, 6]), {}, ["1 4", "2 5", "X 6"]),
        (("X", pd.Series([1, 2, 3])), {"sep": ""}, ["X1", "X2", "X3"]),
        (("X", [1, 2, 3]), {"sep": "", "collapse": " "}, "X1 X2 X3"),
    ]
    for args, options, expected in cases:
        self.assertEqual(R.paste(*args, **options), expected)
def create_model(self, prior: R.SdPrior, data: pd.Series):
    """Build the boom state space model and attach its posterior samplers.

    Args:
      prior: an R.SdPrior object describing the prior distribution on the
        residual variance parameter.  If None, a default prior is built
        from the standard deviation of 'data' (sigma_guess = sdy,
        upper_limit = 1.2 * sdy).
      data: The time series of observations as a Pandas Series.  Entries
        flagged by data.isna() are marked as unobserved.

    Effects:
      Sets self._model and self._original_series.

    Returns:
      A boom.StateSpaceModel object.
    """
    observations = boom.Vector(data.values)
    observed_flags = ~data.isna()
    self._model = boom.StateSpaceModel(observations, observed_flags)

    if prior is None:
        sdy = np.std(data)
        prior = R.SdPrior(sigma_guess=sdy, upper_limit=sdy * 1.2)

    # Sampler for the observation (residual variance) model.
    chisq_prior = boom.ChisqModel(prior.sample_size, prior.sigma_guess)
    observation_sampler = boom.ZeroMeanGaussianConjSampler(
        self._model.observation_model, chisq_prior)
    observation_sampler.set_sigma_upper_limit(prior.upper_limit)
    self._model.observation_model.set_method(observation_sampler)

    # Sampler for the state space model as a whole.
    model_sampler = boom.StateSpacePosteriorSampler(
        self._model, boom.GlobalRng.rng)
    self._model.set_method(model_sampler)

    self._original_series = data
    return self._model
def _validate_slope_mean_prior(slope_mean_prior, sdy):
    """Return a valid prior for the slope mean.

    Args:
      slope_mean_prior: An R.NormalPrior, or None to request the default
        R.NormalPrior(0, sdy).
      sdy: Scale of the data, used for the default prior.

    Returns:
      The R.NormalPrior to use.

    Raises:
      Exception: If slope_mean_prior is neither None nor an R.NormalPrior.
    """
    prior = (R.NormalPrior(0, sdy)
             if slope_mean_prior is None else slope_mean_prior)
    if isinstance(prior, R.NormalPrior):
        return prior
    raise Exception("Wrong type passed for slope_mean_prior.  "
                    "Expected an R.NormalPrior")
def _validate_slope_ar1_prior(slope_ar1_prior, sdy):
    """Return a valid prior for the slope AR1 coefficient.

    Args:
      slope_ar1_prior: An R.Ar1CoefficientPrior, or None to request a
        default-constructed R.Ar1CoefficientPrior().
      sdy: Not used by this function.

    Returns:
      The R.Ar1CoefficientPrior to use.

    Raises:
      Exception: If slope_ar1_prior is neither None nor an
        R.Ar1CoefficientPrior.
    """
    prior = (R.Ar1CoefficientPrior()
             if slope_ar1_prior is None else slope_ar1_prior)
    if isinstance(prior, R.Ar1CoefficientPrior):
        return prior
    raise Exception("Wrong type passed for slope_ar1_prior.  "
                    "Expected an R.Ar1CoefficientPrior")
def test_draw_inclusion_indicators(self):
    """
    Check that the model draws the inclusion indicators conditional on all
    other unknowns fixed at their true values.  The regression coefficients
    are integrated out and not conditioned on.

    The test passes when the post burn-in mean of the drawn indicators has
    correlation greater than .6 with the simulated indicators.
    """
    # Make the coefficients big, so that effects are obvious.
    innovation_sd = np.array([10, 20, 30])
    data, coefficients, true_inclusion = self.simulate_data_from_model(
        time_dimension=100,
        typical_sample_size=500,
        xdim=self._xdim,
        residual_sd=self._residual_sd,
        unscaled_innovation_sd=innovation_sd,
        p00=self._p00,
        p11=self._p11)

    model, sampler = self.setup_model(
        data, coefficients, true_inclusion, self._residual_sd,
        innovation_sd, self._p00, self._p11)

    niter = 1000
    draws = np.full((niter, model.xdim, model.time_dimension), -1)
    for iteration in range(niter):
        sampler.draw_inclusion_indicators()
        draws[iteration, :, :] = model.inclusion_indicators.to_numpy()

    # Discard the first 100 draws before averaging.
    estimated = np.mean(draws[100:, :], axis=0).flatten()
    truth = true_inclusion.flatten()
    self.assertGreater(R.corr(truth, estimated), .6)
def test_encode_dataset(self):
    """An R.EffectEncoder with levels Red/Blue/Green encodes a three-row
    frame as [1, 0], [0, 1], [-1, -1]."""
    frame = pd.DataFrame(np.random.randn(3, 2), columns=["X1", "X2"])
    frame["Color"] = ["Red", "Blue", "Green"]

    color_encoder = R.EffectEncoder("Color", ["Red", "Blue", "Green"])
    encoded = color_encoder.encode_dataset(frame)

    expected = np.array([
        [1.0, 0.0],
        [0.0, 1.0],
        [-1.0, -1.0],
    ])
    self.assertTrue(np.allclose(encoded, expected))
def _validate_coefficient_innovation_priors(self): """ Ensure that self._coefficient_innovation_priors are a list of SdPriors. """ if (isinstance(self._coefficient_innovation_priors, list) and np.all([ isinstance(x, R.SdPrior) for x in self._coefficient_innovation_priors ])): return if isinstance(self._coefficient_innovation_priors, R.SdPrior): self._coefficient_innovation_priors = [ self._coefficient_innovation_priors ] * self.xdim return if self._coefficient_innovation_priors is not None: raise Exception("coefficient_innovation_priors must either be an " "R.SdPrior or a list of such priors.") sdy = self._response_suf.sample_sd self._coefficient_innovation_priors = [ R.SdPrior(.01 * sdy / self._predictor_suf[i].sample_sd, 1) for i in range(self.xdim) ]
def _validate_slope_sigma_prior(slope_sigma_prior, sdy):
    """Return a valid prior for the slope innovation standard deviation.

    Args:
      slope_sigma_prior: An R.SdPrior, or None to request the default
        R.SdPrior(.01 * sdy, upper_limit=sdy).
      sdy: Scale of the data, used for the default prior.

    Returns:
      The R.SdPrior to use.

    Raises:
      Exception: If slope_sigma_prior is neither None nor an R.SdPrior.
    """
    prior = (R.SdPrior(.01 * sdy, upper_limit=sdy)
             if slope_sigma_prior is None else slope_sigma_prior)
    if isinstance(prior, R.SdPrior):
        return prior
    raise Exception("Wrong type passed for slope_sigma_prior.  "
                    "Expected an R.SdPrior")
def test_draw_coefficients(self):
    """Check that coefficients drawn given the true inclusion indicators
    track the simulated coefficients.

    The test passes when the mean of the draws has correlation greater
    than .9 with the simulated coefficients.
    """
    # Make the coefficients big, so that effects are obvious.
    unscaled_innovation_sd = np.array([10, 20, 30])
    data, coefficients, inclusion = self.simulate_data_from_model(
        time_dimension=100,
        typical_sample_size=500,
        xdim=self._xdim,
        residual_sd=self._residual_sd,
        unscaled_innovation_sd=unscaled_innovation_sd,
        p00=self._p00,
        p11=self._p11)

    model, _ = self.setup_model(
        data, coefficients, inclusion, self._residual_sd,
        unscaled_innovation_sd, self._p00, self._p11)

    niter = 1000
    # np.nan is the portable spelling; the np.NaN alias was removed in
    # NumPy 2.0.
    draws = np.full((niter, model.xdim, model.time_dimension), np.nan)
    for i in range(niter):
        model.draw_coefficients_given_inclusion(boom.GlobalRng.rng)
        draws[i, :, :] = model.all_coefficients.to_numpy()

    posterior_mean = np.mean(draws, axis=0)
    mean_vector = posterior_mean.flatten()
    beta_vector = coefficients.flatten()
    cor = R.corr(mean_vector, beta_vector)
    self.assertGreater(cor, .9)
def _default_initial_state_prior(self, sdy):
    """
    The default prior to use for the initial state vector.

    Args:
      sdy: Scale of the data.  It is converted to float and used for every
        diagonal element of the matrix passed to R.MvnPrior.

    Returns:
      An R.MvnPrior of dimension self.nseasons - 1 with a zero mean vector.
    """
    dim = self.nseasons - 1
    mean = np.zeros(dim)
    # NOTE(review): the diagonal entries are sdy itself, not sdy**2.
    # Confirm whether R.MvnPrior expects a variance matrix here.
    diagonal = np.full(dim, float(sdy))
    return R.MvnPrior(mean, np.diag(diagonal))
def plot_inclusion(self, burn=None, inclusion_threshold=0, unit_scale=True,
                   number_of_variables=None, ax=None, **kwargs):
    """A barplot showing the marginal inclusion probability of each
    variable.

    Args:
      burn: Passed to self.inclusion_probs and
        self.coefficient_positive_probability.
      inclusion_threshold: When 'number_of_variables' is None, the number
        of bars is the count of variables whose inclusion probability is at
        least this large.
      unit_scale: Not used by this method.
      number_of_variables: The number of bars to show, or None to derive it
        from 'inclusion_threshold'.
      ax: Passed to R.barplot.
      **kwargs: Passed to R.barplot.

    Returns:
      The return value of R.barplot.
    """
    inc = self.inclusion_probs(burn=burn)
    pos = self.coefficient_positive_probability(burn=burn)
    # Each bar's color is the string form of the probability that the
    # coefficient is positive.
    colors = np.array(list(map(str, pos)))

    # Largest inclusion probability first.
    order = np.argsort(inc.values)[::-1]
    if number_of_variables is None:
        number_of_variables = np.sum(inc >= inclusion_threshold)
    keep = order[:number_of_variables]

    inc = inc[keep]
    pos = pos[keep]
    colors = colors[keep]

    return R.barplot(
        inc,
        ax=ax,
        color=colors[::-1],
        linewidth=.25,
        edgecolor="black",
        xlab="Marginal Inclusion Probability",
        ylab="Variable",
        **kwargs)
def plot(self, what=None, **kwargs):
    """Plot an aspect of the model.

    Args:
      what: The type of plot desired.  Acceptable choices are "inclusion",
        "coefficients", "residual", and "predicted".  The value is resolved
        through R.unique_match.  None selects "inclusion".
      kwargs: Extra arguments are passed to the specific plot function
        being called.

    Returns:
      Whatever the selected plot method returns.

    Raises:
      Exception: If the resolved plot type has no plotting method.
    """
    plot_types = ["inclusion", "coefficients", "residual", "predicted"]
    if what is None:
        what = plot_types[0]
    what = R.unique_match(what, plot_types)

    # Method names are looked up lazily so that only the selected plotting
    # method is ever accessed.
    method_names = {
        "coefficients": "plot_coefficients",
        "inclusion": "plot_inclusion",
        "residual": "plot_residual",
        "predicted": "plot_predicted",
    }
    method_name = method_names.get(what)
    if method_name is None:
        raise Exception(f"Unknown plot type {what}.")
    return getattr(self, method_name)(**kwargs)
def _validate_initial_level_prior(initial_level_prior, initial_y, sdy):
    """Return a valid prior for the initial level.

    Args:
      initial_level_prior: An R.NormalPrior, or None to request the default
        R.NormalPrior(initial_y, sdy).
      initial_y: Used as the first argument of the default prior.
      sdy: Used as the second argument of the default prior.

    Returns:
      The R.NormalPrior to use.

    Raises:
      Exception: If initial_level_prior is neither None nor an
        R.NormalPrior.
    """
    prior = (R.NormalPrior(initial_y, sdy)
             if initial_level_prior is None else initial_level_prior)
    if isinstance(prior, R.NormalPrior):
        return prior
    raise Exception("Wrong type for initial_level_prior.  "
                    "Expected an R.NormalPrior.")
def plot_inclusion_probs(coefficients, burn, xnames, inclusion_threshold=0,
                         unit_scale=True, number_of_variables=None, ax=None,
                         **kwargs):
    """A barplot of marginal inclusion probabilities computed from draws.

    Args:
      coefficients: Coefficient draws, indexed as [iteration, variable].
      burn: The number of leading iterations to discard.
      xnames: Not used by this function.
      inclusion_threshold: When 'number_of_variables' is None, the number
        of bars is the count of variables whose inclusion probability is at
        least this large.
      unit_scale: Not used by this function.
      number_of_variables: The number of bars to show, or None to derive it
        from 'inclusion_threshold'.
      ax: Passed to R.barplot.
      **kwargs: Passed to R.barplot.

    Returns:
      The return value of R.barplot.
    """
    kept_draws = coefficients[burn:, :]
    inc = compute_inclusion_probabilities(kept_draws)
    pos = coefficient_positive_probability(kept_draws)
    # Each bar's color is the string form of the probability that the
    # coefficient is positive.
    colors = np.array(list(map(str, pos)))

    # Largest inclusion probability first.
    order = np.argsort(inc.values)[::-1]
    if number_of_variables is None:
        number_of_variables = np.sum(inc >= inclusion_threshold)
    keep = order[:number_of_variables]

    inc = inc[keep]
    pos = pos[keep]
    colors = colors[keep]

    return R.barplot(
        inc,
        ax=ax,
        color=colors[::-1],
        linewidth=.25,
        edgecolor="black",
        xlab="Marginal Inclusion Probability",
        ylab="Variable",
        **kwargs)
def test_lty(self):
    """Draw ten parallel lines, one for each of R.lty(0) .. R.lty(9), to
    check that each value is accepted as a matplotlib line style."""
    x = np.linspace(0, 10)
    fig, ax = plt.subplots()
    for offset in range(10):
        ax.plot(x, x + offset, ls=R.lty(offset))
    # The figure is only displayed when the module level flag asks for it.
    if _show_figs:
        fig.show()
def _build_state_model(self):
    """Create the boom dynamic regression state model and its sampler.

    Effects:
      Sets self._state_model, built from self._predictors.  A posterior
      sampler built from self._sigma_prior is attached to it, and its
      initial state mean and variance are taken from
      self._initial_state_prior.
    """
    predictors = R.to_boom_matrix(self._predictors)
    self._state_model = boom.DynamicRegressionStateModel(predictors)

    sampler = boom.DynamicRegressionIndependentPosteriorSampler(
        self._state_model,
        [prior.boom() for prior in self._sigma_prior])

    # Only positive, finite upper limits are passed on to the sampler.
    for position, prior in enumerate(self._sigma_prior):
        limit_is_finite = np.isfinite(prior.upper_limit)
        if prior.upper_limit > 0 and limit_is_finite:
            sampler.set_sigma_max(position, prior.upper_limit)
    self._state_model.set_method(sampler)

    initial_prior = self._initial_state_prior
    self._state_model.set_initial_state_mean(
        R.to_boom_vector(initial_prior.mean))
    self._state_model.set_initial_state_variance(
        R.to_boom_spd(initial_prior.Sigma))
def test_encoding(self):
    """Encode a two-column data set with an effect encoder, an identity
    encoder, and their interaction.  Check the shape of the result and
    that column 2 reproduces the 'Height' column."""
    color = R.EffectEncoder("Color", ["Red", "Blue"])
    height = R.IdentityEncoder("Height")
    color_by_height = R.InteractionEncoder(color, height)
    dataset_encoder = R.DatasetEncoder([color, height, color_by_height])

    sample_size = 1000
    frame = pd.DataFrame({
        "Height": np.random.randn(sample_size),
        "Color": np.random.choice(["Red", "Blue"], sample_size),
    })

    encoded = dataset_encoder.encode_dataset(frame)
    self.assertEqual(sample_size, encoded.shape[0])
    self.assertEqual(4, encoded.shape[1])
    self.assertTrue(np.allclose(encoded[:, 2], frame.iloc[:, 0]))
def plot_size(self, ax=None, burn: int = None, **kwargs):
    """Plot the distribution of the number of included predictors.

    The model size is the count of nonzero entries of self._beta_draws
    along axis 1.  It is plotted against self._unique_timestamps.

    Args:
      ax: A plt.Axes object on which to draw.  If None, a new Figure and
        Axes are created and the figure is shown on function exit.
      burn: Not used by this method.
      **kwargs: Not used by this method.

    Returns:
      The Axes object on which the plot is drawn.
    """
    # Only a figure created here is shown on exit.
    new_figure = None
    if ax is None:
        new_figure, ax = plt.subplots(1, 1)

    model_size = np.sum(self._beta_draws != 0, axis=1)
    R.plot_dynamic_distribution(
        model_size,
        timestamps=self._unique_timestamps,
        ax=ax,
        xlab="Time",
        ylab="Number Included Predictors",
    )

    if new_figure is not None:
        new_figure.show()
    return ax