Esempio n. 1
0
    def _set_prior(self):
        """
        Install the prior distributions on the imputation model.

        The regression prior and the mixing distribution prior are set by
        self._set_regression_prior and self._set_mixing_distribution_prior.
        After that an atom prior is set for each numeric column and a level
        prior is set for each categorical column.  A column with no entry in
        self._atom_prior (or self._level_prior) is given the corresponding
        default, which is stored back on the object before it is passed to
        the model.

        Raises:
          Exception: If the model has not been created yet.
        """
        if self._model is None:
            raise Exception(
                "_set_prior was called before the model was created.")

        self._set_regression_prior()
        self._set_mixing_distribution_prior()

        # Numeric columns are identified to the model by their position.
        for position, name in enumerate(self._numeric_colnames):
            if name not in self._atom_prior:
                self._atom_prior[name] = self._default_atom_prior(
                    self._atoms[name])
            atom_prior = self._atom_prior[name].astype("float")
            self._model.set_atom_prior(boom.Vector(atom_prior), position)

        # Categorical columns are likewise identified by position.
        for position, name in enumerate(self._categorical_colnames):
            if name not in self._level_prior:
                self._level_prior[name] = self._default_level_prior(
                    self._levels[name])
            level_prior = self._level_prior[name].astype("float")
            self._model.set_level_prior(boom.Vector(level_prior), position)
Esempio n. 2
0
    def _restore_parameters(self, iteration: int):
        """
        Copy the stored draws for one MCMC iteration back into the model.

        Args:
          iteration: Index into the first dimension of the stored draws
            (self.coefficients, self.residual_variance, and the per-variable
            arrays in self.atom_probs, self.atom_error_probs,
            self.level_probs and self.level_observation_probs).
        """
        coefficients = self.coefficients[iteration, :, :]
        self._model.set_coefficients(boom.Matrix(coefficients))

        residual_variance = self.residual_variance[iteration, :, :]
        self._model.set_residual_variance(boom.SpdMatrix(residual_variance))

        for cluster in range(self.nclusters):
            # Numeric variables: atom probabilities and atom error
            # probabilities, addressed by (cluster, column position).
            for position, name in enumerate(self._numeric_colnames):
                probs = self.atom_probs[name][iteration, cluster, :]
                self._model.set_atom_probs(
                    cluster, position, boom.Vector(probs))

                error_probs = self.atom_error_probs[name][
                    iteration, cluster, :, :]
                self._model.set_atom_error_probs(
                    cluster, position, boom.Matrix(error_probs))

            # Categorical variables: level probabilities and level
            # observation probabilities.
            for position, name in enumerate(self._categorical_colnames):
                probs = self.level_probs[name][iteration, cluster, :]
                self._model.set_level_probs(
                    cluster, position, boom.Vector(probs))

                observation_probs = self.level_observation_probs[name][
                    iteration, cluster, :, :]
                self._model.set_level_observation_probs(
                    cluster, position, boom.Matrix(observation_probs))
Esempio n. 3
0
    def create_model(self, prior, data, **kwargs):
        """
        Create the boom state space model and attach its posterior samplers.

        Args:
          prior: The prior for the observation model, or None.  It is passed
            through self._verify_prior, and the verified prior must provide
            a create_sampler method.  When 'data' is None the predictor
            dimension is read from prior._prior_inclusion_probabilities.
          data: The data against which self._formula is evaluated by patsy,
            or None.
          **kwargs: Forwarded to self._verify_prior.  An optional 'trials'
            entry (default 1) is also read here; a single number is repeated
            once per row of the response, and anything else is handed to
            boom.Vector as is.

        Returns:
          The newly created model, which is also stored in self._model.

        Raises:
          Exception: If both 'data' and 'prior' are None.
        """
        if data is None and prior is None:
            raise Exception("At least one of 'data' or 'prior' is needed.")

        if data is None:
            # No data: size the model from the prior alone.
            xdim = len(prior._prior_inclusion_probabilities)
            self._model = boom.StateSpaceLogitModel(xdim)
            response = predictors = trials = None
        else:
            response, predictors = patsy.dmatrices(self._formula, data)
            self.predictor_names = predictors.design_info.term_names
            trials = kwargs.get("trials", 1)
            if isinstance(trials, Number):
                trials = np.full(len(response), trials)
            # NOTE(review): this branch builds a StateSpacePoissonModel while
            # the prior-only branch builds a StateSpaceLogitModel -- confirm
            # that the mix of model types is intended.
            self._model = boom.StateSpacePoissonModel(
                boom.Vector(response),
                boom.Vector(trials),
                boom.Matrix(predictors),
                np.isfinite(response))

        observation_model = self._model.observation_model
        self._prior = self._verify_prior(
            prior, response, predictors, trials, **kwargs)
        observation_model_sampler = self._prior.create_sampler(
            observation_model, assign=True)

        self._model.set_method(
            boom.StateSpacePoissonPosteriorSampler(
                self._model, observation_model_sampler))
        self._original_series = response
        return self._model
Esempio n. 4
0
 def test_parameters(self):
     """Changing the mean parameter object should change the model's mean.

     The handle returned by model.mean_parameter is modified with set(), and
     the model's mu is then compared with the new value.
     """
     model = boom.MvnModel(boom.Vector(np.array([0, 0, 0])), self.Sigma)
     mean_parameter = model.mean_parameter

     replacement = boom.Vector(np.array([3.0, 2.0, 1.0]))
     mean_parameter.set(replacement)

     squared_distance = (model.mu - replacement).normsq()
     self.assertLess(squared_distance, 1e-5)
Esempio n. 5
0
 def _format_imputation_data(self, data):
     """
     Convert a data frame into a list of boom.MvRegData observations.

     Args:
       data: The data frame to convert.  It is encoded with self.encode, and
         its numeric columns (self._numeric_colnames) are extracted with
         data.loc.

     Returns:
       A list with one boom.MvRegData per row of 'data'.  The first argument
       of each is the row's numeric values and the second is the matching
       row of the encoded data, both converted to float.
     """
     encoded = self.encode(data)
     numeric_part = data.loc[:, self._numeric_colnames]
     return [
         boom.MvRegData(
             boom.Vector(
                 numeric_part.iloc[row, :].values.flatten().astype("float")),
             boom.Vector(encoded[row, :].flatten().astype("float")))
         for row in range(data.shape[0])
     ]
Esempio n. 6
0
    def test_vector(self):
        """Check Vector construction, numpy conversion, division and length."""
        constant = boom.Vector(np.full(3, -2.8))
        self.assertEqual(constant.size, 3)
        as_numpy = constant.to_numpy()
        self.assertTrue(
            np.array_equal(as_numpy, np.array([-2.8, -2.8, -2.8])))

        numerator = boom.Vector(np.array([1.0, 2, 3]))
        denominator = boom.Vector(np.array([3.0, 2, 1]))
        ratio = numerator / denominator
        expected_ratio = np.array([1.0 / 3, 1.0, 3.0])
        self.assertTrue(np.array_equal(ratio.to_numpy(), expected_ratio))

        # len() and the 'length' attribute should agree.
        self.assertEqual(len(constant), constant.length)
Esempio n. 7
0
    def stream_data_for_initial_screen(self, x: np.ndarray, y: np.ndarray):
        """
        Pass the data to the underlying C++ model object for the purpose of
        running an initial screen.

        Each (response, predictor row) pair is streamed to the sampler one
        observation at a time, and the responses are then added to
        self._response_suf.

        Args:
          x: Matrix of predictor variables.  If an intercept term is desired it
            should be present in the first column.  Any dummy variables and
            basis expansions (e.g. splines) should already be included.
          y:  The response vector.
        """
        for i, yi in enumerate(y):
            data_point = boom.RegressionData(yi, boom.Vector(x[i, :]))
            self._sampler.stream_data_for_initial_screen(data_point)

        # Add the responses once per call.  Doing this inside the loop above
        # would add the whole response vector again for every observation.
        # As before, nothing is added when there are no observations.
        if len(y) > 0:
            self._response_suf.increment(boom.Vector(y))
Esempio n. 8
0
def check_stochastic_process(draws: np.ndarray,
                             truth: np.ndarray,
                             confidence: float = .95,
                             sd_ratio_threshold: float = .1,
                             control_multiple_comparisons: bool = True):
    """
    Check Monte Carlo draws of a function against its true values.

    This is a thin wrapper that converts its arguments and forwards them to
    boom.check_stochastic_process.

    Args:
      draws: A matrix of Monte Carlo draws to be checked.  Each row is a draw
        and each column is a variable.
      truth: A vector of true values against which draws will be compared.
        Its size must match the number of columns in 'draws'.
      confidence: The confidence associated with the marginal posterior
        intervals used to determine coverage.
      sd_ratio_threshold: One of the testing diagnostics compares the standard
        deviation of the centered draws to the standard deviation of the true
        function.  If that ratio is less than this threshold the diagnostic is
        passed.
      control_multiple_comparisons: Converted to bool and forwarded to the
        underlying boom function.  Presumably controls whether the check
        adjusts for multiple comparisons -- confirm against the boom
        documentation.

    Returns:
      A string containing an error message describing the mode of the failure
      to cover.
    """
    import BayesBoom.boom as boom

    boom_draws = boom.Matrix(draws)
    boom_truth = boom.Vector(truth)
    return boom.check_stochastic_process(
        boom_draws,
        boom_truth,
        float(confidence),
        float(sd_ratio_threshold),
        bool(control_multiple_comparisons))
Esempio n. 9
0
 def test_vector_view(self):
     """Writes made through a VectorView should show up in the Vector."""
     base = boom.Vector(np.array([1.0, 2.0, -3.0]))
     view = boom.VectorView(base)

     view[0] = -0.1
     self.assertEqual(base[0], view[0])

     # Dividing the view in place should halve the second entry (2.0).
     view /= 2.0
     self.assertEqual(base[1], 1.0)
Esempio n. 10
0
 def boom(self):
     """
     Build the boom.MvnModel described by this object's parameters.

     Returns:
       A boom.MvnModel whose mean is built from self._mu and whose variance
       is built from self._Sigma.
     """
     import BayesBoom.boom as boom

     mean = boom.Vector(self._mu)
     variance = boom.SpdMatrix(self._Sigma)
     return boom.MvnModel(mean, variance)
Esempio n. 11
0
    def test_matrix(self):
        """
        Check mean(), var() and cor() on simulated multivariate draws.

        Standard normal draws are multiplied by the Cholesky factor returned
        by getLT() and shifted by a mean vector.  The sample mean, the
        diagonal of the sample variance and the sample correlation matrix are
        then compared against the values implied by Sigma.
        """
        draws = np.random.randn(10000, 3)
        Sigma = boom.SpdMatrix(
            np.array([[1, .8, -.6], [.8, 2, -.8], [-.6, -.8, 4]]))
        factor = boom.Cholesky(Sigma).getLT()
        center = np.array([1, 2, -3])
        draws = draws @ factor.to_numpy() + center

        center = boom.Vector(center)
        draws = boom.Matrix(draws)
        sample_mean = mean(draws)
        self.assertLess((sample_mean - center).normsq(), .01)

        sample_variance = var(draws)
        self.assertLess(
            (sample_variance.diag() - Sigma.diag()).normsq(), .05)

        sample_cor = cor(draws)
        # Rescale Sigma entry by entry to get the true correlation matrix.
        true_cor = Sigma.to_numpy()
        for row in range(3):
            for col in range(3):
                scale = np.sqrt(Sigma[row, row] * Sigma[col, col])
                true_cor[row, col] = Sigma[row, col] / scale
        true_cor = boom.SpdMatrix(true_cor)

        self.assertLess((true_cor - sample_cor).max_abs(), .01)
Esempio n. 12
0
    def test_bspline(self):
        """
        Check the Bspline attributes and its basis expansion at 2.1.

        The basis vector and each row of the basis matrix evaluated at 2.1
        are compared with hard-coded expected values.
        """
        spline = boom.Bspline(boom.Vector(self.knots))
        evaluation_point = 2.1
        basis = spline.basis(evaluation_point)
        self.assertTrue(isinstance(basis, boom.Vector))

        # Degree, order and knots.
        self.assertEqual(spline.degree, 3)
        self.assertEqual(spline.order, 4)
        self.assertTrue(isinstance(spline.knots(), boom.Vector))
        self.assertEqual(spline.knots().size, 3)
        expected_knots = np.array([1.0, 2.0, 3.0])
        self.assertTrue(
            np.allclose(spline.knots().to_numpy(), expected_knots))

        # Basis dimension.
        self.assertEqual(spline.dim, 3 - 1 + 3)
        self.assertEqual(spline.dim, basis.size)

        expected_basis = np.array([
            0,
            0.18224999999999994,
            0.48599999999999999,
            0.3307500000000001,
            0.0010000000000000026,
        ])
        self.assertTrue(np.allclose(basis.to_numpy(), expected_basis))

        # Both rows of the basis matrix use the same evaluation point.
        basis_rows = spline.basis_matrix(np.array([2.1, 2.1])).to_numpy()
        self.assertTrue(np.allclose(basis_rows[0, :], expected_basis))
        self.assertTrue(np.allclose(basis_rows[1, :], expected_basis))
Esempio n. 13
0
    def __setstate__(self, state):
        """
        Retrieve a MixedDataImputer from a pickle.

        Args:
          state: A dict of pickled state.  The keys read here are "atoms",
            "numeric_colnames", "categorical_colnames", "atom_prior",
            "dataset_encoder", "coefficient_draws",
            "residual_variance_draws", "atom_probs",
            "empirical_distributions" and "nclusters".

        Effects:
          The listed entries are copied onto the object.  If both the dataset
          encoder and state["nclusters"] are present, a boom model object is
          also created and given the stored empirical distributions.
        """
        self._atoms = state["atoms"]
        self._numeric_colnames = state["numeric_colnames"]
        self._categorical_colnames = state["categorical_colnames"]
        self._atom_prior = state["atom_prior"]
        self._dataset_encoder = state["dataset_encoder"]
        self.coefficient_draws = state["coefficient_draws"]
        self.residual_variance_draws = state["residual_variance_draws"]
        self.atom_probs = state["atom_probs"]
        self.empirical_distributions = state["empirical_distributions"]

        # The boom model is rebuilt only when an encoder and a cluster count
        # were both pickled.
        if (
                self._dataset_encoder is not None
                and state["nclusters"] is not None
        ):
            # NOTE(review): this reads self._encoder, but this method only
            # assigns self._dataset_encoder.  Confirm that _encoder exists on
            # the class (e.g. as a property).
            xdim = self._encoder.dim
            # One boom.Vector of atoms per numeric column.
            atom_vector = []
            for vname in self._numeric_colnames:
                atoms = np.array(self._atoms[vname])
                atom_vector.append(boom.Vector(
                    atoms.flatten().astype("float")))
            # NOTE(review): the constructor below is called with no
            # arguments, and the line after it is a bare tuple expression
            # that has no effect.  Those three values were presumably meant
            # to be constructor arguments -- confirm against the
            # boom.MixedDataImputer signature before relying on unpickling.
            self._model = boom.MixedDataImputer(
                # TODO: this is hosed.
            )
            state["nclusters"], atom_vector, xdim
            self._model.set_empirical_distributions(
                state["empirical_distributions"])
Esempio n. 14
0
    def create_model(self, prior: R.SdPrior, data: pd.Series):
        """
        Create the boom state space model and attach its samplers.

        Args:
          prior: An R.SdPrior object describing the prior distribution on the
            residual variance parameter.  If None, a default is built from
            the standard deviation of 'data', with an upper limit of 1.2
            times that standard deviation.
          data: The time series of observations as a Pandas Series.  Entries
            flagged by data.isna() are marked as unobserved.

        Returns:
          The boom.StateSpaceModel object, which is also stored in
          self._model.
        """
        observed = ~data.isna()
        model = boom.StateSpaceModel(boom.Vector(data.values), observed)
        self._model = model

        if prior is None:
            sdy = np.std(data)
            prior = R.SdPrior(sigma_guess=sdy, upper_limit=sdy * 1.2)

        # Sampler for the observation model.
        observation_model = model.observation_model
        residual_sd_prior = boom.ChisqModel(
            prior.sample_size, prior.sigma_guess)
        observation_model_sampler = boom.ZeroMeanGaussianConjSampler(
            observation_model, residual_sd_prior)
        observation_model_sampler.set_sigma_upper_limit(prior.upper_limit)
        observation_model.set_method(observation_model_sampler)

        # Sampler for the state space model as a whole.
        model.set_method(
            boom.StateSpacePosteriorSampler(model, boom.GlobalRng.rng))

        self._original_series = data
        return model
Esempio n. 15
0
    def simulate_data_from_model(time_dimension: int, typical_sample_size: int,
                                 xdim: int, residual_sd: float,
                                 unscaled_innovation_sd: np.ndarray,
                                 p00: np.ndarray, p11: np.ndarray):
        """
        Simulate data from a dynamic regression model.

        Args:
          time_dimension: The number of time points to simulate.
          typical_sample_size: The Poisson mean of the number of observations
            at each time point.
          xdim: The number of predictor variables.  The first predictor is
            set to the constant 1.
          residual_sd: The residual standard deviation.
          unscaled_innovation_sd: One entry per predictor.  Each entry is
            multiplied by residual_sd to give the standard deviation of that
            coefficient's innovations.
          p00: One entry per predictor, used as the [0, 0] element of that
            predictor's 2x2 transition matrix.
          p11: One entry per predictor, used as the [1, 1] element of that
            predictor's 2x2 transition matrix.

        Returns:
          A tuple (data, coefficients, inclusion).  'data' is a list with one
          boom.RegressionDataTimePoint per time point.  'coefficients' and
          'inclusion' are arrays with xdim rows and time_dimension columns.
        """
        from BayesBoom.R import rmarkov
        inclusion = np.full((xdim, time_dimension), -1)
        p00 = p00.ravel()
        p11 = p11.ravel()
        for j in range(xdim):
            P = np.array([[p00[j], 1 - p00[j]], [1 - p11[j], p11[j]]])
            inclusion[j, :] = rmarkov(time_dimension, P)

        coefficients = np.zeros((xdim, time_dimension))
        for j in range(xdim):
            sd = unscaled_innovation_sd[j] * residual_sd
            for t in range(time_dimension):
                prev = 0 if t == 0 else coefficients[j, t - 1]
                # Draw a scalar.  randn(1) returns a length-1 array, and
                # assigning such an array to a single element is deprecated
                # in NumPy.
                innovation = np.random.randn() * sd
                coefficients[j, t] = inclusion[j, t] * (prev + innovation)

        data = []
        for t in range(time_dimension):
            sample_size = np.random.poisson(typical_sample_size, 1)[0]
            X = np.random.randn(sample_size, xdim)
            X[:, 0] = 1.0
            yhat = X @ coefficients[:, t]
            y = yhat + residual_sd * np.random.randn(sample_size)
            data.append(
                boom.RegressionDataTimePoint(boom.Matrix(X), boom.Vector(y)))
        return data, coefficients, inclusion
Esempio n. 16
0
def to_data_table(data: pd.DataFrame):
    """
    Create a BOOM DataTable object from a pandas DataFrame.  The categories of
    any categorical variables will be handled as strings.

    Args:
      data: The data frame to convert.  Numeric and boolean columns are added
        as numeric variables (converted to float).  Categorical columns are
        added from their integer codes and categories.  Object columns are
        converted to strings and added as categorical variables.

    Returns:
      A boom.DataTable with one variable per column of 'data', in column
      order.

    Raises:
      Exception: If a column has a dtype other than those listed above.
    """
    ans = boom.DataTable()
    for i, vname in enumerate(data.columns):
        # Select by position so that integer or duplicated column labels
        # cannot be mistaken for positions.
        column = data.iloc[:, i]
        dt = column.dtype
        if is_numeric_dtype(dt) or is_bool_dtype(dt):
            ans.add_numeric(boom.Vector(column.values.astype("float")), vname)
        elif isinstance(dt, pd.CategoricalDtype):
            # isinstance is the replacement pandas documents for the
            # deprecated is_categorical_dtype.
            ans.add_categorical(column.cat.codes, column.cat.categories, vname)
        elif is_object_dtype(dt):
            labels = column.astype("str")
            ans.add_categorical_from_labels(labels.values, vname)
        else:
            raise Exception(
                "Only numeric or categorical data are supported.  "
                f"Column {i} ({vname}) has dtype {dt}."
            )
    return ans
Esempio n. 17
0
    def create_model(self, prior, data):
        """Create the boom model object, and store related model artifacts.

        The regression component is described by the model formula stored in
        self._formula.

        Args:
          prior: A spikeslab.RegressionSpikeSlabPrior, used to set the
            posterior sampler on the observation model.
          data: The data against which self._formula is evaluated by patsy.
            Response entries that are NaN are marked as unobserved.

        Returns:
          The boom.StateSpaceRegressionModel, which is also stored in
          self._model.

        Raises:
          Exception: If 'prior' is not a spikeslab.RegressionSpikeSlabPrior.

        Effects: self._model is created, and the response is stored in
          self._original_series.
        """
        if not isinstance(prior, spikeslab.RegressionSpikeSlabPrior):
            raise Exception("Unexpected type for prior.")

        response, predictors = patsy.dmatrices(self._formula, data)
        observed = ~np.isnan(response)
        model = boom.StateSpaceRegressionModel(
            boom.Vector(response), boom.Matrix(predictors), observed)
        self._model = model

        spikeslab.set_posterior_sampler(model.observation_model, prior)
        self._original_series = response
        return model
Esempio n. 18
0
 def _default_initial_state_prior(self, sdy):
     """
     Build the default prior for the initial state vector.

     Args:
       sdy: A number, converted with float() and placed on the diagonal of
         the prior's variance matrix.

     Returns:
       A boom.MvnModel of dimension nseasons - 1 with a zero mean vector and
       a diagonal variance matrix.
     """
     dim = self.nseasons - 1
     mean = np.zeros(dim).astype(float)
     # NOTE(review): sdy itself, not its square, is used as the variance --
     # confirm this is intended.
     variance = np.diag(np.full(dim, float(sdy)))
     return boom.MvnModel(boom.Vector(mean), boom.SpdMatrix(variance))
Esempio n. 19
0
def to_boom_vector(v):
    """
    Convert the vector-like object 'v' to a boom.Vector.

    This is friendlier than calling the boom.Vector constructor directly,
    which only accepts floating point numpy arrays.

    Args:
      v: A numeric scalar, a numpy array of any numeric dtype, a pandas
        Series of any numeric dtype, or any similar object that either has a
        'values' attribute like a pd.Series or is convertible to a np.array.

    Returns:
      A boom.Vector holding the entries of 'v' as floats.  A scalar becomes
      a vector of length 1.
    """
    if hasattr(v, "values"):
        # pd.Series and similar objects expose their data through .values.
        source = v.values
    elif isinstance(v, Number):
        source = [v]
    else:
        source = v
    return boom.Vector(np.array(source, dtype="float"))
Esempio n. 20
0
 def stream_data_for_restricted_model(self, x: np.ndarray, y: np.ndarray):
     """
     Stream the data to the sampler for the restricted model.

     After the initial screen has been run the data needs to be streamed a
     second time.  The arguments are identical to those of
     'stream_data_for_initial_screen'.

     Args:
       x: Matrix of predictor variables, one row per observation.
       y: The response vector.
     """
     for row, response in enumerate(y):
         predictors = boom.Vector(x[row, :])
         observation = boom.RegressionData(response, predictors)
         self._sampler.stream_data_for_restricted_model(observation)
Esempio n. 21
0
 def _create_posterior_sampler(self, residual_precision_prior,
                               coefficient_innovation_priors,
                               prior_inclusion_probabilities,
                               expected_inclusion_duration,
                               transition_probability_prior_sample_size):
     """
     Build a DynamicRegressionDirectGibbsSampler and assign it to the model.

     Args:
       residual_precision_prior: An object with 'sigma_guess' and
         'sample_size' attributes.
       coefficient_innovation_priors: An iterable of objects with
         'sigma_guess' and 'sample_size' attributes.  Their values are
         collected into two vectors.
       prior_inclusion_probabilities: Converted to a boom.Vector.
       expected_inclusion_duration: Converted to a boom.Vector.
       transition_probability_prior_sample_size: Converted to a boom.Vector.
     """
     innovation_sigma_guesses = np.array(
         [prior.sigma_guess for prior in coefficient_innovation_priors])
     innovation_sample_sizes = np.array(
         [prior.sample_size for prior in coefficient_innovation_priors])

     sampler = boom.DynamicRegressionDirectGibbsSampler(
         self._model,
         residual_precision_prior.sigma_guess,
         residual_precision_prior.sample_size,
         boom.Vector(innovation_sigma_guesses),
         boom.Vector(innovation_sample_sizes),
         boom.Vector(prior_inclusion_probabilities),
         boom.Vector(expected_inclusion_duration),
         boom.Vector(transition_probability_prior_sample_size))
     self._model.set_method(sampler)
 def test_data(self):
     """After mle(), the model's mean and sd should be near the truth."""
     true_mean = -16
     true_sd = 7
     model = boom.GaussianModel(0, 1)
     sample = np.random.randn(10000) * true_sd + true_mean
     model.set_data(boom.Vector(sample))
     model.mle()

     # Allow the mean to miss by four standard errors.
     tolerance = 4 * true_sd / np.sqrt(len(sample))
     self.assertLess(np.abs(model.mean - true_mean), tolerance)
     self.assertLess(np.abs(model.sd - true_sd), .1)
Esempio n. 23
0
    def _build_model(self):
        """
        Create the boom.SeasonalStateModel and configure it.

        The initial state mean and variance are taken from
        self._initial_state_prior, and the posterior sampler is assigned from
        self._innovation_sd_prior.
        """
        state_model = boom.SeasonalStateModel(
            nseasons=self._nseasons, season_duration=self._season_duration)
        self._state_model = state_model

        initial_mean = boom.Vector(self._initial_state_prior.mu)
        state_model.set_initial_state_mean(initial_mean)
        initial_variance = boom.SpdMatrix(self._initial_state_prior.Sigma)
        state_model.set_initial_state_variance(initial_variance)

        # The prior needs to be saved so the object can be serialized.
        self._assign_posterior_sampler(self._innovation_sd_prior)
Esempio n. 24
0
    def setUp(self):
        """
        Simulate the shared test fixtures with a fixed random seed.

        Sets self.Sigma (a boom.SpdMatrix), self.mu (a boom.Vector) and
        self.data (100 x 3 standard normal draws multiplied by the transpose
        of the Cholesky factor returned by getL(True), then shifted by the
        mean).
        """
        np.random.seed(8675309)
        standard_normals = np.random.randn(100, 3)
        self.Sigma = boom.SpdMatrix(
            np.array([[1, .8, -.3], [.8, 2, -.6], [-.3, -.6, 4]]))

        factor = boom.Cholesky(self.Sigma).getL(True).to_numpy()
        mean = np.array([1, 2, -3.0])
        self.data = standard_normals @ factor.T + mean
        self.mu = boom.Vector(mean)
Esempio n. 25
0
    def model_smoke_test(self):
        """
        Run ten posterior draws of a DynamicRegressionModel on null data.

        The data come from self.simulate_null_data.  Nothing is asserted; the
        check is only that the model and sampler run.
        """
        xdim = 3
        typical_sample_size = 30
        time_dimension = 12

        model = boom.DynamicRegressionModel(xdim)
        time_points = self.simulate_null_data(
            time_dimension, typical_sample_size, xdim)
        for time_point in time_points:
            model.add_data(time_point)

        def constant_vector(value):
            # A boom.Vector of length xdim filled with 'value'.
            return boom.Vector(np.full(xdim, value))

        sampler = boom.DynamicRegressionDirectGibbsSampler(
            model,
            1.0,
            1.0,
            constant_vector(1.0),
            constant_vector(1.0),
            constant_vector(.25),
            constant_vector(2.0),
            constant_vector(1.0),
            boom.GlobalRng.rng)
        model.set_method(sampler)

        for _ in range(10):
            model.sample_posterior()
 def test_mcmc(self):
     """The first and last of 1000 posterior draws of sigma should differ."""
     true_sigma = 2.3
     sample = np.random.randn(100) * true_sigma
     self.model.set_data(boom.Vector(sample))

     prior = boom.ChisqModel(1.0, 1.0)
     self.model.set_method(
         boom.ZeroMeanGaussianConjSampler(self.model, prior))

     niter = 1000
     draws = np.zeros(niter)
     for iteration in range(niter):
         self.model.sample_posterior()
         draws[iteration] = self.model.sigma

     self.assertNotAlmostEqual(draws[0], draws[-1])
Esempio n. 27
0
    def _restore_parameters(self, i):
        """
        Copy the stored draws for MCMC iteration 'i' back into the model.

        Args:
          i: Index into the first dimension of self.coefficient_draws,
            self.residual_variance_draws, and the per-variable arrays in
            self.atom_probs and self.atom_error_probs.
        """
        coefficients = self.coefficient_draws[i, :, :]
        self._model.set_coefficients(boom.Matrix(coefficients))

        residual_variance = self.residual_variance_draws[i, :, :]
        self._model.set_residual_variance(boom.Matrix(residual_variance))

        for cluster in range(self.nclusters):
            for position, name in enumerate(self._numeric_colnames):
                atom_probs = self.atom_probs[name][i, cluster, :]
                self._model.set_atom_probs(
                    cluster, position, boom.Vector(atom_probs))

                atom_error_probs = self.atom_error_probs[name][
                    i, cluster, :, :]
                self._model.set_atom_error_probs(
                    cluster, position, boom.Matrix(atom_error_probs))
Esempio n. 28
0
    def test_data(self):
        """Adding observations to a time point should update its counts."""
        time_point = boom.RegressionDataTimePoint()

        first = boom.RegressionData(1.0, np.random.randn(3))
        time_point.add_data(first)
        self.assertEqual(3, time_point.xdim)
        self.assertEqual(1, time_point.sample_size)

        # Ten more observations bring the total to eleven.
        responses = np.random.randn(10)
        predictors = np.random.randn(10, 3)
        for row in range(10):
            time_point.add_data(
                boom.RegressionData(
                    responses[row], boom.Vector(predictors[row, :])))

        self.assertEqual(11, time_point.sample_size)
Esempio n. 29
0
    def _set_prior(self):
        """
        Set the prior distribution on the BOOM model object.

        Priors already stored on the object (in self._atom_prior,
        self._atom_error_prior, self._level_prior and
        self._level_observation_prior) are installed as they are.  Variables
        with no stored prior receive a default prior, which is stored on the
        object before being installed.
        """
        self._set_default_regression_prior()
        self._set_default_prior_for_mixing_weights()

        # Numeric variables: atom prior and atom error prior.
        for position, name in enumerate(self._numeric_colnames):
            if name not in self._atom_prior:
                self._atom_prior[name] = self._default_atom_prior(
                    self._atoms[name])
            self._model.set_atom_prior(
                boom.Vector(self._atom_prior[name]), position)

            if name not in self._atom_error_prior:
                self._atom_error_prior[name] = self._default_atom_error_prior(
                    len(self._atoms[name]))
            self._model.set_atom_error_prior(
                boom.Matrix(self._atom_error_prior[name]), position)

        # Categorical variables: level prior and level observation prior.
        for position, name in enumerate(self._categorical_colnames):
            levels = self._levels[name]
            if name not in self._level_prior:
                self._level_prior[name] = self._default_level_prior(levels)
            level_prior = np.array(self._level_prior[name])
            self._model.set_level_prior(boom.Vector(level_prior), position)

            if name not in self._level_observation_prior:
                self._level_observation_prior[name] = (
                    self._default_level_observation_prior(levels))
            self._model.set_level_observation_prior(
                boom.Matrix(self._level_observation_prior[name]), position)
    def test_mcmc():
        """
        Run 100 posterior draws of a GaussianModel with a conjugate sampler.

        Nothing is asserted; the check is only that sampling runs.

        NOTE(review): this takes no 'self' although it sits at method
        indentation -- confirm how it is meant to be called.
        """
        mu = -16
        sigma = 7
        model = boom.GaussianModel()
        sample = np.random.randn(10000) * sigma + mu
        model.set_data(boom.Vector(sample))

        mean_prior = boom.GaussianModelGivenSigma(
            model.sigsq_parameter, mu, 1.0)
        sigsq_prior = boom.ChisqModel(1.0, sigma)
        model.set_method(
            boom.GaussianConjugateSampler(model, mean_prior, sigsq_prior))

        for _ in range(100):
            model.sample_posterior()