Python MixedDataImputer Examples

Programming Language: Python

Namespace/Package Name: BayesBoom.boom

Method/Function: MixedDataImputer

Examples at hotexamples.com: 3

Python MixedDataImputer - 3 examples found. These are the top-rated real-world Python examples of BayesBoom.boom.MixedDataImputer, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

    def __setstate__(self, state):
        """
        Retrieve a MixedDataImputer from a pickle.

        Args:
          state: A dict of pickled fields.  The keys read here are "atoms",
            "numeric_colnames", "categorical_colnames", "atom_prior",
            "dataset_encoder", "coefficient_draws",
            "residual_variance_draws", "atom_probs",
            "empirical_distributions", and (only when a dataset encoder is
            present) "nclusters".

        Effects:
          The stored fields are copied back onto the object.  If both a
          dataset encoder and a cluster count were stored, self._model is
          rebuilt and given the stored empirical distributions.
        """
        self._atoms = state["atoms"]
        self._numeric_colnames = state["numeric_colnames"]
        self._categorical_colnames = state["categorical_colnames"]
        self._atom_prior = state["atom_prior"]
        self._dataset_encoder = state["dataset_encoder"]
        self.coefficient_draws = state["coefficient_draws"]
        self.residual_variance_draws = state["residual_variance_draws"]
        self.atom_probs = state["atom_probs"]
        self.empirical_distributions = state["empirical_distributions"]

        # Without an encoder or a cluster count there is nothing to rebuild.
        # The short-circuit keeps "nclusters" unread when the encoder is None.
        if self._dataset_encoder is None or state["nclusters"] is None:
            return

        # The encoder restored above is the only encoder attribute this
        # method sets, so the predictor dimension is read from it.
        xdim = self._dataset_encoder.dim

        # One flat float vector of atoms per numeric column.
        atom_vector = [
            boom.Vector(
                np.array(self._atoms[vname]).flatten().astype("float"))
            for vname in self._numeric_colnames
        ]

        # TODO: confirm that boom.MixedDataImputer accepts
        # (nclusters, atoms, xdim).  This reconstruction path was flagged as
        # unfinished, and the arguments were previously never passed to the
        # constructor at all.
        self._model = boom.MixedDataImputer(
            state["nclusters"], atom_vector, xdim)
        self._model.set_empirical_distributions(
            state["empirical_distributions"])

Example #2

Show file

File: data_table.py Project: autumnjolitz/BOOM

    def train_model(self,
                    data: pd.DataFrame,
                    nclusters: int,
                    niter: int,
                    ping: int = 0,
                    checkpoint_filename: str = ""):
        """
        Build the underlying boom model from 'data' and run 'niter' MCMC
        iterations, recording the draws from each one.

        Args:
          data: The training data.  It is handed to self.find_atoms (when no
            atoms are stored yet) and converted with to_data_table.
          nclusters: The cluster count passed to boom.MixedDataImputer.
          niter: The number of MCMC iterations to run.
          ping: If positive, a time-stamped progress line is printed every
            'ping' iterations.  No progress lines are printed otherwise.
          checkpoint_filename: If non-empty, self.save is called with this
            name when the checkpoint condition in the loop is met.

        Effects:
          Sets self._start_time and self._end_time, fills in self._dtypes,
          self._atoms and self._levels if they were not already set, and
          replaces self._model.
        """
        self._start_time = time.time()

        # Remember the column dtypes the first time data is seen.
        if self._dtypes is None:
            self._dtypes = data.dtypes

        # Discover atoms and levels only if none have been stored yet.
        if self._atoms is None:
            self._numeric_colnames = []
            self._categorical_colnames = []
            atoms, levels = self.find_atoms(data)
            self._atoms = atoms
            self._levels = levels

        # One boom vector of atoms per numeric column.
        numeric_atoms = []
        for colname in self._numeric_colnames:
            numeric_atoms.append(boom.Vector(np.array(self._atoms[colname])))

        boom_table = to_data_table(data)
        print("creating mdoel object")
        self._model = boom.MixedDataImputer(
            nclusters, boom_table, numeric_atoms, boom.GlobalRng.rng)

        print("setting prior")
        self._set_prior()

        print("allocating space")
        self._allocate_space(niter)

        print("about to start mcmc")
        banner = "=-=-=-=-=-=-=-=-="
        for iteration in range(niter):
            if ping > 0 and iteration % ping == 0:
                stamp = time.asctime()
                print(f"{banner} {stamp} Iteration {iteration} {banner}")

            self._model.sample_posterior()
            self._record_draws(iteration)

            # NOTE(review): inside this loop 'iteration % niter' is zero only
            # for iteration 0, so the checkpoint is written once, after the
            # first draw.  Confirm whether a periodic checkpoint was intended.
            if iteration % niter == 0 and checkpoint_filename != "":
                self.save(checkpoint_filename)

        self._end_time = time.time()

Example #3

Show file

    def train_model(self,
                    data: pd.DataFrame,
                    nclusters: int,
                    niter: int = 1000,
                    checkpoint_filename="",
                    nthreads=1):
        """
        Train the imputation model by taking 'niter' draws from the posterior
        distribution given a set of training data.

        Args:
          data: The data on which to train the model.  Any non-numeric
            variables are treated as categorical.  Missing values are expected
            to be coded as np.NaN.
          nclusters: The number of clusters to use for the joint
            distribution of the categorical data.
          niter:  The number of MCMC iterations to use during training.
          checkpoint_filename: The name of a file to use when recording the
            model's state.  If non-empty, self.save is called with this name
            when the checkpoint condition in the training loop is met.
          nthreads: The number of threads to use during training.  This is
            experimental and currently unused: the worker pool setup is
            commented out.

        Raises:
          Exception: If the number of atom vectors does not match the number
            of numeric columns, or the number of level lists does not match
            the number of categorical columns.

        Effects:
          self._coefficient_draws: Created and populated by MCMC draws of the
            linear regression coefficients.
          self._model: Replaced by a newly built boom.MixedDataImputer.
          self._start_time, self._end_time: Set to the wall clock time at the
            start and end of training.
        """
        self._start_time = time.time()
        if self._numeric_colnames is None or self._categorical_colnames is None:
            self._discover_column_types(data)

        # Find the set of atoms for each numeric variable.
        if self._atoms is None:
            self.find_atoms(data.loc[:, self._numeric_colnames])
        if len(self._atoms) != len(self._numeric_colnames):
            raise Exception(
                "One atom vector is needed for each numeric variable.")

        # Find the set of levels for each categorical variable.  As with the
        # atoms, levels are only discovered when none have been supplied;
        # otherwise the length check below would be applied to None.
        if self._levels is None:
            self.find_levels(data.loc[:, self._categorical_colnames])
        if len(self._levels) != len(self._categorical_colnames):
            raise Exception(
                "Each categorical column must have its own list of levels.")

        # Convert the atoms to BOOM::Vector objects, one flat float vector
        # per numeric column.
        atom_vector = [
            boom.Vector(
                np.array(self._atoms[vname]).flatten().astype("float"))
            for vname in self._numeric_colnames
        ]

        # The data table as a BOOM object.
        data_table = boom.to_data_table(data)

        self._model = boom.MixedDataImputer(
            nclusters, data_table, atom_vector, boom.GlobalRng.rng)

        # if nthreads > 1:
        #     self._model.setup_worker_pool(nthreads)

        print("setting prior")
        self._set_prior()

        print("Allocating space")
        self._allocate_space(niter)

        sep = "=-=-=-=-=-=-=-=-="
        for i in range(niter):
            # Progress message every 100 iterations.
            if i % 100 == 0:
                print(f"{sep} {time.asctime()} Iteration {i} {sep}")
            self._model.sample_posterior()
            self._record_draws(i)
            # NOTE(review): inside this loop 'i % niter' is zero only when
            # i == 0, so the checkpoint is written once, after the first
            # draw.  Confirm whether a periodic checkpoint was intended.
            if i % niter == 0 and checkpoint_filename != "":
                self.save(checkpoint_filename)
        self._end_time = time.time()