Example #1
0
    def __setstate__(self, state):
        """
        Retrieve a MixedDataImputer from a pickle.

        Args:
          state: A mapping holding the pickled fields.  The keys read here
            are "atoms", "numeric_colnames", "categorical_colnames",
            "atom_prior", "dataset_encoder", "coefficient_draws",
            "residual_variance_draws", "atom_probs",
            "empirical_distributions", and "nclusters".

        Effects:
          The attributes corresponding to the keys above (other than
          "nclusters") are restored on self.  If both the dataset encoder
          and the stored cluster count are present, the method also
          attempts to rebuild self._model; see the review notes in the
          body, because that part is marked as unfinished.
        """
        # Restore the plain Python state first; none of this touches boom.
        self._atoms = state["atoms"]
        self._numeric_colnames = state["numeric_colnames"]
        self._categorical_colnames = state["categorical_colnames"]
        self._atom_prior = state["atom_prior"]
        self._dataset_encoder = state["dataset_encoder"]
        self.coefficient_draws = state["coefficient_draws"]
        self.residual_variance_draws = state["residual_variance_draws"]
        self.atom_probs = state["atom_probs"]
        self.empirical_distributions = state["empirical_distributions"]

        # Only try to rebuild the boom model when the pickle recorded both
        # an encoder and a cluster count.
        if (
                self._dataset_encoder is not None
                and state["nclusters"] is not None
        ):
            # NOTE(review): this reads self._encoder, but the attribute
            # restored above is self._dataset_encoder.  Confirm that
            # _encoder is defined elsewhere on the class.
            xdim = self._encoder.dim
            # One boom.Vector of float atoms per numeric column.
            atom_vector = []
            for vname in self._numeric_colnames:
                atoms = np.array(self._atoms[vname])
                atom_vector.append(boom.Vector(
                    atoms.flatten().astype("float")))
            self._model = boom.MixedDataImputer(
                # TODO: this is hosed.  The constructor is called with no
                # arguments here.
            )
            # NOTE(review): the next statement builds a tuple and discards
            # it.  These look like the intended constructor arguments, but
            # the right signature is not visible from here -- confirm
            # before wiring them in.
            state["nclusters"], atom_vector, xdim
            self._model.set_empirical_distributions(
                state["empirical_distributions"])
Example #2
0
    def train_model(self,
                    data: pd.DataFrame,
                    nclusters: int,
                    niter: int,
                    ping: int = 0,
                    checkpoint_filename: str = ""):
        """
        Fit the imputation model by running 'niter' MCMC iterations.

        Args:
          data: The training data.
          nclusters: Passed to boom.MixedDataImputer as its first argument.
          niter: The number of MCMC iterations to run.
          ping: If positive, a time-stamped progress line is printed every
            'ping' iterations.  Otherwise no progress lines are printed.
          checkpoint_filename: If non-empty, the file name handed to
            self.save when a checkpoint is written.

        Effects:
          self._model is replaced by a freshly built boom.MixedDataImputer,
          and self._start_time / self._end_time bracket the run.  Column
          dtypes and atoms are taken from 'data' only if they have not been
          set already.
        """
        self._start_time = time.time()

        # Remember the column dtypes the first time data is seen.
        if self._dtypes is None:
            self._dtypes = data.dtypes

        # Discover atoms (and levels) only if none were supplied earlier.
        if self._atoms is None:
            self._numeric_colnames = []
            self._categorical_colnames = []
            found_atoms, found_levels = self.find_atoms(data)
            self._atoms = found_atoms
            self._levels = found_levels

        # One boom.Vector of atoms per numeric column.
        atom_vectors = []
        for colname in self._numeric_colnames:
            atom_vectors.append(boom.Vector(np.array(self._atoms[colname])))
        data_table = to_data_table(data)

        print("creating mdoel object")
        self._model = boom.MixedDataImputer(
            nclusters, data_table, atom_vectors, boom.GlobalRng.rng)

        print("setting prior")
        self._set_prior()

        print("allocating space")
        self._allocate_space(niter)

        print("about to start mcmc")
        banner = "=-=-=-=-=-=-=-=-="
        report_progress = ping > 0
        save_checkpoints = checkpoint_filename != ""
        for iteration in range(niter):
            if report_progress and iteration % ping == 0:
                print(f"{banner} {time.asctime()} Iteration {iteration} {banner}")
            self._model.sample_posterior()
            self._record_draws(iteration)
            # For 0 <= iteration < niter the remainder is zero only at
            # iteration 0, so this saves once, right after the first draw.
            if save_checkpoints and iteration % niter == 0:
                self.save(checkpoint_filename)

        self._end_time = time.time()
Example #3
0
    def train_model(self,
                    data: pd.DataFrame,
                    nclusters: int,
                    niter: int = 1000,
                    checkpoint_filename: str = "",
                    nthreads: int = 1):
        """
        Train the imputation model by taking 'niter' draws from the posterior
        distribution given a set of training data.

        Args:
          data: The data on which to train the model.  Any non-numeric
            variables are treated as categorical.  Missing values are expected
            to be coded as np.nan.
          nclusters: The number of clusters to use for the joint
            distribution of the categorical data.
          niter:  The number of MCMC iterations to use during training.
          checkpoint_filename: The name of a file to use when recording the
            model's state.  If empty, no checkpoint is written.
          nthreads: The number of threads to use during training.  This is
            experimental, and is currently ignored because the worker pool
            setup is disabled.

        Effects:
          self._model: Replaced by a newly constructed
            boom.MixedDataImputer.
          self._coefficient_draws: Created and populated by MCMC draws of the
            linear regression coefficients.
          self._start_time, self._end_time: Set to the wall clock time at the
            start and end of training.

        Raises:
          Exception: If the number of atom vectors does not match the number
            of numeric columns, or the number of level lists does not match
            the number of categorical columns.
        """
        self._start_time = time.time()
        if self._numeric_colnames is None or self._categorical_colnames is None:
            self._discover_column_types(data)

        # Find the set of atoms for each numeric variable, unless they were
        # already supplied.
        if self._atoms is None:
            self.find_atoms(data.loc[:, self._numeric_colnames])
        if len(self._atoms) != len(self._numeric_colnames):
            raise Exception(
                "One atom vector is needed for each numeric variable.")

        # Find the set of levels for each categorical variable, unless they
        # were already supplied.  The test must be 'is None' (matching the
        # atoms case above): with 'is not None' the levels were never
        # discovered when missing, and the len() call below failed on None.
        if self._levels is None:
            self.find_levels(data.loc[:, self._categorical_colnames])
        if len(self._levels) != len(self._categorical_colnames):
            raise Exception(
                "Each categorical column must have its own list of levels.")

        # Convert the atoms to BOOM::Vector objects, one per numeric column.
        atom_vector = [
            boom.Vector(
                np.array(self._atoms[vname]).flatten().astype("float"))
            for vname in self._numeric_colnames
        ]

        # The data table as a BOOM object.
        data_table = boom.to_data_table(data)

        self._model = boom.MixedDataImputer(
            nclusters, data_table, atom_vector, boom.GlobalRng.rng)

        # TODO: multithreading is disabled, so 'nthreads' has no effect.
        # if nthreads > 1:
        #     self._model.setup_worker_pool(nthreads)

        print("setting prior")
        self._set_prior()

        print("Allocating space")
        self._allocate_space(niter)

        sep = "=-=-=-=-=-=-=-=-="
        for i in range(niter):
            # Report progress every 100 iterations.
            if i % 100 == 0:
                print(f"{sep} {time.asctime()} Iteration {i} {sep}")
            self._model.sample_posterior()
            self._record_draws(i)
            # NOTE(review): for 0 <= i < niter this condition holds only at
            # i == 0, so the checkpoint is written once, after the first
            # draw.  The intended checkpoint interval is not shown here --
            # confirm before changing it.
            if i % niter == 0 and checkpoint_filename != "":
                self.save(checkpoint_filename)
        self._end_time = time.time()