def __setstate__(self, state) -> None:
    """
    Retrieve a MixedDataImputer from a pickle.

    Args:
      state: A mapping that is read with the keys "atoms",
        "numeric_colnames", "categorical_colnames", "atom_prior",
        "dataset_encoder", "coefficient_draws", "residual_variance_draws",
        "atom_probs", "empirical_distributions", and "nclusters".
        Presumably this is what the matching __getstate__ produces --
        confirm against it.

    Effects:
      Every key except "nclusters" is copied onto an attribute of self.
      When both the dataset encoder and state["nclusters"] are not None,
      a boom.MixedDataImputer is also created, stored in self._model, and
      handed the stored empirical distributions.
    """
    # Plain Python state: copied straight from the pickled mapping.
    self._atoms = state["atoms"]
    self._numeric_colnames = state["numeric_colnames"]
    self._categorical_colnames = state["categorical_colnames"]
    self._atom_prior = state["atom_prior"]
    self._dataset_encoder = state["dataset_encoder"]
    self.coefficient_draws = state["coefficient_draws"]
    self.residual_variance_draws = state["residual_variance_draws"]
    self.atom_probs = state["atom_probs"]
    self.empirical_distributions = state["empirical_distributions"]

    # The BOOM model object is only rebuilt when there is enough
    # information to do so.
    if (
        self._dataset_encoder is not None
        and state["nclusters"] is not None
    ):
        # NOTE(review): the guard above tests self._dataset_encoder, but
        # this line reads self._encoder, which this method never assigns.
        # Confirm which attribute is intended.
        xdim = self._encoder.dim

        # One boom.Vector of float atoms per numeric column, in column
        # order.
        atom_vector = []
        for vname in self._numeric_colnames:
            atoms = np.array(self._atoms[vname])
            atom_vector.append(boom.Vector(
                atoms.flatten().astype("float")))

        # NOTE(review): as written, the constructor is called with no
        # arguments, and the line after it is a bare tuple expression
        # whose value is discarded.  Presumably nclusters, atom_vector,
        # and xdim were meant to be constructor arguments -- confirm
        # against the boom.MixedDataImputer signature before relying on
        # unpickled models.
        self._model = boom.MixedDataImputer(
            # TODO: this is hosed.
        )
        state["nclusters"], atom_vector, xdim

        self._model.set_empirical_distributions(
            state["empirical_distributions"])
def train_model(self, data: pd.DataFrame, nclusters: int, niter: int,
                ping: int = 0, checkpoint_filename: str = ""):
    """
    Fit the imputation model by running 'niter' MCMC iterations on 'data'.

    Args:
      data: The training data.
      nclusters: Passed to boom.MixedDataImputer as its first argument.
      niter: The number of MCMC iterations to run.
      ping: If positive, a time-stamped progress line is printed on every
        iteration whose index is a multiple of 'ping'.  Zero (the default)
        prints no progress lines.
      checkpoint_filename: If non-empty, self.save(checkpoint_filename) is
        called on iterations where 'i % niter == 0'.  Inside this loop that
        is only iteration 0.  NOTE(review): a periodic checkpoint was
        presumably intended -- confirm the desired cadence.

    Effects:
      self._dtypes is filled in from 'data' if it was None.  If self._atoms
      was None, the column-name lists are reset to empty and
      self._atoms / self._levels are taken from self.find_atoms(data).
      self._model is replaced by a newly built boom.MixedDataImputer, and
      self._start_time / self._end_time bracket the whole call.
    """
    self._start_time = time.time()

    if self._dtypes is None:
        self._dtypes = data.dtypes

    if self._atoms is None:
        # The name lists are reset before find_atoms runs; presumably
        # find_atoms fills them in -- verify against its definition.
        self._numeric_colnames = []
        self._categorical_colnames = []
        self._atoms, self._levels = self.find_atoms(data)

    # One boom.Vector of atoms per numeric column, in column order.
    atom_vectors = []
    for colname in self._numeric_colnames:
        atom_vectors.append(boom.Vector(np.array(self._atoms[colname])))

    boom_table = to_data_table(data)

    print("creating mdoel object")
    self._model = boom.MixedDataImputer(
        nclusters, boom_table, atom_vectors, boom.GlobalRng.rng)

    print("setting prior")
    self._set_prior()

    print("allocating space")
    self._allocate_space(niter)

    print("about to start mcmc")
    banner = "=-=-=-=-=-=-=-=-="
    report_progress = ping > 0
    want_checkpoint = checkpoint_filename != ""
    for iteration in range(niter):
        if report_progress and iteration % ping == 0:
            print(f"{banner} {time.asctime()} Iteration {iteration} {banner}")

        self._model.sample_posterior()
        self._record_draws(iteration)

        if want_checkpoint and iteration % niter == 0:
            self.save(checkpoint_filename)

    self._end_time = time.time()
def train_model(self, data: pd.DataFrame, nclusters: int, niter: int = 1000,
                checkpoint_filename="", nthreads=1):
    """
    Train the imputation model by taking 'niter' draws from the posterior
    distribution given a set of training data.

    Args:
      data: The data on which to train the model.  Any non-numeric
        variables are treated as categorical.  Missing values are expected
        to be coded as np.NaN.
      nclusters: The number of clusters to use for the joint distribution
        of the categorical data.
      niter: The number of MCMC iterations to use during training.
      checkpoint_filename: The name of a file to use when recording the
        model's state.  An empty string (the default) disables
        checkpointing.
      nthreads: The number of threads to use during training.  This is
        experimental and currently has no effect: the worker-pool setup
        is disabled below.

    Raises:
      Exception: If the number of atom vectors differs from the number of
        numeric columns, or the number of level sets differs from the
        number of categorical columns.

    Effects:
      Column types, atoms, and levels are discovered from 'data' when they
      have not already been set.  self._model is replaced by a newly built
      boom.MixedDataImputer.  Storage for the draws is set up by
      self._allocate_space(niter) and filled by self._record_draws(i).
      self._start_time and self._end_time bracket the whole call.
    """
    self._start_time = time.time()

    if self._numeric_colnames is None or self._categorical_colnames is None:
        self._discover_column_types(data)

    # Find the set of atoms for each numeric variable, unless they were
    # supplied ahead of time.  find_atoms is expected to populate
    # self._atoms (its return value is not used here).
    if self._atoms is None:
        self.find_atoms(data.loc[:, self._numeric_colnames])
    if len(self._atoms) != len(self._numeric_colnames):
        raise Exception(
            "One atom vector is needed for each numeric variable.")

    # Find the set of levels for each categorical variable, unless they
    # were supplied ahead of time.  The test must be 'is None', mirroring
    # the atoms branch: discovering levels only when they already exist
    # would leave self._levels as None on a fresh object and make the
    # len() check below fail with a TypeError.
    if self._levels is None:
        self.find_levels(data.loc[:, self._categorical_colnames])
    if len(self._levels) != len(self._categorical_colnames):
        raise Exception(
            "Each categorical column must have its own list of levels.")

    # Convert the atoms to BOOM::Vector objects, one per numeric column.
    atom_vector = [
        boom.Vector(np.array(self._atoms[vname]).flatten().astype("float"))
        for vname in self._numeric_colnames
    ]

    # The data table as a BOOM object.
    data_table = boom.to_data_table(data)
    self._model = boom.MixedDataImputer(
        nclusters, data_table, atom_vector, boom.GlobalRng.rng)

    # Multi-threaded training is disabled for now; enabling it would mean
    # calling self._model.setup_worker_pool(nthreads) when nthreads > 1.

    print("setting prior")
    self._set_prior()

    print("Allocating space")
    self._allocate_space(niter)

    sep = "=-=-=-=-=-=-=-=-="
    for i in range(niter):
        if i % 100 == 0:
            print(f"{sep} {time.asctime()} Iteration {i} {sep}")

        self._model.sample_posterior()
        self._record_draws(i)

        # NOTE(review): inside range(niter), 'i % niter == 0' holds only
        # for i == 0, so the checkpoint is written once, after the first
        # draw.  A periodic checkpoint was presumably intended -- confirm
        # the desired cadence before changing it.
        if i % niter == 0 and checkpoint_filename != "":
            self.save(checkpoint_filename)

    self._end_time = time.time()