def fit(self, X, y):
    """Fit the Imputer by declaring a bayesian linear regression model.

    The model is constructed and stored in ``statistics_``; this method
    itself does not draw any posterior samples.

    Args:
        X (pd.Dataframe): predictors used to fit the imputer.
        y (pd.Series): response, which is eventually imputed.

    Returns:
        self. Instance of the class.
    """
    _not_num_series(self.strategy, y)
    n_features = len(X.columns)

    # Priors default to values that assume scaled, centered data. If the
    # data is not scaled and no suitable priors are supplied, convergence
    # can struggle or fail. Each beta gets its own normal prior, i.e. the
    # coefficients are treated as "independent" even though they likely
    # are not.
    with pm.Model() as bayes_model:
        intercept = pm.Normal("alpha", self.am, sd=self.asd)
        coefs = pm.Normal("beta", self.bm, sd=self.bsd, shape=n_features)
        noise = pm.HalfCauchy("σ", self.sig)
        expected = intercept + coefs.dot(X.T)
        # the observed node registers itself with the enclosing model
        pm.Normal("score", expected, sd=noise, observed=y)

    self.statistics_ = {"param": bayes_model, "strategy": self.strategy}
    return self
def impute(self, X):
    """Perform imputations using the statistics generated from fit.

    Missing values are NOT replaced by a constant. A location is first
    drawn from a normal centered on the fitted parameter with variance
    ``1 / n_observed``; the missing entries are then filled with a single
    joint draw from a multivariate normal around that location whose
    covariance is ``ones((m, m)) / n_observed + eye(m)``, so the imputed
    values are correlated with one another.

    Args:
        X (pd.Series): Dataset to impute missing data from fit.

    Returns:
        Copy of X with the missing entries replaced by the draws. When X
        has no missing entries the copy is returned unchanged.
    """
    check_is_fitted(self, "statistics_")
    _not_num_series(self.strategy, X)
    observed_mean = self.statistics_["param"]

    missing = X.isnull()
    out = X.copy()

    # vectorised count instead of a python-level loop over the Series
    n_missing = missing.sum()
    if n_missing == 0:
        # nothing to impute; avoids building a zero-dimensional
        # multivariate normal
        return out
    n_observed = (~missing).sum()

    # uncertainty about the location shrinks with the number of observed
    location = stats.norm(observed_mean, np.sqrt(1 / n_observed))

    # imputation cross-terms are *NOT* uncorrelated: every off-diagonal
    # entry carries the shared 1 / n_observed term
    center = np.ones(n_missing) * location.rvs()
    cov = np.ones((n_missing, n_missing)) / n_observed + np.eye(n_missing)
    out[missing] = stats.multivariate_normal(center, cov).rvs()
    return out
def fit(self, X, y):
    """Fit the Imputer to the dataset by fitting the linear model.

    Only the strategy is recorded in ``statistics_``; the fitted state
    lives on ``self.lm``.

    Args:
        X (pd.Dataframe): predictors used to fit the imputer.
        y (pd.Series): response, which is eventually imputed.

    Returns:
        self. Instance of the class.
    """
    _not_num_series(self.strategy, y)
    self.lm.fit(X, y)
    self.statistics_ = dict(strategy=self.strategy)
    return self
def fit(self, X, y=None):
    """Fit the Imputer by computing the median of the dataset.

    Args:
        X (pd.Series): Dataset to fit the imputer.
        y (None): ignored, None to meet requirements of base class

    Returns:
        self. Instance of the class.
    """
    _not_num_series(self.strategy, X)
    self.statistics_ = {"param": X.median(), "strategy": self.strategy}
    return self
def fit(self, X, y=None):
    """Fit the Imputer by computing the mean and standard deviation of X.

    The two values are stored together as a ``(mean, std)`` tuple under
    the ``"param"`` key of ``statistics_``.

    Args:
        X (pd.Series): Dataset to fit the imputer.
        y (None): ignored, None to meet requirements of base class

    Returns:
        self. Instance of the class.
    """
    _not_num_series(self.strategy, X)
    # moments that parameterize the normal distribution of feature X
    center, spread = X.mean(), X.std()
    self.statistics_ = {
        "param": (center, spread),
        "strategy": self.strategy,
    }
    return self
def impute(self, X):
    """Return the imputation value computed during fit.

    The value stored under ``"param"`` in ``statistics_`` is handed back
    as-is after validating that the imputer is fitted and that X passes
    the numeric-series check.

    Args:
        X (pd.Series): Dataset to impute missing data from fit.

    Returns:
        The stored ``"param"`` value from fit.
    """
    # must be fitted, and X must pass validation, before returning
    check_is_fitted(self, "statistics_")
    _not_num_series(self.strategy, X)
    return self.statistics_["param"]
def fit(self, X, y):
    """Fit the Imputer to the dataset by fitting the linear model.

    After fitting, predictions are generated on the same observed data
    and compared with the response to obtain the mean squared error.
    The MSE is stored as the ``"param"`` statistic so that the impute
    phase can build its error distribution from it.

    Args:
        X (pd.Dataframe): predictors used to fit the imputer.
        y (pd.Series): response, which is eventually imputed.

    Returns:
        self. Instance of the class.
    """
    _not_num_series(self.strategy, y)
    self.lm.fit(X, y)
    # in-sample error of the fitted model
    in_sample_mse = mean_squared_error(y, self.lm.predict(X))
    self.statistics_ = {"param": in_sample_mse, "strategy": self.strategy}
    return self
def fit(self, X, y):
    """Fit the Imputer by fitting a least squares and a bayesian model.

    The least squares model supplies in-sample predictions (stored next
    to the observed response for later "closest value" lookups) and,
    when no prior means were given, the prior means of the bayesian
    model.

    Note:
        If ``self.am`` or ``self.bm`` is None, it is overwritten on the
        instance with the least squares intercept / coefficients.

    Args:
        X (pd.Dataframe): predictors used to fit the imputer.
        y (pd.Series): response, which is eventually imputed.

    Returns:
        self. Instance of the class.
    """
    _not_num_series(self.strategy, y)
    n_features = len(X.columns)

    # predictions on the observed data, used for the "closest" values
    fitted_lm = self.lm.fit(X, y)
    observed = DataFrame({"y": y, "y_pred": fitted_lm.predict(X)})

    # Use the least squares point estimates as the prior means for alpha
    # and beta when none were supplied. This greatly speeds up posterior
    # sampling and helps ensure convergence occurs.
    if self.am is None:
        self.am = self.lm.intercept_
    if self.bm is None:
        self.bm = self.lm.coef_

    # Remaining prior defaults assume scaled, centered data; convergence
    # can struggle or fail otherwise unless proper priors are specified.
    # Each beta gets its own normal prior, i.e. the coefficients are
    # treated as "independent" even though they likely are not.
    with pm.Model() as bayes_model:
        intercept = pm.Normal("alpha", self.am, sd=self.asd)
        coefs = pm.Normal("beta", self.bm, sd=self.bsd, shape=n_features)
        noise = pm.HalfCauchy("σ", self.sig)
        expected = intercept + coefs.dot(X.T)
        # the observed node registers itself with the enclosing model
        pm.Normal("score", expected, sd=noise, observed=y)

    self.statistics_ = {
        "param": {"model": bayes_model, "y_obs": observed},
        "strategy": self.strategy,
    }
    return self
def impute(self, X):
    """Draw imputations from the normal distribution estimated in fit.

    A normal distribution is built from the ``(mean, std)`` pair stored
    by fit, and one random value is drawn for every missing entry of X.

    Args:
        X (pd.Series): Dataset to impute missing data from fit.

    Returns:
        np.array -- one random draw per missing value in X.
    """
    check_is_fitted(self, "statistics_")
    _not_num_series(self.strategy, X)

    # number of draws equals the number of missing entries
    n_draws = len(X[X.isnull()].index)

    center, spread = self.statistics_["param"]
    return norm(center, spread).rvs(size=n_draws)