def _fit_model(self, model_type, regressor, X, y):
    """Private method to fit a model using sklearn or statsmodels.

    Predictors are one-hot encoded before fitting and the encoded column
    names are stored on ``self.new_X_columns``. For logistic models, the
    response is converted to integer category codes and the categories are
    stored on ``self._response_categories``.

    Args:
        model_type (str): kind of model being fit. ``"logistic"`` triggers
            encoding of the response variable.
        regressor (callable): model class to instantiate. Called as
            ``regressor(y, X, **kwgs)`` for statsmodels and as
            ``regressor(**kwgs)`` for sklearn.
        X (pd.DataFrame): predictors. We enforce that predictors were
            imputed in the imputation phase.
        y (pd.Series): response variable.

    Returns:
        The object returned by ``model.fit()`` in the statsmodels case, or
        the fitted estimator itself in the sklearn case.

    Raises:
        ValueError: ``self.model_lib`` is not "statsmodels" or "sklearn".
    """
    # fail fast on an unknown library, before any state on self is touched;
    # otherwise neither branch below would run and no model would exist
    supported_libs = ("statsmodels", "sklearn")
    if self.model_lib not in supported_libs:
        err = (
            f"model_lib must be one of {supported_libs}, "
            f"got {self.model_lib!r}."
        )
        raise ValueError(err)

    # encoding for predictor variable
    X = _one_hot_encode(X)
    self.new_X_columns = X.columns.tolist()

    # encoding for response variable
    if model_type == "logistic":
        ycat = y.astype("category").cat
        y = ycat.codes
        self._response_categories = ycat.categories

    # missing or empty kwargs are treated as "no extra arguments"
    kwgs = self.model_kwgs or {}

    # statsmodels fit case: data goes to the constructor, fit() returns
    # the object we hand back
    if self.model_lib == "statsmodels":
        X = add_constant(X)
        return regressor(y, X, **kwgs).fit()

    # sklearn fit case: data goes to fit(), the estimator itself is returned
    model = regressor(**kwgs)
    model.fit(X, y)
    return model
def _predict_strategy_validator(self, instance, X):
    """Private method to validate before prediction.

    Checks, in order, that ``instance`` has been fitted, that every column
    seen during fit is present in ``X``, and that ``X`` holds no missing
    values. The predictors are then one-hot encoded.

    Args:
        instance: fitted object exposing ``statistics_`` and
            ``fit_X_columns``.
        X (pd.DataFrame): predictors to validate.

    Returns:
        The result of one-hot encoding ``X``.

    Raises:
        ValueError: a column used in fit is absent from ``X``.
        ValueError: ``X`` contains missing values.
    """
    # the model must be fitted before anything else is inspected
    check_is_fitted(instance, "statistics_")

    # every column used during fit has to be available for prediction
    absent = set(instance.fit_X_columns) - set(X.columns.tolist())
    if absent:
        raise ValueError("Same columns that were fit must appear in predict.")

    # we enforce that predictors were imputed in imputation phase
    if X.isnull().any().any():
        raise ValueError(
            "Data passed to make predictions can't contain missingness."
        )

    # encoding for predictor variable
    return _one_hot_encode(X)
def fit(self, X, y=None, imp_ixs=None):
    """Fit specified imputation methods to each column within a DataFrame.

    The fit method calculates the `statistics` necessary to later transform
    a dataset (i.e. perform actual imputations). Inductive methods calculate
    statistic on the fit data, then impute new missing data with that value.
    Most currently supported methods are inductive.

    It's important to note that we have to fit X regardless of whether any
    data is missing. Transform step may have missing data if new data is
    used, so fit each column that appears in the given strategies.

    Args:
        X (pd.DataFrame): pandas DataFrame on which imputer is fit.
        y (pd.Series, pd.DataFrame Optional): response. Default is None.
            Determined internally in fit method. Arg is present to remain
            compatible with sklearn Pipelines.
        imp_ixs (dict): Dictionary of lists of indices that indicate which
            data elements to impute per column or None to identify from
            missing elements per column. When given, the listed elements
            are treated as missing while fitting predictive strategies;
            `X` itself is not modified.

    Returns:
        self: instance of the SingleImputer class.

    Raises:
        ValueError: error in specification of strategies. Raised through
            `check_strategy_fit`. See its docstrings for more info.
        ValueError: error in specification of predictors. Raised through
            `check_predictors_fit`. See its docstrings for more info.
        ValueError: arguments supplied for an imputer are not accepted by
            its `__init__` method.
    """
    # first, prep columns we plan to use and make sure they are valid
    self._fit_strategy_validator(X)
    self.statistics_ = {}

    # perform fit on each column, depending on that column's strategy
    # note that right now, operations are COLUMN-by-COLUMN, iteratively
    if self.seed is not None:
        np.random.seed(self.seed)
    for column, method in self._strats.items():
        imp = self.strategies[method]
        imp_params = self._fit_init_params(column, method, self.imp_kwgs)

        # try to create an instance of the imputer, given the args
        try:
            imputer = imp() if imp_params is None else imp(**imp_params)
        except TypeError as te:
            name = imp.__name__
            err = f"Invalid arguments passed to {name} __init__ method."
            raise ValueError(err) from te

        # identify the column for imputation
        ys = X[column]

        # the fit depends on what type of strategy we use.
        # first, fit univariate methods, which are straightforward.
        if method in self.univariate_strategies:
            imputer.fit(ys, None)

        # now, fit on predictive methods, which are more complex.
        if method in self.predictive_strategies:
            preds = self._preds[column]
            if preds == "all":
                xs = X.drop(column, axis=1)
            else:
                xs = X[preds]

            if imp_ixs is not None:
                # mask a private copy: assigning through the Series taken
                # from X could write NaN into the caller's DataFrame (and
                # into the predictors of columns fit later in this loop)
                ys = ys.copy()
                ys.loc[imp_ixs[column]] = np.nan

            # fit the data on observed values only.
            x_, y_ = _get_observed(xs, ys)

            # before imputing, need to encode categoricals
            x_ = _one_hot_encode(x_)

            imputer.fit(x_, y_)

        # finally, store imputer for each column as statistics
        self.statistics_[column] = imputer
    return self
def transform(self, X, imp_ixs=None):
    """Impute each column within a DataFrame using fit imputation methods.

    The transform step performs the actual imputations. Each column that
    has a fitted imputer in `statistics_` is imputed with that imputer. The
    indices imputed per column are recorded in `imputed_`.

    Args:
        X (pd.DataFrame): DataFrame to impute (same as fit or new data).
        imp_ixs (dict): Dictionary of lists of indices that indicate which
            data elements to impute per column or None to identify from
            missing elements per column

    Returns:
        X (pd.DataFrame): imputed in place or copy of original.

    Raises:
        ValueError: same columns must appear in fit and transform.
            Raised through _transform_strategy_validator.
    """
    # work on a copy when requested, then validate against what was fit
    if self.copy:
        X = X.copy()
    self._transform_strategy_validator(X)

    self.imputed_ = {}
    if self.seed is not None:
        np.random.seed(self.seed)

    for column, imputer in self.statistics_.items():
        # explicit indices win; otherwise impute wherever the column is null
        if imp_ixs is not None:
            imp_ix = pd.Index(imp_ixs[column])
        else:
            target = X[column]
            imp_ix = target[target.isnull()].index
        self.imputed_[column] = imp_ix.tolist()

        # nothing to impute for this column
        if imp_ix.empty:
            continue

        strategy = imputer.strategy

        # univariate imputers only need the column itself
        if strategy in self.univariate_strategies:
            x_ = X[column]

        # predictive imputers need the covariates of the rows to impute
        if strategy in self.predictive_strategies:
            preds = self._preds[column]
            x_ = X.drop(column, axis=1) if preds == "all" else X[preds]

            # keep only the rows being imputed, always as a DataFrame
            if isinstance(x_, pd.Series):
                x_ = x_.to_frame().loc[imp_ix]
            else:
                x_ = x_.loc[imp_ix, :]

            # covariates that are themselves missing get a default
            # univariate imputation; the loop is a no-op when none are
            null_counts = pd.isnull(x_).sum()
            null_counts = null_counts[null_counts > 0]
            for col in null_counts.index:
                d = DefaultUnivarImputer()
                if null_counts[col] == x_.shape[0]:
                    # every row is missing, so fall back to 0
                    fill = 0
                else:
                    fill = d.fit_impute(x_[col], None)
                null_rows = x_[col][x_[col].isnull()].index
                x_.loc[null_rows, col] = fill

            # categoricals must be encoded again before predicting
            x_ = _one_hot_encode(x_)

        # write the imputations back into the frame
        X.loc[imp_ix, column] = imputer.impute(x_)
    return X
def transform(self, X):
    """Impute each column within a DataFrame using fit imputation methods.

    The transform step performs the actual imputations. Each column that
    has a fitted imputer in `statistics_` is imputed with that imputer
    wherever the column is null. The indices imputed per column are
    recorded in `imputed_`. Progress is printed when `self.verbose` is set.

    Args:
        X (pd.DataFrame): DataFrame to impute (same as fit or new data).

    Returns:
        X (pd.DataFrame): imputed in place or copy of original.

    Raises:
        ValueError: same columns must appear in fit and transform.
            Raised through _transform_strategy_validator.
    """
    # copy the dataset if necessary, then prep predictors
    if self.copy:
        X = X.copy()
    self._transform_strategy_validator(X)
    if self.verbose:
        trans = "PERFORMING IMPUTATIONS ON DATA BASED ON FIT..."
        print(f"{trans}\n{'-'*len(trans)}")

    # transformation logic
    self.imputed_ = {}
    if self.seed is not None:
        np.random.seed(self.seed)
    for column, imputer in self.statistics_.items():
        imp_ix = X[column][X[column].isnull()].index
        self.imputed_[column] = imp_ix.tolist()

        # print to console for transformation if self.verbose
        if self.verbose:
            strat = imputer.statistics_["strategy"]
            print(f"Transforming {column} with strategy '{strat}'")
            if not imp_ix.empty:
                print(f"Number of imputations to perform: {imp_ix.size}")
            else:
                print("No imputations, moving to next column...")

        # continue if there are no imputations to make
        if imp_ix.empty:
            continue

        # implement transform logic for univariate
        if imputer.strategy in self.univariate_strategies:
            x_ = X[column]

        # implement transform logic for predictive
        if imputer.strategy in self.predictive_strategies:
            preds = self._preds[column]
            if preds == "all":
                x_ = X.drop(column, axis=1)
            else:
                x_ = X[preds]

            # isolate missingness
            if isinstance(x_, pd.Series):
                x_ = x_.to_frame()
                x_ = x_.loc[imp_ix]
            else:
                x_ = x_.loc[imp_ix, :]

            # default univariate impute for missing covariates
            mis_cov = pd.isnull(x_).sum()
            mis_cov = mis_cov[mis_cov > 0]
            if not mis_cov.empty:
                if self.verbose:
                    print(f"Missing Covariates:\n{mis_cov}\n")
                    print("Using single imputer for missing covariates...")
                for col in mis_cov.index:
                    d = DefaultUnivarImputer()
                    if mis_cov[col] == x_.shape[0]:
                        # no observed value to learn from in these rows;
                        # fall back to 0, as the imp_ixs-aware transform
                        # does
                        d_imps = 0
                    else:
                        d_imps = d.fit_impute(x_[col], None)
                    x_null = x_[col][x_[col].isnull()].index
                    x_.loc[x_null, col] = d_imps

            # handling encoding again for prediction of imputations
            x_ = _one_hot_encode(x_)

        # perform imputation given the specified imputer and value for x_
        X.loc[imp_ix, column] = imputer.impute(x_)
    return X