def test_wrong_categorical_type():
    """Check that a supported strategy on an improper column type raises.

    The "categorical" strategy is fitted against ``dfs.df_num`` and the
    call is expected to fail with a ``TypeError``.
    """
    imputer = SingleImputer(strategy="categorical")
    # only the fit/transform call is expected to raise, so the imputer is
    # built outside the context manager
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_num)
def test_bad_strategy():
    """Check that an unsupported strategy name results in a ValueError."""
    with pytest.raises(ValueError):
        # construction and fitting both live inside the context, so the
        # test passes whichever of the two steps raises the error
        SingleImputer(strategy="not_a_strategy").fit_transform(dfs.df_num)
def test_wrong_numerical_type():
    """Check that a supported strategy on an improper column type raises.

    The "mean" strategy is requested for the ``cats`` column of
    ``dfs.df_ts_mixed`` and the call is expected to fail with a
    ``TypeError``.
    """
    column_strategies = {"cats": "mean"}
    imputer = SingleImputer(strategy=column_strategies)
    with pytest.raises(TypeError):
        imputer.fit_transform(dfs.df_ts_mixed)
def test_partial_dependence_imputer():
    """Check that the partial dependence edge case is handled.

    The test passes as long as fitting the "stochastic" strategy on
    ``dfs.df_partial_dependence`` completes without raising.
    """
    SingleImputer(strategy="stochastic").fit_transform(
        dfs.df_partial_dependence
    )
def test_single_missing_column():
    """Check that fitting ``dfs.df_col_miss`` raises a ValueError.

    A default ``SingleImputer`` is used; the frame is presumably one with
    a fully missing column (per its name) -- confirm against the fixture.
    """
    with pytest.raises(ValueError):
        # the imputer is created inside the context, so an error from
        # either construction or fitting satisfies the test
        SingleImputer().fit_transform(dfs.df_col_miss)
def test_normal_unit_variance_imputer():
    """Check the "normal unit variance" strategy on column ``y``.

    The test passes as long as fitting and transforming
    ``dfs.df_bayes_reg`` completes without raising.
    """
    column_strategies = {"y": "normal unit variance"}
    imputer = SingleImputer(strategy=column_strategies)
    imputer.fit_transform(dfs.df_bayes_reg)
""" Imputation (1st Round): Univariate Imputation - Imputation method: Quadratic spline interpolation - Impute selected k-features """ strategy = "interpolate" fill_strategy = "cubic" dict_strategy = dict() dict_imp_kwgs = dict() for i in idx_selected: dict_strategy.update({df.columns[i]: strategy}) dict_imp_kwgs.update({df.columns[i]: {'fill_strategy': fill_strategy}}) imp_x = SingleImputer(strategy=dict_strategy, imp_kwgs=dict_imp_kwgs) df_imputed = imp_x.fit_transform(df) plt.plot(df_imputed[df.columns[idx_selected[0]]], label='Imputed') plt.plot(df[df.columns[idx_selected[0]]], label='Actual') train_ratio = 0.8 split_idx = int(len(df) * train_ratio) x_train = df[:split_idx].values[1:] y_train = df[:split_idx].values[0] x_test = df[split_idx:].values[1:] y_test = df[split_idx:].values[0]
import os

from autoimpute.imputations import SingleImputer, MultipleImputer

# Finish the current figure: one tick per feature column, labelled with
# `indices`, with one unit of padding on both sides of the x axis.
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

# For each training id, print the row positions at which that id occurs in
# the gzip-compressed CSV found under `text`.
for ids in train['id']:
    print(ids)
    for file in os.listdir(text):
        if file.endswith(".gz"):
            print(file)
            # os.path.join keeps the path valid whether or not `text` ends
            # with a path separator (plain concatenation did not).
            data = pd.read_csv(os.path.join(text, file), compression='gzip')
            print(data.index[data['id'] == ids].tolist())
            # NOTE(review): the original indentation of this `break` was
            # lost; it is kept in the innermost block, i.e. only the first
            # .gz file is inspected per id -- confirm the intended nesting.
            break

si = SingleImputer()  # imputation methods, passing through the data once
mi = MultipleImputer()  # imputation methods, passing through the data multiple times

# Impute the feature matrices (identifier/target columns removed first).
# `MICE` is expected to be imported elsewhere in the file.
X = train.drop(['id', 'target'], axis=1)
X = MICE().fit_transform(X)
X_test1 = test.drop(['id'], axis=1)
# Impute the frame built on the previous line; the earlier code passed
# `X_test` here, which silently discarded the `X_test1` just created.
X_test1 = MICE().fit_transform(X_test1)


def scorer(true, pred):
    """Return exp(-RMSE) for the given targets and predictions.

    Args:
        true: ground-truth target values.
        pred: predicted target values.

    Returns:
        float: ``exp(-sqrt(mean_squared_error(pred, true)))``. Because the
        root of the error is non-negative, the score lies in (0, 1] and is
        1.0 when the error is zero; larger is better.
    """
    error = math.sqrt(mean_squared_error(pred, true))
    return math.exp(-error)


score = make_scorer(scorer, greater_is_better=True)

# Reset X to the raw (un-imputed) training features.
X = train.drop(['id', 'target'], axis=1)
def plot_imp_scatter(d, x, y, strategy, color=None,
                     title="Jointplot after Imputation", h=8.27,
                     imp_kwgs=None, a=0.5, marginals=None,
                     obs_color="navy", imp_color="red", **plot_kwgs):
    """Draw a joint scatter/density plot of two features after imputation.

    A SingleImputer is fitted on `d` with the requested strategy, and the
    imputed data is shown as a seaborn jointplot of `x` against `y`.
    Points listed in the fitted imputer's `imputed_` entry for the chosen
    `color` column are drawn in `imp_color`; all other points are drawn in
    `obs_color`. Useful for inspecting how an imputation method behaves.

    Args:
        d (pd.DataFrame): DataFrame with data to impute and plot.
        x (str): column to plot on the x axis.
        y (str): column to plot on the y axis.
        strategy (str): imputation method for SingleImputer.
        color (str, Optional): column whose imputations are highlighted.
            Default is None, which highlights `y`. Must equal `x` or `y`.
        title (str, Optional): title of the plot.
            Default is "Jointplot after Imputation".
        h (float, Optional): height of the jointplot. Default is 8.27.
        imp_kwgs (dict, Optional): imp kwgs for the SingleImputer.
            Default is None, in which case none are passed.
        a (float, Optional): alpha for plot color. Default is 0.5.
        marginals (dict, Optional): dictionary of marginal plot args.
            Default is None, which becomes dict(rug=True, kde=True).
        obs_color (str, Optional): color of observed points.
            Default is navy.
        imp_color (str, Optional): color of imputed points.
            Default is red.
        **plot_kwgs: keyword arguments passed to the plot-default setup
            (used by sns.set).

    Raises:
        ValueError: x and y must be names of columns in data.
        ValueError: color must be the same as `y` or `x`.
    """
    # apply plot defaults, then sanity-check the dict-style arguments
    _default_plot_args(**plot_kwgs)
    _validate_kwgs(marginals)
    _validate_kwgs(imp_kwgs)
    if marginals is None:
        marginals = {"rug": True, "kde": True}

    # both axes have to name existing columns
    if any(axis not in d.columns for axis in (x, y)):
        raise ValueError("x and y must be names of columns in data")

    # only hand imp_kwgs to the imputer when the caller supplied them
    imputer_args = {"strategy": strategy}
    if imp_kwgs is not None:
        imputer_args["imp_kwgs"] = imp_kwgs
    imputer = SingleImputer(**imputer_args)

    # the highlighted column defaults to y and may otherwise only be x or y
    if color is None:
        color = y
    elif color != y and color != x:
        raise ValueError("color must be the same as `y` or `x`")

    # impute, then tag every row as observed before recoloring imputed rows
    imputed = imputer.fit_transform(d)
    imputed["colors"] = obs_color
    imputed.loc[imputer.imputed_[color], "colors"] = imp_color
    point_colors = imputed["colors"]

    # joint plot with per-point face and edge colors
    grid = sns.jointplot(
        x=x, y=y, data=imputed, alpha=a, height=h,
        joint_kws={"facecolor": point_colors, "edgecolor": point_colors},
        marginal_kws=marginals,
    )

    # leave room at the top of the figure for the title
    plt.subplots_adjust(top=0.925)
    grid.fig.suptitle(title)
np.array -- imputed dataset. """ # check if fitted then impute with mean check_is_fitted(self, "statistics_") _not_num_series(self.strategy, X) omu = self.statistics_["param"] # mean of observed data idx = X.isnull() # missing data nO = sum(~idx) # number of observed m = sum(idx) # number to impute muhatk = stats.norm(omu, np.sqrt(1 / nO)) # imputation cross-terms *NOT* uncorrelated Ymi = stats.multivariate_normal( np.ones(m) * muhatk.rvs(), np.ones((m, m)) / nO + np.eye(m)).rvs() out = X.copy() out[idx] = Ymi return out def fit_impute(self, X, y=None): """Convenience method to perform fit and imputation in one go.""" return self.fit(X, y).impute(X) if __name__ == '__main__': from autoimpute.imputations import SingleImputer si = SingleImputer('normal unit variance') Yo = stats.norm(0, 1).rvs(100) df = pd.DataFrame(columns=['Yo'], index=range(200), dtype=float) df.loc[range(100), 'Yo'] = Yo si.fit_transform(df)