def test_NumImputer_is_picklable():
    """Check that a fitted NumImputer survives a pickle round-trip.

    The imputer is fitted on a sample frame in which three ``float_col``
    values were replaced by NaN, then serialized and restored. The restored
    object must have the same class and produce the same transform output
    as the original one.
    """
    df = get_sample_df(100, seed=123)
    df.loc[[2, 10, 50], "float_col"] = np.nan

    imputer = NumImputer()
    imputer.fit_transform(df)  # only the fitted state matters here

    unpickled_imputer = pickle.loads(pickle.dumps(imputer))

    # Exact class identity is intended, so compare the classes with `is`.
    assert type(unpickled_imputer) is type(imputer)

    X1 = imputer.transform(df)
    X2 = unpickled_imputer.transform(df)

    assert X1.shape == X2.shape
    assert (X1 == X2).all().all()
def test_NumImputer_output_type():
    """Check that NumImputer keeps the dtype of ``float_col`` unchanged.

    Four scenarios are exercised, in this order: float64 and float32 without
    missing values, then float64 and float32 with a NaN at row label 0.
    In each one the dtype of ``float_col`` in the transformed output must
    equal the dtype of that column in the input frame.
    """
    base = get_sample_df(100, seed=123)

    for with_nan in (False, True):
        for dtype in ("float64", "float32"):
            df = base.copy()
            # `astype` returns a new Series; it must be assigned back,
            # otherwise the cast is silently discarded.
            df["float_col"] = df["float_col"].astype(dtype)
            if with_nan:
                df.loc[0, "float_col"] = np.nan

            imp = NumImputer()
            Xenc = imp.fit_transform(df)

            assert Xenc.dtypes["float_col"] == df.dtypes["float_col"], (
                "dtype changed for %s (with_nan=%s)" % (dtype, with_nan)
            )
def test__NumImputer():
    """Exercise _NumImputer and NumImputer on DataFrame, numpy and sparse input.

    First part (``get_sample_data(add_na=True)``): the output must contain no
    null, keep the input container type and row count, and (when
    ``add_is_null`` is on) gain one ``*_isnull`` indicator column for
    column 1. The assertions expect that indicator to be 1 at row
    positions 0 and 5 and 0 on every other row.

    Second part (``get_sample_data(add_na=False)``): no indicator column is
    expected and the output shape must match the input shape.

    Also checks that an imputer built with ``allow_unseen_null=False`` raises
    ``ValueError`` when transform meets a null in a column that had none
    during fit.
    """
    # Row positions expected to carry a 0 in the is-null indicator column.
    rows_without_na = np.array([1, 2, 3, 4, 6, 7, 8, 9])
    # Non-trivial index (with a duplicated label) to check index preservation.
    custom_index = np.array([0, 1, 2, 3, 4, 10, 11, 12, 12, 14])

    ###################
    ### With NaN    ###
    ###################
    xx, xxd, xxs = get_sample_data(add_na=True)
    xxd.index = custom_index

    # DataFrame entry
    for inp in (
        _NumImputer(),
        NumImputer(),
        _NumImputer(add_is_null=False),
        NumImputer(add_is_null=False),
    ):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert pd.isnull(xxd.loc[0, "col1"])  # Verify that it is still null
        assert xx_out.isnull().sum().sum() == 0
        assert xx_out["col1"][0] == xxd.loc[~xxd["col1"].isnull(), "col1"].mean()

        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)

        if inp.add_is_null:
            assert inp.get_feature_names() == [
                "col0", "col1", "col2", "col3", "col4", "col5", "col6",
                "col1_isnull",
            ]
            assert xx_out.shape[1] == 1 + xxd.shape[1]
            assert xx_out["col1_isnull"].iloc[0] == 1
            assert xx_out["col1_isnull"].iloc[5] == 1
            assert (xx_out["col1_isnull"].iloc[rows_without_na] == 0).all()
        else:
            assert xx_out.shape[1] == xxd.shape[1]
            assert inp.get_feature_names() == [
                "col0", "col1", "col2", "col3", "col4", "col5", "col6",
            ]

    # A null appearing at transform time in a column that had none at fit
    # time must be rejected when allow_unseen_null is False.
    inp = _NumImputer(add_is_null=False, allow_unseen_null=False)
    inp.fit(xxd)
    xxd2 = xxd.copy()
    xxd2.iloc[0, 3] = np.nan
    try:
        inp.transform(xxd2)
    except ValueError:
        pass
    else:
        raise AssertionError("Model should have fail its transformation")

    input_features = ["COL_%d" % i for i in range(xx.shape[1])]

    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx[0, 1])
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == 1 + xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == [
            "0", "1", "2", "3", "4", "5", "6", "1_isnull",
        ]
        assert inp.get_feature_names(input_features) == input_features + [
            "COL_1_isnull"
        ]
        assert xx_out[0, 7] == 1
        assert xx_out[5, 7] == 1
        assert (xx_out[rows_without_na, 7] == 0).all()

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in (sps.coo_matrix, sps.csc_matrix, sps.csr_matrix):
            xxsf = f(xxs.copy())
            xx_out = inp.fit_transform(xxsf)
            dense_out = xx_out.todense()

            assert pd.isnull(xxs[0, 1])
            assert pd.isnull(dense_out).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == 1 + xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == [
                "0", "1", "2", "3", "4", "5", "6", "1_isnull",
            ]
            assert inp.get_feature_names(input_features) == input_features + [
                "COL_1_isnull"
            ]
            assert dense_out[0, 7] == 1
            # Row 5 is the second flagged row, mirroring the numpy checks
            # above (the original asserted row 0 twice).
            assert dense_out[5, 7] == 1
            assert (dense_out[rows_without_na, 7] == 0).all()

    ###################
    ### Without NaN ###
    ###################
    xx, xxd, xxs = get_sample_data(add_na=False)
    xxd.index = custom_index

    # DataFrame entry
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xxd)
        assert (xx_out.index == xxd.index).all()
        assert xx_out.isnull().sum().sum() == 0
        assert xx_out.shape[1] == xxd.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xxd)
        assert inp.get_feature_names() == [
            "col0", "col1", "col2", "col3", "col4", "col5", "col6",
        ]

    # Numpy array
    for inp in (_NumImputer(), NumImputer()):
        xx_out = inp.fit_transform(xx)
        assert pd.isnull(xx_out).sum() == 0
        assert xx_out.shape[1] == xx.shape[1]
        assert xx_out.shape[0] == xx.shape[0]
        assert get_type(xx_out) == get_type(xx)
        assert inp.get_feature_names() == ["0", "1", "2", "3", "4", "5", "6"]
        assert inp.get_feature_names(input_features=input_features) == input_features

    # Sparse Array
    for inp in (_NumImputer(), NumImputer()):
        for f in (sps.coo_matrix, sps.csc_matrix, sps.csr_matrix):
            xxs_f = f(xxs.copy())
            xx_out = inp.fit_transform(xxs_f)
            assert pd.isnull(xx_out.todense()).sum() == 0
            assert get_type(xx_out) == get_type(xxs)
            assert xx_out.shape[1] == xxs.shape[1]
            assert xx_out.shape[0] == xx.shape[0]
            assert inp.get_feature_names() == ["0", "1", "2", "3", "4", "5", "6"]
            assert (
                inp.get_feature_names(input_features=input_features) == input_features
            )