def test_DataCleaner_na_method_feature_sample_interaction(self):
    """Check that feature-level and sample-level nan handling work together.

    The cleaner is configured with ``max_na_frac=0.01``,
    ``feature_na_method="mean"`` and ``"fill"`` for both fit and transform.
    Three columns of the fixture are corrupted and the assertions verify:

    * "maximum X" (all nan) is absent from the output,
    * "range X" (first 100 rows nan) has those rows equal to the mean of the
      remaining values,
    * "minimum X" (single nan at row 40) has row 40 equal to row 39.
    """
    dc = DataCleaner(
        max_na_frac=0.01,
        feature_na_method="mean",
        na_method_transform="fill",
        na_method_fit="fill",
    )
    df = self.test_df

    # Should be dropped
    df["maximum X"] = [np.nan] * len(df)

    # Should be filled via mean
    df["range X"] = [np.nan] * 100 + df["range X"].iloc[100:].tolist()

    # Should be filled by 39
    # Single positional write on the frame itself; the chained form
    # df[col].iloc[i] = ... may write to a temporary Series instead.
    df.iloc[40, df.columns.get_loc("minimum X")] = np.nan

    # Reference value: mean over the non-null part of "range X".
    range_x = df["range X"]
    mean = range_x.loc[~range_x.isnull()].mean()

    df = dc.fit_transform(df, self.target)

    self.assertNotIn("maximum X", df.columns)

    self.assertIn("range X", df.columns)
    for r in df["range X"].iloc[:100]:
        self.assertAlmostEqual(r, mean, places=5)

    self.assertIn("minimum X", df.columns)
    self.assertEqual(df["minimum X"].iloc[40], df["minimum X"].iloc[39])
def test_DataCleaner_feature_na_method(self):
    """Exercise the "drop", "fill" and "mean" values of ``feature_na_method``.

    All three cleaners use ``max_na_frac=0``. The fixture gets nans in
    "LUMO_energy" (rows 40 and 110) and, for the mean case, in "minimum X"
    (row 99).
    """
    df = self.test_df

    # Write the nans directly on the frame; chained df[col].iloc[i] = ...
    # may operate on a temporary Series and leave df untouched.
    lumo_pos = df.columns.get_loc("LUMO_energy")
    df.iloc[40, lumo_pos] = np.nan
    df.iloc[110, lumo_pos] = np.nan

    # Test normal dropping with transformation
    dc = DataCleaner(max_na_frac=0, feature_na_method="drop")
    dffit = df.iloc[:100]
    fitted = dc.fit_transform(dffit, target=self.target)
    self.assertNotIn("LUMO_energy", fitted.columns)

    dftrans = df.iloc[100:]
    tranz = dc.transform(dftrans, target=self.target)
    self.assertNotIn("LUMO_energy", tranz.columns)

    # Test filling
    dc2 = DataCleaner(max_na_frac=0, feature_na_method="fill")
    fitted = dc2.fit_transform(dffit, target=self.target)
    true = fitted["LUMO_energy"].iloc[39]
    filled = fitted["LUMO_energy"].iloc[40]
    self.assertAlmostEqual(true, filled, places=10)
    self.assertTupleEqual((100, 417), fitted.shape)

    # Test mean
    dcmean = DataCleaner(max_na_frac=0, feature_na_method="mean")
    df.iloc[99, df.columns.get_loc("minimum X")] = np.nan
    # Re-slice after the write so the fit frame is guaranteed to contain the
    # new nan, rather than relying on the earlier slice being a view of df.
    dffit = df.iloc[:100]
    minimum_x = dffit["minimum X"]
    mean_min_x = minimum_x[~minimum_x.isnull()].mean()
    fitted = dcmean.fit_transform(dffit, target=self.target)
    self.assertAlmostEqual(fitted["minimum X"].iloc[99], mean_min_x, places=10)
def test_DataCleaner_sample_na_method(self):
    """Check sample-level nan handling with different fit/transform methods.

    "HOMO_energy" gets a nan at labels 40 and 110. Both cleaners use
    ``na_method_fit="drop"``, so fitting is expected to lose one sample,
    while transforming with "fill" or "mean" is expected to keep the shape.
    For "mean", the imputed value at label 110 is compared against the mean
    of the other samples in the transformed frame.
    """
    df = self.test_df

    # Label-based write on the frame itself; chained df[col].loc[i] = ...
    # may write to a temporary Series and leave df unchanged.
    df.loc[40, "HOMO_energy"] = np.nan
    df.loc[110, "HOMO_energy"] = np.nan

    # Label slicing: .loc[:100] includes the row labelled 100.
    dffit = df.loc[:100]
    n_rows, n_cols = dffit.shape
    shape_minus_one_sample = (n_rows - 1, n_cols)

    # Test when transform method is fill
    dc = DataCleaner(
        max_na_frac=0.9,
        feature_na_method="drop",
        na_method_fit="drop",
        na_method_transform="fill",
    )
    fitted = dc.fit_transform(dffit, target=self.target)
    self.assertTupleEqual(fitted.shape, shape_minus_one_sample)

    dftrans = df.iloc[100:]
    tranz = dc.transform(dftrans, target=self.target)
    self.assertTupleEqual(tranz.shape, dftrans.shape)

    # Test when transform method is mean
    dc2 = DataCleaner(
        max_na_frac=0.9,
        feature_na_method="drop",
        na_method_fit="drop",
        na_method_transform="mean",
    )
    fitted = dc2.fit_transform(dffit, target=self.target)
    self.assertTupleEqual(fitted.shape, shape_minus_one_sample)

    dftrans = df.loc[100:]
    tranz = dc2.transform(dftrans, target=self.target)
    self.assertTupleEqual(tranz.shape, dftrans.shape)

    mean = dftrans.drop(110)["HOMO_energy"].mean()
    self.assertAlmostEqual(tranz["HOMO_energy"].loc[110], mean)
def test_DataCleaner_emergency_na_transform_imputation(self):
    """Check imputation when a fitted feature is entirely nan at transform.

    For the case where a fit DataCleaner must include feature X, but in the
    df-to-be-transformed that feature is all nan, which makes it unable to
    be imputed correctly. Current implementation dictates this "emergency"
    be resolved by imputing with the mean of feature X from the fitted df.
    """
    dc = DataCleaner()  # should work regardless of default
    df = self.test_df

    fit_df = df.iloc[:150]
    # NOTE(review): the slice starts at 151, so row 150 is in neither frame.
    # Explicit copy: assigning a column on a bare slice of df triggers
    # pandas' SettingWithCopy machinery.
    trs_df = df.iloc[151:].copy()
    trs_df["range X"] = [np.nan] * trs_df.shape[0]

    dc.fit(fit_df, self.target)
    trs_df2 = dc.transform(trs_df, self.target)

    # Every imputed value must equal the fit-time mean, hence zero spread.
    self.assertAlmostEqual(
        trs_df2["range X"].mean(), fit_df["range X"].mean()
    )
    self.assertAlmostEqual(trs_df2["range X"].std(), 0.0)
def test_DataCleaner_big_nan_handler_warning(self):
    """Check that a warning is recorded when a large share of data is nan.

    Ensure the DataCleaner throws a warning or error when the number of nan
    samples and fraction is high (i.e., something has gone horribly wrong in
    featurization!). A clean random frame must produce no entries in
    ``dc.warnings``; after blanking most of it, exactly one is expected.
    """
    dc = DataCleaner(max_na_frac=0.01)
    df = pd.DataFrame(
        np.random.randint(0, 100, size=(100, 4)), columns=list("ABCD")
    )

    dc.fit(df, "D")
    self.assertEqual(len(dc.warnings), 0)

    # Write nans directly on the frame; chained df[col].iloc[...] = ... may
    # operate on a temporary Series and leave df unchanged.
    df.iloc[10:20, df.columns.get_loc("A")] = np.nan
    df.iloc[:99, df.columns.get_loc("B")] = np.nan

    dc.fit(df, "D")
    self.assertEqual(len(dc.warnings), 1)
def test_DataCleaner(self):
    """Basic end-to-end test of DataCleaner.

    Ensures DataCleaner can handle numerical features and features/targets
    that may be strings but should be numbers, drops samples with a nan in
    the target or in a feature, drops a feature that is almost entirely
    nan, and that a fitted cleaner transforms a frame lacking the target
    into the same feature columns.
    """
    df = self.test_df
    target = "gap expt"
    dc = DataCleaner()

    # Test the case of numbers as strings
    df[target] = df[target].astype(str)
    df = dc.fit_transform(df, target)
    self.assertAlmostEqual(df[target].iloc[0], 0.35)

    # The nan writes below index the frame directly; chained
    # df[col].iloc[i] = ... may write to a temporary Series instead.

    # Test if there is an nan in target
    df.iloc[8, df.columns.get_loc(target)] = np.nan
    df = dc.fit_transform(df, target)
    self.assertEqual(df.shape[0], self.test_df.shape[0] - 1)

    # Test if there is an nan in feature
    df.iloc[40, df.columns.get_loc("HOMO_energy")] = np.nan
    df = dc.fit_transform(df, target)
    self.assertEqual(df.shape[0], self.test_df.shape[0] - 2)

    # Test if nan threshold is exceeded for a feature
    # (every row but the last two becomes nan)
    df.iloc[:-2, df.columns.get_loc("LUMO_energy")] = np.nan
    df = dc.fit_transform(df, target)
    self.assertEqual(df.shape[1], self.test_df.shape[1] - 1)

    # test transferability
    df2 = self.test_df.drop(columns=[target])
    df2 = dc.transform(df2, target)
    self.assertFalse(compare_columns(df, df2, ignore=target)["mismatch"])
    self.assertNotIn(target, df2.columns)