def test_DataCleaner_sample_na_method(self):
    """Check sample-wise nan handling at fit time versus transform time.

    Two nans are planted in "HOMO_energy" (index labels 40 and 110). The
    cleaner is fit with na_method_fit="drop" and the fitted frame is expected
    to have one row fewer than the fitting slice. The transform slice, which
    holds the nan at label 110, is expected to keep its shape for both
    na_method_transform="fill" and na_method_transform="mean".
    """
    df = self.test_df
    # Assign in a single .loc step. The chained form `df[col].loc[i] = x`
    # can write to a temporary object and never modifies df when pandas
    # Copy-on-Write is active.
    df.loc[40, "HOMO_energy"] = np.nan
    df.loc[110, "HOMO_energy"] = np.nan

    # Label-based slice: includes label 100 itself.
    dffit = df.loc[:100]
    # Dropping the single nan sample removes one row and no columns.
    expected_fit_shape = tuple(np.subtract(dffit.shape, (1, 0)).tolist())

    # Test when transform method is fill
    dc = DataCleaner(
        max_na_frac=0.9,
        feature_na_method="drop",
        na_method_fit="drop",
        na_method_transform="fill",
    )
    fitted = dc.fit_transform(dffit, target=self.target)
    self.assertTupleEqual(fitted.shape, expected_fit_shape)  # minus one sample

    dftrans = df.iloc[100:]
    tranz = dc.transform(dftrans, target=self.target)
    self.assertTupleEqual(tranz.shape, dftrans.shape)

    # Test when transform method is mean
    dc2 = DataCleaner(
        max_na_frac=0.9,
        feature_na_method="drop",
        na_method_fit="drop",
        na_method_transform="mean",
    )
    fitted = dc2.fit_transform(dffit, target=self.target)
    self.assertTupleEqual(fitted.shape, expected_fit_shape)  # minus one sample

    dftrans = df.loc[100:]
    tranz = dc2.transform(dftrans, target=self.target)
    self.assertTupleEqual(tranz.shape, dftrans.shape)

    # The imputed value must equal the mean of the remaining transform rows.
    mean = dftrans.drop(110)["HOMO_energy"].mean()
    self.assertAlmostEqual(tranz["HOMO_energy"].loc[110], mean)
def test_DataCleaner_feature_na_method(self):
    """Check feature-wise nan handling for the drop, fill and mean methods.

    With max_na_frac=0, any feature containing a nan exceeds the allowed
    fraction, so feature_na_method decides what happens to it:

    * "drop": the feature is absent after both fit_transform and transform.
    * "fill": the nan at position 40 ends up equal to the value at
      position 39, and no column is removed.
    * "mean": the nan is replaced by the mean of the non-null values.
    """
    dc = DataCleaner(max_na_frac=0, feature_na_method="drop")
    df = self.test_df

    # Assign in a single .iloc step. The chained form `df[col].iloc[i] = x`
    # can write to a temporary object and never modifies df when pandas
    # Copy-on-Write is active.
    lumo_pos = df.columns.get_loc("LUMO_energy")
    df.iloc[40, lumo_pos] = np.nan
    df.iloc[110, lumo_pos] = np.nan

    # Test normal dropping with transformation
    dffit = df.iloc[:100]
    fitted = dc.fit_transform(dffit, target=self.target)
    self.assertNotIn("LUMO_energy", fitted.columns)

    dftrans = df.iloc[100:]
    tranz = dc.transform(dftrans, target=self.target)
    self.assertNotIn("LUMO_energy", tranz.columns)

    # Test filling
    dc2 = DataCleaner(max_na_frac=0, feature_na_method="fill")
    fitted = dc2.fit_transform(dffit, target=self.target)
    true = fitted["LUMO_energy"].iloc[39]
    filled = fitted["LUMO_energy"].iloc[40]
    self.assertAlmostEqual(true, filled, places=10)
    self.assertTupleEqual((100, 417), fitted.shape)

    # Test mean
    dcmean = DataCleaner(max_na_frac=0, feature_na_method="mean")
    df.iloc[99, df.columns.get_loc("minimum X")] = np.nan
    # Re-slice after the mutation so the fitting frame is guaranteed to hold
    # the new nan, rather than relying on the earlier slice being a live view.
    dffit = df.iloc[:100]
    minimum_x = dffit["minimum X"]
    mean_min_x = minimum_x[~minimum_x.isnull()].mean()
    fitted = dcmean.fit_transform(dffit, target=self.target)
    self.assertAlmostEqual(fitted["minimum X"].iloc[99], mean_min_x, places=10)
def test_DataCleaner_emergency_na_transform_imputation(self):
    """Check imputation of a feature that is entirely nan at transform time.

    For the case where a fit DataCleaner must include feature X, but in the
    df-to-be-transformed that feature is all nan, which makes it unable to be
    imputed correctly. Current implementation dictates this "emergency" be
    resolved by imputing with the mean of feature X from the fitted df.
    """
    dc = DataCleaner()  # should work regardless of default
    df = self.test_df
    fit_df = df.iloc[:150]
    # Take an explicit copy: assigning a column on a bare slice triggers
    # SettingWithCopyWarning and leaves it ambiguous whether df is modified.
    # NOTE(review): row 150 is in neither split — confirm that is intended.
    trs_df = df.iloc[151:].copy()
    trs_df["range X"] = np.nan

    dc.fit(fit_df, self.target)
    trs_df2 = dc.transform(trs_df, self.target)

    # Every imputed value should be the fitted mean, hence zero spread.
    self.assertAlmostEqual(
        trs_df2["range X"].mean(), fit_df["range X"].mean()
    )
    self.assertAlmostEqual(trs_df2["range X"].std(), 0.0)
def test_DataCleaner(self):
    """
    A basic test ensuring DataCleaner can handle numerical features and
    features/targets that may be strings but should be numbers.

    Returns: None
    """
    df = self.test_df
    target = 'gap expt'
    dc = DataCleaner()

    # Test the case of numbers as strings
    df[target] = df[target].astype(str)
    df = dc.fit_transform(df, target)
    self.assertAlmostEqual(df[target].iloc[0], 0.35)

    # All nans below are assigned in a single .iloc step. The chained form
    # `df[col].iloc[i] = x` can write to a temporary object and never
    # modifies df when pandas Copy-on-Write is active. Column positions are
    # looked up on the current df each time because df is rebound after
    # every fit_transform.

    # Test if there is an nan in target
    df.iloc[8, df.columns.get_loc(target)] = np.nan
    df = dc.fit_transform(df, target)
    self.assertEqual(df.shape[0], self.test_df.shape[0] - 1)

    # Test if there is an nan in feature
    df.iloc[40, df.columns.get_loc('HOMO_energy')] = np.nan
    df = dc.fit_transform(df, target)
    self.assertEqual(df.shape[0], self.test_df.shape[0] - 2)

    # Test if nan threshold is exceeded for a feature
    # (all rows except the last two are set to nan)
    df.iloc[:-2, df.columns.get_loc("LUMO_energy")] = np.nan
    df = dc.fit_transform(df, target)
    self.assertEqual(df.shape[1], self.test_df.shape[1] - 1)

    # test transferability
    df2 = self.test_df
    df2 = df2.drop(columns=[target])
    df2 = dc.transform(df2, target)
    self.assertFalse(compare_columns(df, df2, ignore=target)["mismatch"])
    self.assertNotIn(target, df2.columns)