def test_FeatureReducer(self):
    """End-to-end check of the FeatureReducer pipeline.

    Covers: basic reduction with defaults, removal/retention metadata,
    a corr+rebate combination, a fractional ``n_rebate_features``, and
    transferability of a fitted reducer via ``transform``.
    """
    target = "gap expt"
    fr = FeatureReducer()
    # ultra-basic case: are we reducing at least 1 feature?
    # Fit on a copy so a mutating fit_transform cannot pollute the
    # shared fixture that the comparison below reads.
    df = fr.fit_transform(self.test_df.copy(), target)
    self.assertTrue(df.shape[1] < self.test_df.shape[1])
    # ensure metadata is being written correctly
    self.assertTrue(target not in fr.retained_features)
    self.assertTrue(len(list(fr.removed_features.keys())) == 2)
    # ensure other combinations of feature reducers are working
    fr = FeatureReducer(reducers=('corr', 'rebate'), n_rebate_features=40)
    df = fr.fit_transform(self.test_df, target)
    self.assertEqual(df.shape[1], 41)  # 40 features + target
    self.assertTrue(target in df.columns)
    # ensure the same thing works when fraction is used
    fr = FeatureReducer(reducers=('rebate', ), n_rebate_features=0.2)
    df = fr.fit_transform(self.test_df, target)
    self.assertEqual(df.shape[1], 83 + 1)
    # test transferability
    # BUG FIX: df2 previously aliased self.test_df directly, so
    # transform() could mutate the shared fixture; use an independent
    # copy (matching test_FeatureReducer_transferability).
    df2 = self.test_df.copy()
    df2 = fr.transform(df2, target)
    self.assertListEqual(df.columns.tolist(), df2.columns.tolist())
def test_FeatureReducer_classification(self):
    """Correlation reduction runs without error on a classification target."""
    fr = FeatureReducer(reducers=("corr", ))
    # BUG FIX: the original aliased self.test_df and then overwrote the
    # target column in place, polluting the shared fixture for every
    # subsequently-run test; operate on a copy instead.
    df_class = self.test_df.copy()
    # Binarize the (numeric) gap target into two class labels.
    df_class[self.target] = [
        "semiconductor" if 0.0 < g < 3.0 else "nonmetal"
        for g in df_class[self.target]
    ]
    df_class = fr.fit_transform(df_class, self.target)
    # Row count is preserved by the reduction.
    self.assertEqual(df_class.shape[0], 200)
def test_FeatureReducer_transferability(self):
    """A fitted reducer applied to a fresh frame yields identical columns."""
    # Fractional n_rebate_features selects 20% of the feature columns.
    reducer = FeatureReducer(reducers=("rebate", ), n_rebate_features=0.2)
    fitted = reducer.fit_transform(self.test_df, self.target)
    self.assertEqual(fitted.shape[1], 83 + 1)
    # Transform an independent copy and check column agreement.
    fresh = deepcopy(self.test_df)
    transformed = reducer.transform(fresh, self.target)
    self.assertListEqual(fitted.columns.tolist(), transformed.columns.tolist())
def test_FeatureReducer_basic(self):
    """corr + tree reduction drops columns and records its metadata."""
    reducer = FeatureReducer(reducers=("corr", "tree"))
    reduced = reducer.fit_transform(self.test_df, self.target)
    # At least one feature must have been removed.
    self.assertTrue(reduced.shape[1] < self.test_df.shape[1])
    # Metadata: the target is never "retained", and each of the two
    # reducers contributes an entry to removed_features.
    self.assertTrue(self.target not in reducer.retained_features)
    self.assertTrue(len(list(reducer.removed_features.keys())) == 2)
def test_manual_feature_reduction(self):
    """Manually listed features are dropped; unknown names only warn."""
    feature = "LUMO_element_Th"
    reducer = FeatureReducer(reducers=[], remove_features=[feature])
    # ultra-basic case: are we reducing at least 1 feature?
    reduced = reducer.fit_transform(self.test_df, self.target)
    self.assertTrue(feature not in reduced.columns)
    self.assertEqual(reducer.removed_features["manual"], [feature])
    # test removing feature that doesn't exist
    bogus = "abcdefg12345!!"
    reducer = FeatureReducer(reducers=[], remove_features=[bogus])
    with self.assertLogs(AMM_LOGGER_BASENAME, level="INFO") as captured:
        # should give log warning but not throw an error
        reducer.fit_transform(self.test_df, self.target)
    self.assertTrue(bogus in " ".join(captured.output))
def test_FeatureReducer_pca(self):
    # NOTE(review): this method is shadowed by an identical, later
    # definition of test_FeatureReducer_pca in the same class (the two
    # differ only in quote style), so this copy is never executed by the
    # test runner. One of the duplicates should be removed.
    # Case where n_samples < n_features
    df = self.test_df.iloc[:10]
    fr = FeatureReducer(reducers=("pca", ), n_pca_features='auto')
    df_reduced = fr.fit_transform(df, self.target)
    self.assertTupleEqual(df_reduced.shape, (10, 11))
    # Case where n_samples > n_features
    fsubset = [
        'HOMO_energy', 'LUMO_energy', 'gap_AO', 'minimum X', 'maximum X',
        'range X', 'mean X', 'std_dev X', 'minimum row', 'maximum row',
        'range row', 'mean row', 'std_dev row', 'minimum group',
        'maximum group', 'range group', 'mean group', 'std_dev group'
    ]
    df = self.test_df[fsubset + [self.target]]
    df_reduced = fr.fit_transform(df, self.target)
    self.assertEqual(df_reduced.shape[0], 200)
    # Manually specified case of n_samples > n_features
    fr = FeatureReducer(reducers=('pca', ), n_pca_features=0.5)
    df = self.test_df
    df_reduced = fr.fit_transform(df, self.target)
    self.assertTupleEqual(df_reduced.shape, (200, 201))
def test_FeatureReducer_pca(self):
    """Exercise the PCA reducer in three regimes."""
    reducer = FeatureReducer(reducers=("pca", ), n_pca_features="auto")
    # Regime 1: fewer samples than features.
    small = self.test_df.iloc[:10]
    reduced = reducer.fit_transform(small, self.target)
    self.assertTupleEqual(reduced.shape, (10, 11))
    # Regime 2: more samples than features (18 features + target).
    columns = [
        "HOMO_energy", "LUMO_energy", "gap_AO",
        "minimum X", "maximum X", "range X", "mean X", "std_dev X",
        "minimum row", "maximum row", "range row", "mean row",
        "std_dev row",
        "minimum group", "maximum group", "range group", "mean group",
        "std_dev group",
    ]
    narrow = self.test_df[columns + [self.target]]
    reduced = reducer.fit_transform(narrow, self.target)
    self.assertEqual(reduced.shape[0], 200)
    # Regime 3: explicit fractional component count, n_samples > n_features.
    reducer = FeatureReducer(reducers=("pca", ), n_pca_features=0.5)
    full = self.test_df
    reduced = reducer.fit_transform(full, self.target)
    self.assertTupleEqual(reduced.shape, (200, 201))
def test_saving_feature_from_removal(self):
    """A feature listed in keep_features survives correlation reduction."""
    protected = "maximum X"
    reducer = FeatureReducer(reducers=("corr", ), keep_features=[protected])
    reduced = reducer.fit_transform(self.test_df, self.target)
    self.assertTrue(protected in reduced.columns)
def test_FeatureReducer_combinations(self):
    """Chaining pca + rebate + tree drops at least one column."""
    frame = self.test_df
    reducer = FeatureReducer(reducers=("pca", "rebate", "tree"))
    reduced = reducer.fit_transform(frame, self.target)
    self.assertGreater(frame.shape[1], reduced.shape[1])
def test_FeatureReducer_advanced(self):
    """corr + rebate keeps exactly n_rebate_features columns plus the target."""
    reducer = FeatureReducer(reducers=("corr", "rebate"), n_rebate_features=40)
    reduced = reducer.fit_transform(self.test_df, self.target)
    self.assertEqual(reduced.shape[1], 41)  # 40 features + the target
    self.assertTrue(self.target in reduced.columns)