def test_clean_multi_category(self):
    """Exercise ``pb.clean`` with column-name groups as the fifth argument.

    Two groups are passed: the three ``multi_label*`` columns together and
    ``multi_label_independed`` on its own.  The test checks the row/column
    bookkeeping of the returned objects, the name of the returned target,
    that the input frame still has the columns found on disk, and that the
    resulting column names match the stored expected CSV fixtures.
    """
    train_path = 'tests/train.csv'
    raw_train = pd.read_csv(train_path)
    raw_test = pd.read_csv('tests/test.csv')
    label = 'target'
    category_groups = [
        ['multi_label1', 'multi_label2', 'multi_label3'],
        ['multi_label_independed'],
    ]

    outputs = pb.clean(raw_train, raw_test, label, 0.5, category_groups)
    train_out, y, test_out = outputs

    # Dropping NaN rows from the cleaned test frame must leave as many rows
    # as the raw test frame has.
    nan_free_rows = test_out.dropna()
    self.assertEqual(len(nan_free_rows), len(raw_test))

    # Features and target stay aligned; train and test share a column count.
    self.assertEqual(len(train_out), len(y))
    self.assertEqual(len(train_out.columns), len(test_out.columns))

    expected_name = "y1:" + label
    self.assertEqual(y.name, expected_name)

    # The frame handed to pb.clean must still carry the on-disk columns.
    columns_on_disk = pd.read_csv(train_path).columns.to_list()
    self.assertListEqual(columns_on_disk, raw_train.columns.to_list())

    # To refresh the fixtures, temporarily write train_out / test_out to the
    # two paths below with ``to_csv(path, index=False)``.
    expected_train = pd.read_csv(
        'tests/expected_multi_category_train.csv')
    expected_test = pd.read_csv(
        'tests/expected_multi_category_test.csv')
    for actual, expected in ((train_out, expected_train),
                             (test_out, expected_test)):
        self.assertListEqual(actual.columns.to_list(),
                             expected.columns.to_list())
def test_clean(self):
    """Exercise ``pb.clean`` on the default train/test fixtures.

    Verifies the row/column bookkeeping of the returned objects, the name
    of the returned target, that the input frame still has the columns
    found on disk, and that the resulting column names match the stored
    expected CSV fixtures.
    """
    train_path = 'tests/train.csv'
    raw_train = pd.read_csv(train_path)
    raw_test = pd.read_csv('tests/test.csv')
    label = 'target'

    train_out, y, test_out = pb.clean(raw_train, raw_test, label, 0.5)

    # Dropping NaN rows from the cleaned test frame must leave as many rows
    # as the raw test frame has.
    remaining_rows = len(test_out.dropna())
    self.assertEqual(remaining_rows, len(raw_test))

    # Features and target stay aligned; train and test share a column count.
    self.assertEqual(len(train_out), len(y))
    train_width = len(train_out.columns)
    test_width = len(test_out.columns)
    self.assertEqual(train_width, test_width)

    self.assertEqual(y.name, "y1:" + label)

    # The frame handed to pb.clean must still carry the on-disk columns.
    fresh_train = pd.read_csv(train_path)
    self.assertListEqual(fresh_train.columns.to_list(),
                         raw_train.columns.to_list())

    # To refresh the fixtures, temporarily write train_out / test_out to the
    # two paths below with ``to_csv(path, index=False)``.
    wanted_train = pd.read_csv('tests/expected_train.csv')
    wanted_test = pd.read_csv('tests/expected_test.csv')

    produced_train_cols = train_out.columns.to_list()
    self.assertListEqual(produced_train_cols,
                         wanted_train.columns.to_list())
    produced_test_cols = test_out.columns.to_list()
    self.assertListEqual(produced_test_cols,
                         wanted_test.columns.to_list())
def test_clean_optuna_regressor(self):
    """Exercise ``pb.clean`` on the regression training fixture.

    Only three positional arguments are passed, so any further parameters
    of ``pb.clean`` take their defaults.  The test checks the row/column
    bookkeeping of the returned objects and the name of the returned
    target.
    """
    raw_train = pd.read_csv('tests/train_regression.csv')
    raw_test = pd.read_csv('tests/test.csv')
    label = 'target'

    cleaned = pb.clean(raw_train, raw_test, label)
    train_out, y, test_out = cleaned

    # Dropping NaN rows from the cleaned test frame must leave as many rows
    # as the raw test frame has.
    nan_free_rows = test_out.dropna()
    self.assertEqual(len(nan_free_rows), len(raw_test))

    # Features and target stay aligned; train and test share a column count.
    self.assertEqual(len(train_out), len(y))
    self.assertEqual(len(train_out.columns), len(test_out.columns))

    expected_name = "y1:" + label
    self.assertEqual(y.name, expected_name)
def test_clean(self):
    """Exercise ``pb.clean`` on the default train/test fixtures.

    Checks that the cleaned test frame has as many NaN-free rows as the raw
    test frame has rows, that the cleaned training frame and the target
    have the same length, that train and test share a column count, and
    that the first target column is named ``"y1:" + target_col``.
    """
    train_df = pd.read_csv('tests/train.csv')
    test_df = pd.read_csv('tests/test.csv')
    target_col = 'target'

    cleaned_train_df, target_df, cleaned_test_df = pb.clean(
        train_df, test_df, target_col, 0.5)

    # Debug prints of the full frames were removed: they only added noise
    # to the test output and asserted nothing.
    self.assertEqual(len(cleaned_test_df.dropna()), len(test_df))
    self.assertEqual(len(cleaned_train_df), len(target_df))
    self.assertEqual(len(cleaned_train_df.columns),
                     len(cleaned_test_df.columns))

    # NOTE(review): this assumes the second return value is a DataFrame
    # (it reads ``.columns``); confirm against the current return type of
    # pb.clean.
    self.assertEqual(target_df.columns[0], "y1:" + target_col)