コード例 #1
0
    def test_clean_multi_category(self):
        """Exercise ``pb.clean`` with multi-category column groups supplied.

        Reads the train/test CSV fixtures, calls ``pb.clean`` with ``0.5`` as
        the fourth positional argument and a list of column-name groups as
        the fifth, then checks row counts, column counts, the target name,
        that the input frame's columns are unchanged, and that the output
        columns match the stored expected CSVs.
        """
        train_path = 'tests/train.csv'
        raw_train = pd.read_csv(train_path)
        raw_test = pd.read_csv('tests/test.csv')
        target_name = 'target'
        label_groups = [
            ['multi_label1', 'multi_label2', 'multi_label3'],
            ['multi_label_independed'],
        ]

        outputs = pb.clean(
            raw_train, raw_test, target_name, 0.5, label_groups)
        train_out, target_out, test_out = outputs

        # Dropping NaN rows from the cleaned test frame must leave as many
        # rows as the raw test fixture has.
        rows_after_dropna = len(test_out.dropna())
        self.assertEqual(rows_after_dropna, len(raw_test))
        # Features and target must line up row for row.
        self.assertEqual(len(train_out), len(target_out))
        # Train and test outputs must have the same number of columns.
        self.assertEqual(len(train_out.columns), len(test_out.columns))
        self.assertEqual(target_out.name, "y1:" + target_name)

        # The frame passed in must still have the columns found on disk.
        reloaded_train = pd.read_csv(train_path)
        self.assertListEqual(
            reloaded_train.columns.to_list(), raw_train.columns.to_list())

        # To refresh the expected fixtures, temporarily enable:
        # train_out.to_csv(
        #     'tests/expected_multi_category_train.csv', index=False)
        # test_out.to_csv(
        #     'tests/expected_multi_category_test.csv', index=False)

        expected_train = pd.read_csv(
            'tests/expected_multi_category_train.csv')
        expected_test = pd.read_csv(
            'tests/expected_multi_category_test.csv')
        comparisons = (
            (train_out, expected_train),
            (test_out, expected_test),
        )
        for actual, expected in comparisons:
            self.assertListEqual(
                actual.columns.to_list(), expected.columns.to_list())
コード例 #2
0
    def test_clean(self):
        """Exercise ``pb.clean`` on the default train/test CSV fixtures.

        Calls ``pb.clean`` with ``0.5`` as the fourth positional argument and
        checks row counts, column counts, the target name, that the input
        frame's columns are unchanged, and that the output columns match the
        stored expected CSVs.
        """
        train_path = 'tests/train.csv'
        raw_train = pd.read_csv(train_path)
        raw_test = pd.read_csv('tests/test.csv')
        target_name = 'target'

        outputs = pb.clean(raw_train, raw_test, target_name, 0.5)
        train_out, target_out, test_out = outputs

        # Dropping NaN rows from the cleaned test frame must leave as many
        # rows as the raw test fixture has.
        rows_after_dropna = len(test_out.dropna())
        self.assertEqual(rows_after_dropna, len(raw_test))
        # Features and target must line up row for row.
        self.assertEqual(len(train_out), len(target_out))
        # Train and test outputs must have the same number of columns.
        self.assertEqual(len(train_out.columns), len(test_out.columns))
        self.assertEqual(target_out.name, "y1:" + target_name)

        # The frame passed in must still have the columns found on disk.
        reloaded_train = pd.read_csv(train_path)
        self.assertListEqual(
            reloaded_train.columns.to_list(), raw_train.columns.to_list())

        # To refresh the expected fixtures, temporarily enable:
        # train_out.to_csv('tests/expected_train.csv', index=False)
        # test_out.to_csv('tests/expected_test.csv', index=False)

        expected_train = pd.read_csv('tests/expected_train.csv')
        expected_test = pd.read_csv('tests/expected_test.csv')
        comparisons = (
            (train_out, expected_train),
            (test_out, expected_test),
        )
        for actual, expected in comparisons:
            self.assertListEqual(
                actual.columns.to_list(), expected.columns.to_list())
コード例 #3
0
 def test_clean_optuna_regressor(self):
     """Check ``pb.clean`` output invariants on the regression train fixture.

     Calls ``pb.clean`` with only the train frame, test frame and target
     column name, then checks row counts, column counts and the target name.
     """
     raw_train = pd.read_csv('tests/train_regression.csv')
     raw_test = pd.read_csv('tests/test.csv')
     target_name = 'target'

     outputs = pb.clean(raw_train, raw_test, target_name)
     train_out, target_out, test_out = outputs

     # Dropping NaN rows from the cleaned test frame must leave as many
     # rows as the raw test fixture has.
     rows_after_dropna = len(test_out.dropna())
     self.assertEqual(rows_after_dropna, len(raw_test))
     # Features and target must line up row for row.
     self.assertEqual(len(train_out), len(target_out))
     # Train and test outputs must have the same number of columns.
     self.assertEqual(len(train_out.columns), len(test_out.columns))
     self.assertEqual(target_out.name, "y1:" + target_name)
コード例 #4
0
 def test_clean(self):
     """Check ``pb.clean`` output invariants on the train/test CSV fixtures.

     Calls ``pb.clean`` with ``0.5`` as the fourth positional argument,
     prints the three returned objects, and checks row counts, column counts
     and the name of the first target column (the target is accessed through
     ``.columns`` here).
     """
     raw_train = pd.read_csv('tests/train.csv')
     raw_test = pd.read_csv('tests/test.csv')
     target_name = 'target'

     outputs = pb.clean(raw_train, raw_test, target_name, 0.5)
     train_out, target_out, test_out = outputs

     # Dump the results in the order: train, test, target.
     for result in (train_out, test_out, target_out):
         print(result)

     # Dropping NaN rows from the cleaned test frame must leave as many
     # rows as the raw test fixture has.
     rows_after_dropna = len(test_out.dropna())
     self.assertEqual(rows_after_dropna, len(raw_test))
     # Features and target must line up row for row.
     self.assertEqual(len(train_out), len(target_out))
     # Train and test outputs must have the same number of columns.
     self.assertEqual(len(train_out.columns), len(test_out.columns))
     first_target_column = target_out.columns[0]
     self.assertEqual(first_target_column, "y1:" + target_name)