Example #1
    def test_split_df(self):
        import pandas as pd
        from ucimlr.helpers import split_df
        df = pd.DataFrame({
            'val1': range(10),
        })
        df5, df3, df2 = split_df(df, [0.5, 0.3, 0.2])
        self.assertEqual(len(df5), 5)
        self.assertEqual(len(df3), 3)
        self.assertEqual(len(df2), 2)

        df0, df10 = split_df(df, [0.0, 1.0])
        self.assertEqual(len(df0), 0)
        self.assertEqual(len(df10), 10)

        df10, df0 = split_df(df, [1.0, 0.0])
        self.assertEqual(len(df0), 0)
        self.assertEqual(len(df10), 10)

        def not_summing_to_one():
            _, _ = split_df(df, [0.1, 0.2])

        self.assertRaises(ValueError, not_summing_to_one)

        # Check that repeated calls produce identical splits
        df5_, df3_, df2_ = split_df(df, [0.5, 0.3, 0.2])
        self.assertTrue((df5_ == df5).all().bool())
        self.assertTrue((df3_ == df3).all().bool())
        self.assertTrue((df2_ == df2).all().bool())
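
For reference, here is a minimal sketch of a split_df-like helper that satisfies the contract these tests pin down: the fractions must sum to one (otherwise ValueError), the resulting lengths follow the fractions, and repeated calls return identical splits. This is a hypothetical illustration, not the actual ucimlr.helpers implementation:

import numpy as np
import pandas as pd

def split_df_sketch(df, fractions):
    # Reject fraction lists that do not sum to one, as the tests expect
    if not np.isclose(sum(fractions), 1.0):
        raise ValueError('Fractions must sum to one.')
    # A fixed seed makes repeated calls deterministic
    shuffled = df.sample(frac=1, random_state=0)
    # Cumulative fraction boundaries, rounded to integer row offsets
    bounds = (np.cumsum([0.0] + list(fractions)) * len(df)).round().astype(int)
    return [shuffled.iloc[a:b] for a, b in zip(bounds[:-1], bounds[1:])]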
Example #2
    def test_split_df_deterministic(self):
        import pandas as pd
        from ucimlr.helpers import split_df
        df = pd.DataFrame({'a': list(range(10))})
        df1, _, _ = split_df(df, [0.2, 0.4, 0.4])
        df2, _, _ = split_df(df, [0.2, 0.7, 0.1])
        df3, _, _ = split_df(df, [0.2, 0.1, 0.7])
        self.assertTrue((df1 == df2).all().bool())
        self.assertTrue((df2 == df3).all().bool())
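
Note that only the first fraction is held fixed across the three calls above: the test asserts that the first returned split depends only on its own fraction and a fixed ordering, not on how the remaining rows are divided.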
Example #3
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        file_name_train = 'train.csv'
        file_name_test = 'test.csv'
        dataset_path = os.path.join(root, self.name)
        url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
        url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
        download_file(url_train, dataset_path, file_name_train)
        download_file(url_test, dataset_path, file_name_test)
        file_path_train = os.path.join(dataset_path, file_name_train)
        file_path_test = os.path.join(dataset_path, file_name_test)
        df_train_valid = pd.read_csv(file_path_train, skiprows=20, na_values='na')
        df_test = pd.read_csv(file_path_test, skiprows=20, na_values='na')

        # TODO This is risky business since test and train might be cleaned to have different columns
        clean_na_(df_train_valid)
        clean_na_(df_test)
        if not (df_train_valid.columns == df_test.columns).all():
            raise Exception('Cleaning led to different sets of columns for train/test')

        y_columns = ['class']
        label_encode_df_([df_train_valid, df_test], y_columns[0])
        df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
        normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
        self.y = self.y[:, 0]  # Flatten for classification
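
The constructor above relies on the get_split helper to pick out the partition requested by the caller. A plausible sketch, assuming TRAIN, VALIDATION and TEST are module-level constants (assumed names and values, not the verified ucimlr helper):

TRAIN, VALIDATION, TEST = 'train', 'validation', 'test'  # assumed constants

def get_split(df_train, df_valid, df_test, split):
    # Return the dataframe matching the requested split (hypothetical sketch)
    return {TRAIN: df_train, VALIDATION: df_valid, TEST: df_test}[split]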
Example #4
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        dataset_path = os.path.join(root, self.name)
        filename_train = 'data_train.csv'
        filename_test = 'data_test.csv'
        file_path_train = os.path.join(dataset_path, filename_train)
        file_path_test = os.path.join(dataset_path, filename_test)
        url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
        url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
        download_file(url_train, dataset_path, filename_train)
        download_file(url_test, dataset_path, filename_test)

        df_train_valid = pd.read_csv(file_path_train, header=None)
        df_test = pd.read_csv(file_path_test, header=None, skiprows=1)  # first line is a comment

        # Labels in the test file carry a trailing period ('>50K.' vs '>50K'); strip it
        df_test[14] = df_test[14].str.rstrip('.')

        # Offset the test index so train and test rows stay distinguishable after concat
        df_test.index += len(df_train_valid)
        df = pd.concat([df_train_valid, df_test])
        y_columns = df.columns[-1:]
        one_hot_encode_df_(df, skip_columns=y_columns)
        label_encode_df_(df, y_columns[0])
        df_train_valid, df_test = (df.loc[df_train_valid.index], df.loc[df_test.index])
        df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
        normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
        self.y = self.y[:, 0]  # Flatten for classification
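
For context, a dataset class built around an __init__ like the ones above would typically be used as follows; the class name Adult and the import path are assumptions for illustration, not verified ucimlr API:

# Hypothetical usage; module path and class name are assumed
from ucimlr.classification_datasets import Adult

dataset = Adult('datasets')      # split defaults to TRAIN
x, y = dataset.x, dataset.y      # feature matrix and flattened integer labels
print(x.shape, y.shape)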
Example #5
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        dataset_path = os.path.join(root, self.name)
        filename = 'data.csv'
        file_path = os.path.join(dataset_path, filename)
        url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
        download_file(url, dataset_path, filename)
        df = pd.read_csv(file_path, header=None)
        y_columns = df.columns[-1:]
        one_hot_encode_df_(df)
        df_test, df_train, df_valid = split_df(df, [0.2, 0.8 - 0.8 * validation_size, 0.8 * validation_size])
        normalize_df_(df_train, other_dfs=[df_valid, df_test])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
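
With the default validation_size=0.2, the fractions passed to split_df here work out to [0.2, 0.8 - 0.16, 0.16] = [0.2, 0.64, 0.16]: a fixed 20% test split, with the remaining 80% divided between training (64%) and validation (16%), summing to one as split_df requires.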