def test_split_df(self):
    from ucimlr.helpers import split_df
    df = pd.DataFrame({
        'val1': range(10),
    })
    df5, df3, df2 = split_df(df, [0.5, 0.3, 0.2])
    self.assertEqual(len(df5), 5)
    self.assertEqual(len(df3), 3)
    self.assertEqual(len(df2), 2)

    df0, df10 = split_df(df, [0.0, 1.0])
    self.assertEqual(len(df0), 0)
    self.assertEqual(len(df10), 10)
    df10, df0 = split_df(df, [1.0, 0.0])
    self.assertEqual(len(df0), 0)
    self.assertEqual(len(df10), 10)

    def not_summing_to_one():
        _, _ = split_df(df, [0.1, 0.2])
    self.assertRaises(ValueError, not_summing_to_one)

    # Check deterministic
    df5_, df3_, df2_ = split_df(df, [0.5, 0.3, 0.2])
    self.assertTrue((df5_ == df5).all().bool())
    self.assertTrue((df3_ == df3).all().bool())
    self.assertTrue((df2_ == df2).all().bool())
def test_split_df_deterministic(self):
    from ucimlr.helpers import split_df
    df = pd.DataFrame({'a': list(range(10))})
    # The first partition should depend only on its own fraction,
    # not on how the remainder is divided between the other two.
    df1, _, _ = split_df(df, [0.2, 0.4, 0.4])
    df2, _, _ = split_df(df, [0.2, 0.7, 0.1])
    df3, _, _ = split_df(df, [0.2, 0.1, 0.7])
    self.assertTrue((df1 == df2).all().bool())
    self.assertTrue((df2 == df3).all().bool())
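# For reference, a minimal sketch of what ucimlr.helpers.split_df could look
# like, consistent with the two tests above. This is an illustrative
# assumption, not the library's actual implementation: fractions must sum to
# one (otherwise ValueError), and rows are assigned through a fixed-seed
# permutation, so the first partition depends only on its own fraction.
import numpy as np
import pandas as pd


def split_df_sketch(df, fractions):
    if not np.isclose(sum(fractions), 1.0):
        raise ValueError('Fractions must sum to 1.')
    # Fixed seed makes repeated calls return identical partitions
    order = np.random.RandomState(seed=0).permutation(len(df))
    dfs, start = [], 0
    for fraction in fractions:
        stop = start + round(fraction * len(df))
        dfs.append(df.iloc[order[start:stop]])
        start = stop
    return dfs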
def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    dataset_path = os.path.join(root, self.name)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
    download_file(url_train, dataset_path, file_name_train)
    download_file(url_test, dataset_path, file_name_test)
    file_path_train = os.path.join(dataset_path, file_name_train)
    file_path_test = os.path.join(dataset_path, file_name_test)
    df_train_valid = pd.read_csv(file_path_train, skiprows=20, na_values='na')
    df_test = pd.read_csv(file_path_test, skiprows=20, na_values='na')

    # TODO This is risky business since test and train might be cleaned to have different columns
    clean_na_(df_train_valid)
    clean_na_(df_test)
    if not (df_train_valid.columns == df_test.columns).all():
        raise Exception('Cleaning led to different sets of columns for train/test')

    y_columns = ['class']
    label_encode_df_([df_train_valid, df_test], y_columns[0])
    df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]  # Flatten for classification
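# Hypothetical sketches of the clean_na_ and label_encode_df_ helpers used
# above, written to match their call sites; the real implementations may
# differ. A drop-based NaN policy is exactly why the TODO above warns that
# train and test can end up with different column sets.
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def clean_na_(df):
    # Drop columns that are entirely NaN, impute the rest with column means
    df.dropna(axis=1, how='all', inplace=True)
    df.fillna(df.mean(numeric_only=True), inplace=True)


def label_encode_df_(dfs, y_column):
    # Accepts a single DataFrame or a list; fit one encoder across all of
    # them so train and test share the same label mapping
    if isinstance(dfs, pd.DataFrame):
        dfs = [dfs]
    encoder = LabelEncoder()
    encoder.fit(pd.concat([df[y_column] for df in dfs]))
    for df in dfs:
        df[y_column] = encoder.transform(df[y_column])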
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename_train = 'data_train.csv'
    filename_test = 'data_test.csv'
    file_path_train = os.path.join(dataset_path, filename_train)
    file_path_test = os.path.join(dataset_path, filename_test)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    download_file(url_train, dataset_path, filename_train)
    download_file(url_test, dataset_path, filename_test)
    df_train_valid = pd.read_csv(file_path_train, header=None, skiprows=0)
    df_test = pd.read_csv(file_path_test, header=None, skiprows=1)

    # Trailing period in test file
    df_test[14] = df_test[14].str.rstrip('.')

    # Encode train and test jointly so both get identical dummy columns,
    # then split them apart again by index
    df_test.index += len(df_train_valid)
    df = pd.concat([df_train_valid, df_test])
    y_columns = df.columns[-1:]
    one_hot_encode_df_(df, skip_columns=y_columns)
    label_encode_df_(df, y_columns[0])
    df_train_valid, df_test = (df.loc[df_train_valid.index],
                               df.loc[df_test.index])

    df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]  # Flatten for classification
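# A plausible sketch of one_hot_encode_df_ as used above: dummy-encode the
# object-typed columns in place, leaving skip_columns (here the label column)
# untouched. Assumed behavior only; shown because the Adult constructor
# relies on train and test being encoded together to get identical columns.
import pandas as pd


def one_hot_encode_df_(df, skip_columns=()):
    for column in list(df.columns):
        if column in skip_columns or df[column].dtype != object:
            continue
        dummies = pd.get_dummies(df[column], prefix=str(column))
        df.drop(columns=column, inplace=True)
        df[dummies.columns] = dummies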
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'data.csv'
    file_path = os.path.join(dataset_path, filename)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
    download_file(url, dataset_path, filename)
    df = pd.read_csv(file_path, header=None)
    y_columns = df.columns[-1:]
    one_hot_encode_df_(df)
    df_test, df_train, df_valid = split_df(df, [0.2, 0.8 - 0.8 * validation_size, 0.8 * validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
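# Sketches of the remaining shared helpers (normalize_df_, get_split,
# xy_split) consistent with how every constructor above uses them. The
# TRAIN/VALIDATION/TEST constants and the z-score policy are assumptions,
# not the library's confirmed implementation.
TRAIN, VALIDATION, TEST = 'train', 'validation', 'test'


def normalize_df_(df_train, other_dfs=(), skip_column=None):
    # Fit mean/std on the training split only, then apply to all splits,
    # so no validation or test statistics leak into the features
    columns = [c for c in df_train.columns if c != skip_column]
    mean, std = df_train[columns].mean(), df_train[columns].std()
    for df in (df_train, *other_dfs):
        df[columns] = (df[columns] - mean) / std


def get_split(df_train, df_valid, df_test, split):
    return {TRAIN: df_train, VALIDATION: df_valid, TEST: df_test}[split]


def xy_split(df, y_columns):
    # Returns numpy arrays; y stays 2D, hence the self.y[:, 0] flattening
    return df.drop(columns=y_columns).values, df[y_columns].values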