def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    dataset_path = os.path.join(root, self.name)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
    download_file(url_train, dataset_path, file_name_train)
    download_file(url_test, dataset_path, file_name_test)
    file_path_train = os.path.join(dataset_path, file_name_train)
    file_path_test = os.path.join(dataset_path, file_name_test)
    df_train_valid = pd.read_csv(file_path_train, skiprows=20, na_values='na')
    df_test = pd.read_csv(file_path_test, skiprows=20, na_values='na')
    # TODO This is risky business since test and train might be cleaned to
    #  have different columns
    clean_na_(df_train_valid)
    clean_na_(df_test)
    if not (df_train_valid.columns == df_test.columns).all():
        raise Exception('Cleaning led to different sets of columns for train/test')
    y_columns = ['class']
    label_encode_df_([df_train_valid, df_test], y_columns[0])
    df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
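# A possible fix for the TODO above (a sketch, not this loader's current
# behavior): clean train and test as one frame so both necessarily keep the
# same columns, then split back apart by index. This mirrors the
# concat-encode-split pattern used by the Adult loader further down, and
# assumes clean_na_ decides what to drop or fill purely from NaN content.
# Note that cleaning jointly lets test-set NaNs influence the column set.
def _clean_na_jointly(df_train_valid, df_test):
    df_test = df_test.copy()
    df_test.index += len(df_train_valid)  # make the indices disjoint
    df = pd.concat([df_train_valid, df_test])
    clean_na_(df)  # a single cleaning pass yields a single column set
    return df.loc[df_train_valid.index], df.loc[df_test.index]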
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst'
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    file_path_train = os.path.join(dataset_path, file_name_train)
    file_path_test = os.path.join(dataset_path, file_name_test)
    file_name_z = 'train.z'
    fresh_download = download_file(url_train, dataset_path, file_name_z)
    if fresh_download:
        # The training set is distributed as a Unix-compress (.Z) archive
        path_z = os.path.join(dataset_path, file_name_z)
        with open(path_z, 'rb') as f_in, open(file_path_train, 'wb') as f_out:
            f_out.write(unlzw(f_in.read()))
    download_file(url_test, dataset_path, file_name_test)
    df_train_valid = pd.read_csv(file_path_train, header=None, sep=' ')
    y_columns = [9]
    df_train, df_valid = split_classification_df(df_train_valid,
                                                 [1 - validation_size, validation_size],
                                                 y_columns[0])
    df_test = pd.read_csv(file_path_test, header=None, sep=' ')
    label_encode_df_([df_train, df_valid, df_test], y_columns[0])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename_train = 'data_train.csv'
    filename_test = 'data_test.csv'
    file_path_train = os.path.join(dataset_path, filename_train)
    file_path_test = os.path.join(dataset_path, filename_test)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    download_file(url_train, dataset_path, filename_train)
    download_file(url_test, dataset_path, filename_test)
    df_train_valid = pd.read_csv(file_path_train, header=None)
    df_test = pd.read_csv(file_path_test, header=None, skiprows=1)
    # The test file's labels have a trailing period
    df_test[14] = df_test[14].str.rstrip('.')
    # Shift the test index so train and test can be concatenated, encoded
    # together, and split apart again by index
    df_test.index += len(df_train_valid)
    df = pd.concat([df_train_valid, df_test])
    y_columns = df.columns[-1:]
    one_hot_encode_df_(df, skip_columns=y_columns)
    label_encode_df_(df, y_columns[0])
    df_train_valid, df_test = df.loc[df_train_valid.index], df.loc[df_test.index]
    df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]  # Flatten for classification
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00363/Dataset.zip'
    download_unzip(url, dataset_path)
    dataset_path = os.path.join(dataset_path, 'Dataset')
    # The 5th variant has the most data
    train_path = os.path.join(dataset_path, 'Training', 'Features_Variant_5.csv')
    test_path = os.path.join(dataset_path, 'Testing', 'Features_TestSet.csv')
    df_train_valid = pd.read_csv(train_path, header=None)
    df_test = pd.read_csv(test_path, header=None)
    y_columns = df_train_valid.columns[-1:]
    # The page ID is not included as a column, but it can be derived from
    # the page features. A page must not appear in both the training and
    # validation sets.
    page_columns = list(range(29))
    for i, (_, df_group) in enumerate(df_train_valid.groupby(page_columns)):
        df_train_valid.loc[df_group.index, 'page_id'] = i
    df_train, df_valid = split_df_on_column(df_train_valid,
                                            [1 - validation_size, validation_size],
                                            'page_id')
    df_train.drop(columns='page_id', inplace=True)
    df_valid.drop(columns='page_id', inplace=True)
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
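# split_df_on_column is assumed to partition rows so that every page_id
# lands in exactly one split. A minimal sketch of that idea (hypothetical
# helper, not necessarily the implementation used above): shuffle the
# unique group ids, cut them by the requested fractions, and select the
# rows of each split by group membership.
def _split_on_column_sketch(df, fractions, column, seed=0):
    rng = np.random.RandomState(seed)
    ids = df[column].unique()
    rng.shuffle(ids)
    cuts = np.cumsum([int(round(f * len(ids))) for f in fractions[:-1]])
    return [df[df[column].isin(part)] for part in np.split(ids, cuts)]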
def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name = 'blogData_train.csv'
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00304/BlogFeedback.zip'
    download_unzip(url, dataset_path)
    # Concatenate all test CSVs into a single DataFrame
    test_dfs = []
    for fn in os.listdir(dataset_path):
        if 'blogData_test' not in fn:
            continue
        file_path = os.path.join(dataset_path, fn)
        test_dfs.append(pd.read_csv(file_path, header=None))
    df_test = pd.concat(test_dfs)
    file_path = os.path.join(dataset_path, file_name)
    df_train_valid = pd.read_csv(file_path, header=None)
    y_columns = [280]
    # Log-transform the comment-count target; the 0.01 offset keeps zero
    # counts finite
    df_train_valid[y_columns[0]] = np.log(df_train_valid[y_columns[0]] + 0.01)
    df_test[y_columns[0]] = np.log(df_test[y_columns[0]] + 0.01)
    # Derive a page id from the page features so that no page appears in
    # both the training and validation sets
    page_columns = list(range(50))
    for i, (_, df_group) in enumerate(df_train_valid.groupby(page_columns)):
        df_train_valid.loc[df_group.index, 'page_id'] = i
    df_train, df_valid = split_df_on_column(df_train_valid,
                                            [1 - validation_size, validation_size],
                                            'page_id')
    df_train.drop(columns='page_id', inplace=True)
    df_valid.drop(columns='page_id', inplace=True)
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
def test_normalize_regression(self):
    clean_na_(self.df)
    one_hot_encode_df_(self.df)
    normalize_df_(self.df)
    # All columns should have mean = 0
    self.assertAlmostEqual(self.df.mean().mean(), 0, delta=1e-9)
    # Columns with original std > 0 should now have std = 1
    self.assertAlmostEqual(self.df.age.std(), 1, delta=1e-1)
def test_normalize_classification(self):
    clean_na_(self.df)
    label_encode_df_(self.df, 'sex')
    normalize_df_(self.df, skip_column='sex')
    # The label (skipped) column should keep a mean > 0
    self.assertGreater(self.df.sex.mean(), 0)
    # Other columns should have mean = 0
    self.assertAlmostEqual(self.df.age.mean(), 0, delta=1e-6)
    # Columns with original std > 0 should now have std = 1
    self.assertAlmostEqual(self.df.age.std(), 1, delta=1e-6)
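# The two tests above pin down the contract assumed for normalize_df_:
# statistics are computed on df, applied in place to df and to every frame
# in other_dfs, and skip_column is left untouched. A sketch consistent with
# that contract (not necessarily the actual implementation):
def normalize_df_sketch_(df, other_dfs=(), skip_column=None):
    columns = [c for c in df.columns if c != skip_column]
    mean = df[columns].mean()
    std = df[columns].std().replace(0, 1)  # guard against constant columns
    for frame in (df, *other_dfs):
        frame[columns] = (frame[columns] - mean) / std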
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'data.csv'
    file_path = os.path.join(dataset_path, filename)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
    download_file(url, dataset_path, filename)
    df = pd.read_csv(file_path, header=None)
    y_columns = df.columns[-1:]
    one_hot_encode_df_(df)
    # 20% test; the remaining 80% is split into train and validation
    df_test, df_train, df_valid = split_df(df, [0.2,
                                                0.8 - 0.8 * validation_size,
                                                0.8 * validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00206/slice_localization_data.zip'
    download_unzip(url, dataset_path)
    file_name = 'slice_localization_data.csv'
    file_path = os.path.join(dataset_path, file_name)
    df = pd.read_csv(file_path)
    # No patient should be in both the train and test sets; deepcopy
    # because pandas complains when operating on a view
    df_train_valid = deepcopy(df.loc[df.patientId < 80, :])
    df_test = deepcopy(df.loc[df.patientId >= 80, :])
    df_train, df_valid = split_df_on_column(df_train_valid,
                                            [1 - validation_size, validation_size],
                                            'patientId')
    y_columns = ['reference']
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    df_res = df_res.drop(columns='patientId')
    self.x, self.y = xy_split(df_res, y_columns)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip'
    download_unzip(url, dataset_path)
    file_path_train = os.path.join(dataset_path, 'avila', 'avila-tr.txt')
    file_path_test = os.path.join(dataset_path, 'avila', 'avila-ts.txt')
    df_train_valid = pd.read_csv(file_path_train, header=None)
    df_test = pd.read_csv(file_path_test, header=None)
    y_columns = [10]
    # Assumes encoding will be identical for train/test
    label_encode_df_([df_train_valid, df_test], y_columns[0])
    df_train, df_valid = split_classification_df(df_train_valid,
                                                 [1 - validation_size, validation_size],
                                                 y_columns[0])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
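# The 'Assumes encoding will be identical' comment above can be made
# explicit by fitting one encoder on the union of the labels and applying
# it to every frame. A sketch using sklearn's LabelEncoder; whether
# label_encode_df_ already works this way internally is an assumption.
from sklearn.preprocessing import LabelEncoder

def label_encode_jointly_(dfs, column):
    encoder = LabelEncoder()
    encoder.fit(pd.concat([df[column] for df in dfs]))
    for df in dfs:
        df[column] = encoder.transform(df[column])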
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'data.csv'
    file_path = os.path.join(dataset_path, filename)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' \
          'parkinsons/telemonitoring/parkinsons_updrs.data'
    download_file(url, dataset_path, filename)
    df = pd.read_csv(file_path)
    y_columns = ['motor_UPDRS', 'total_UPDRS']
    # No subject should be in both the train and test sets
    df_train_valid = df[df['subject#'] <= 30]
    df_test = deepcopy(df[df['subject#'] > 30])
    df_train, df_valid = split_df_on_column(df_train_valid,
                                            [1 - validation_size, validation_size],
                                            'subject#')
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    df_res.drop(columns='subject#', inplace=True)
    self.x, self.y = xy_split(df_res, y_columns)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst'
    download_file(url_train, dataset_path, file_name_train)
    download_file(url_test, dataset_path, file_name_test)
    file_path_train = os.path.join(dataset_path, file_name_train)
    file_path_test = os.path.join(dataset_path, file_name_test)
    df_train_valid = pd.read_csv(file_path_train, sep=' ', header=None)
    df_test = pd.read_csv(file_path_test, sep=' ', header=None)
    # Shift the test index so the frames can be encoded together and then
    # split apart again by index
    df_test.index += len(df_train_valid)
    df = pd.concat([df_train_valid, df_test])
    y_columns = [36]
    label_encode_df_(df, y_columns[0])
    df_train_valid = df.loc[df_train_valid.index, :]
    df_test = df.loc[df_test.index, :]
    df_train, df_valid = split_classification_df(df_train_valid,
                                                 [1 - validation_size, validation_size],
                                                 y_columns[0])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
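# Hypothetical usage of one of these loaders (the class name SatImage and a
# VALID split constant are assumed; every loader above exposes .x and .y
# once constructed):
#
#     train_set = SatImage(root='datasets', split=TRAIN)
#     valid_set = SatImage(root='datasets', split=VALID)
#     print(train_set.x.shape, train_set.y.shape)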