def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    dataset_path = os.path.join(root, self.name)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
    download_file(url_train, dataset_path, file_name_train)
    download_file(url_test, dataset_path, file_name_test)
    file_path_train = os.path.join(dataset_path, file_name_train)
    file_path_test = os.path.join(dataset_path, file_name_test)
    # Both files carry a 20-line preamble before the header; 'na' marks missing values
    df_train_valid = pd.read_csv(file_path_train, skiprows=20, na_values='na')
    df_test = pd.read_csv(file_path_test, skiprows=20, na_values='na')
    # TODO This is risky business since test and train might be cleaned to have different columns
    clean_na_(df_train_valid)
    clean_na_(df_test)
    if not (df_train_valid.columns == df_test.columns).all():
        raise Exception('Cleaning led to different sets of columns for train/test')
    y_columns = ['class']
    label_encode_df_([df_train_valid, df_test], y_columns[0])
    df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
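# A minimal sketch of what `clean_na_` is assumed to do above: drop columns that
# are entirely NaN and mean-impute the remaining gaps, mutating the frame in place
# (the trailing underscore marks the in-place helpers in this file). The name and
# body below are illustrative assumptions, not the actual implementation.
def clean_na_sketch_(df):
    df.dropna(axis=1, how='all', inplace=True)           # Drop all-NaN columns
    df.fillna(df.mean(numeric_only=True), inplace=True)  # Mean-impute the rest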
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst'
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    file_path_train = os.path.join(dataset_path, file_name_train)
    file_path_test = os.path.join(dataset_path, file_name_test)
    file_name_z = 'train.z'
    fresh_download = download_file(url_train, dataset_path, file_name_z)
    if fresh_download:
        # The training set ships LZW-compressed (.Z); decompress it to a plain file
        path_z = os.path.join(dataset_path, file_name_z)
        with open(path_z, 'rb') as f_in, open(file_path_train, 'wb') as f_out:
            f_out.write(unlzw(f_in.read()))
    download_file(url_test, dataset_path, file_name_test)
    df_train_valid = pd.read_csv(file_path_train, header=None, sep=' ')
    y_columns = [9]
    df_train, df_valid = split_classification_df(df_train_valid,
                                                 [1 - validation_size, validation_size], y_columns[0])
    df_test = pd.read_csv(file_path_test, header=None, sep=' ')
    label_encode_df_([df_train, df_valid, df_test], y_columns[0])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
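# The `download_file` contract inferred from its use here: fetch `url` into
# `path/file_name` only when the file is missing, and return True on a fresh
# download (which is what gates the .Z decompression above). A hypothetical
# minimal version, not the real helper:
import os
import urllib.request

def download_file_sketch(url, path, file_name):
    os.makedirs(path, exist_ok=True)
    file_path = os.path.join(path, file_name)
    if os.path.exists(file_path):
        return False  # Cached from an earlier run; skip the download
    urllib.request.urlretrieve(url, file_path)
    return True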
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename_train = 'data_train.csv'
    filename_test = 'data_test.csv'
    file_path_train = os.path.join(dataset_path, filename_train)
    file_path_test = os.path.join(dataset_path, filename_test)
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
    download_file(url_train, dataset_path, filename_train)
    download_file(url_test, dataset_path, filename_test)
    df_train_valid = pd.read_csv(file_path_train, header=None)
    df_test = pd.read_csv(file_path_test, header=None, skiprows=1)  # First line is a comment
    # The test labels carry a trailing period ('<=50K.') that the train labels lack
    df_test[14] = df_test[14].str.rstrip('.')
    df_test.index += len(df_train_valid)
    # Encode train and test together so both end up with identical one-hot columns
    df = pd.concat([df_train_valid, df_test])
    y_columns = df.columns[-1:]
    one_hot_encode_df_(df, skip_columns=y_columns)
    label_encode_df_(df, y_columns[0])
    df_train_valid, df_test = df.loc[df_train_valid.index], df.loc[df_test.index]
    df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]  # Flatten for classification
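# Sketches of the assumed in-place encoding helpers. `one_hot_encode_df_`
# presumably expands every non-numeric column (minus `skip_columns`) into
# indicator columns, and `label_encode_df_` maps a label column to integer ids;
# the real `label_encode_df_` also accepts a list of frames (see the APS and
# shuttle constructors above) so all splits share one mapping. Illustrative
# stand-ins only:
import pandas as pd

def one_hot_encode_df_sketch(df, skip_columns=()):
    for col in list(df.columns):
        if col in skip_columns or df[col].dtype != object:
            continue
        dummies = pd.get_dummies(df[col], prefix=str(col))
        df.drop(columns=col, inplace=True)
        df[dummies.columns] = dummies

def label_encode_df_sketch(df, y_column):
    df[y_column] = pd.factorize(df[y_column])[0]  # 0..K-1 by first appearance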
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'data.csv'
    file_path = os.path.join(dataset_path, filename)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
    download_file(url, dataset_path, filename)
    df = pd.read_csv(file_path, sep=';')
    y_columns = ['quality']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
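# `split_normalize_sequence` condenses the tail shared by this and the following
# constructors. The sketch below mirrors the explicit pipeline written out in the
# abalone constructor further down (20% test, then train/valid): split, normalize
# on the training statistics, select the requested split, and separate features
# from targets. The CLASSIFICATION constant, the exact fractions, and the
# skip-column logic are all assumptions.
def split_normalize_sequence_sketch(df, y_columns, validation_size, split, type_):
    df_test, df_train, df_valid = split_df(df, [0.2, 0.8 - 0.8 * validation_size, 0.8 * validation_size])
    # Targets are presumably left unscaled for classification tasks
    skip = y_columns[0] if type_ == CLASSIFICATION else None
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=skip)
    df_res = get_split(df_train, df_valid, df_test, split)
    x, y = xy_split(df_res, y_columns)
    if type_ == CLASSIFICATION:
        y = y[:, 0]  # Flatten class labels, as the classification constructors do
    return x, y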
def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name = 'data.csv'
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt'
    download_file(url, dataset_path, file_name)
    file_path = os.path.join(dataset_path, file_name)
    df = pd.read_csv(file_path, header=None, sep=' ')
    y_columns = [48]
    label_encode_df_(df, y_columns[0])
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    file_name = 'data.csv'
    dataset_path = os.path.join(root, self.name)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data'
    download_file(url, dataset_path, file_name)
    file_path = os.path.join(dataset_path, file_name)
    df = pd.read_csv(file_path, header=None)
    y_columns = [0]
    label_encode_df_(df, y_columns[0])
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'data.xls'
    file_path = os.path.join(dataset_path, filename)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00350/' \
          'default%20of%20credit%20card%20clients.xls'
    download_file(url, dataset_path, filename)
    df = pd.read_excel(file_path, skiprows=1, index_col='ID')
    y_columns = ['default payment next month']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'airfoil_self_noise.dat'
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00291/airfoil_self_noise.dat'
    download_file(url, dataset_path, filename)
    file_path = os.path.join(dataset_path, filename)
    df = pd.read_csv(file_path, sep='\t', header=None)
    y_columns = [5]
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'Real estate valuation data set.xlsx'
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx'
    download_file(url, dataset_path, filename)
    file_path = os.path.join(dataset_path, filename)
    df = pd.read_excel(file_path, index_col='No')
    y_columns = ['Y house price of unit area']
    self.x, self.y = split_normalize_sequence(df, y_columns, validation_size, split, self.type_)
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'data.csv'
    file_path = os.path.join(dataset_path, filename)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
    download_file(url, dataset_path, filename)
    df = pd.read_csv(file_path, header=None)
    y_columns = df.columns[-1:]
    one_hot_encode_df_(df)  # Expands the categorical sex column (M/F/I)
    df_test, df_train, df_valid = split_df(df, [0.2, 0.8 - 0.8 * validation_size, 0.8 * validation_size])
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
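# `split_df` is assumed to shuffle the frame and cut it into consecutive chunks
# matching the given fractions, returned in the same order (hence test, train,
# valid above). A sketch under that assumption; the seed parameter is hypothetical.
import numpy as np

def split_df_sketch(df, fractions, seed=0):
    shuffled = df.sample(frac=1, random_state=seed)
    bounds = (np.cumsum(fractions) * len(df)).astype(int)
    starts = [0] + list(bounds[:-1])
    return [shuffled.iloc[a:b] for a, b in zip(starts, bounds)]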
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    filename = 'data.csv'
    file_path = os.path.join(dataset_path, filename)
    url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' \
          'parkinsons/telemonitoring/parkinsons_updrs.data'
    download_file(url, dataset_path, filename)
    df = pd.read_csv(file_path)
    y_columns = ['motor_UPDRS', 'total_UPDRS']
    # Hold out subjects 31+ for test; split train/valid by subject as well so that
    # no subject's recordings leak across splits
    df_train_valid = df[df['subject#'] <= 30]
    df_test = deepcopy(df[df['subject#'] > 30])
    df_train, df_valid = split_df_on_column(df_train_valid, [1 - validation_size, validation_size], 'subject#')
    normalize_df_(df_train, other_dfs=[df_valid, df_test])
    df_res = get_split(df_train, df_valid, df_test, split)
    df_res.drop(columns='subject#', inplace=True)
    self.x, self.y = xy_split(df_res, y_columns)
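# `split_df_on_column` is assumed to split at the level of the grouping column, so
# each subject's recordings land in exactly one split. A sketch under that
# assumption; the seed parameter is hypothetical.
import numpy as np

def split_df_on_column_sketch(df, fractions, column, seed=0):
    rng = np.random.default_rng(seed)
    groups = df[column].unique()
    rng.shuffle(groups)
    bounds = (np.cumsum(fractions) * len(groups)).astype(int)
    starts = [0] + list(bounds[:-1])
    return [df[df[column].isin(groups[a:b])] for a, b in zip(starts, bounds)]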
def __init__(self, root, split=TRAIN, validation_size=0.2):
    dataset_path = os.path.join(root, self.name)
    file_name_train = 'train.csv'
    file_name_test = 'test.csv'
    url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn'
    url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst'
    download_file(url_train, dataset_path, file_name_train)
    download_file(url_test, dataset_path, file_name_test)
    file_path_train = os.path.join(dataset_path, file_name_train)
    file_path_test = os.path.join(dataset_path, file_name_test)
    df_train_valid = pd.read_csv(file_path_train, sep=' ', header=None)
    df_test = pd.read_csv(file_path_test, sep=' ', header=None)
    df_test.index += len(df_train_valid)
    # Label-encode train and test together so the class ids match across splits
    df = pd.concat([df_train_valid, df_test])
    y_columns = [36]
    label_encode_df_(df, y_columns[0])
    df_train_valid = df.loc[df_train_valid.index, :]
    df_test = df.loc[df_test.index, :]
    df_train, df_valid = split_classification_df(df_train_valid,
                                                 [1 - validation_size, validation_size], y_columns[0])
    normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
    df_res = get_split(df_train, df_valid, df_test, split)
    self.x, self.y = xy_split(df_res, y_columns)
    self.y = self.y[:, 0]
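# `split_classification_df` is assumed to be a stratified variant of `split_df`:
# each class of the label column is split with the same fractions, so the rare
# classes in shuttle and satimage show up in every split. Sketch only, built on
# the hypothetical `split_df_sketch` above.
import pandas as pd

def split_classification_df_sketch(df, fractions, y_column, seed=0):
    parts = [[] for _ in fractions]
    for _, class_df in df.groupby(y_column):
        for i, chunk in enumerate(split_df_sketch(class_df, fractions, seed)):
            parts[i].append(chunk)
    return [pd.concat(chunks) for chunks in parts]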