def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Fetch the Adult train/test files and keep the requested split.

        Train and test rows are concatenated so that one-hot and label
        encoding see both files at once, then separated again by index.

        Args:
            root: Directory under which a ``self.name`` sub-folder holds the files.
            split: Selector passed to ``get_split`` to choose the part to keep.
            validation_size: Share of the training file passed to ``split_df``
                for the validation part.
        """
        data_dir = os.path.join(root, self.name)
        train_name, test_name = 'data_train.csv', 'data_test.csv'
        train_path = os.path.join(data_dir, train_name)
        test_path = os.path.join(data_dir, test_name)
        download_file(
            'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
            data_dir, train_name)
        download_file(
            'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
            data_dir, test_name)

        # Neither file has a header row; the first line of the test file is skipped.
        train_valid = pd.read_csv(train_path, header=None, skiprows=0)
        test = pd.read_csv(test_path, header=None, skiprows=1)

        # Values in column 14 of the test file carry a trailing period.
        test[14] = test[14].str.rstrip('.')

        # Move the test index past the train rows so each part can be looked
        # up by index again once the combined frame has been encoded.
        test.index += len(train_valid)
        combined = pd.concat([train_valid, test])
        label_columns = combined.columns[-1:]
        label = label_columns[0]
        one_hot_encode_df_(combined, skip_columns=label_columns)
        label_encode_df_(combined, label)
        train_valid = combined.loc[train_valid.index]
        test = combined.loc[test.index]

        fractions = [1 - validation_size, validation_size]
        train, valid = split_df(train_valid, fractions)
        normalize_df_(train, other_dfs=[valid, test], skip_column=label)
        chosen = get_split(train, valid, test, split)
        self.x, labels = xy_split(chosen, label_columns)
        self.y = labels[:, 0]  # Flatten for classification
    def __init__(self, root, split=TRAIN, validation_size=0.2):
        """Download the APS failure train/test files and build the requested split.

        Both files are read with their first 20 lines skipped and the literal
        ``na`` treated as missing. Each frame is cleaned with ``clean_na_``
        separately, so the constructor verifies that both still share the same
        columns before continuing.

        Args:
            root: Directory under which a ``self.name`` sub-folder holds the files.
            split: Selector passed to ``get_split`` to choose the part to keep.
            validation_size: Share of the training file passed to ``split_df``
                for the validation part.

        Raises:
            Exception: If cleaning leaves train and test with different columns.
        """
        file_name_train = 'train.csv'
        file_name_test = 'test.csv'
        dataset_path = os.path.join(root, self.name)
        url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
        url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
        download_file(url_train, dataset_path, file_name_train)
        download_file(url_test, dataset_path, file_name_test)
        file_path_train = os.path.join(dataset_path, file_name_train)
        # Fixed: this path used to be built from the train file name, so the
        # "test" frame was a second copy of the training data.
        file_path_test = os.path.join(dataset_path, file_name_test)
        df_train_valid = pd.read_csv(file_path_train, skiprows=20, na_values='na')
        df_test = pd.read_csv(file_path_test, skiprows=20, na_values='na')

        # TODO This is risky business since test and train might be cleaned to have different columns
        clean_na_(df_train_valid)
        clean_na_(df_test)
        # Index.equals also copes with a differing number of columns, where an
        # element-wise ``==`` would fail with a length error instead of
        # reaching the intended exception below.
        if not df_train_valid.columns.equals(df_test.columns):
            raise Exception('Cleaning lead to different set of columns for train/test')

        y_columns = ['class']
        label_encode_df_([df_train_valid, df_test], y_columns[0])
        df_train, df_valid = split_df(df_train_valid, [1 - validation_size, validation_size])
        normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
        df_res = get_split(df_train, df_valid, df_test, split)
        self.x, self.y = xy_split(df_res, y_columns)
        self.y = self.y[:, 0]  # Keep only the single label column as a 1-D array
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Download the Statlog Shuttle files and build the requested split.

     The training file is published LZW-compressed (``.Z``); it is stored as
     ``train.z`` and decompressed to ``train.csv`` with ``unlzw``. The test
     file is downloaded as-is.

     Args:
         root: Directory under which a ``self.name`` sub-folder holds the files.
         split: Selector passed to ``get_split`` to choose the part to keep.
         validation_size: Share of the training file passed to
             ``split_classification_df`` for the validation part.
     """
     dataset_path = os.path.join(root, self.name)
     url_train = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.trn.Z'
     url_test = 'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/shuttle/shuttle.tst'
     file_name_train = 'train.csv'
     file_name_test = 'test.csv'
     file_path_train = os.path.join(dataset_path, file_name_train)
     file_path_test = os.path.join(dataset_path, file_name_test)
     file_name_z = 'train.z'
     fresh_download = download_file(url_train, dataset_path, file_name_z)
     # Decompress after a fresh download, and also when an earlier run left
     # the archive behind without the extracted file (e.g. it was interrupted).
     if fresh_download or not os.path.exists(file_path_train):
         path_z = os.path.join(dataset_path, file_name_z)
         with open(path_z, 'rb') as f_in, open(file_path_train, 'wb') as f_out:
             f_out.write(unlzw(f_in.read()))
     # Fixed: the test file used to be fetched only when the train archive was
     # freshly downloaded, so a missing test file could never be recovered.
     # NOTE(review): relies on download_file skipping files that already
     # exist, as the other loaders in this file do by calling it every time.
     download_file(url_test, dataset_path, file_name_test)
     df_train_valid = pd.read_csv(file_path_train, header=None, sep=' ')
     y_columns = [9]
     df_train, df_valid = split_classification_df(df_train_valid,
                                                  [1 - validation_size, validation_size],
                                                  y_columns[0])
     df_test = pd.read_csv(file_path_test, header=None, sep=' ')
     label_encode_df_([df_train, df_valid, df_test], y_columns[0])
     normalize_df_(df_train, other_dfs=[df_valid, df_test], skip_column=y_columns[0])
     df_res = get_split(df_train, df_valid, df_test, split)
     self.x, self.y = xy_split(df_res, y_columns)
     self.y = self.y[:, 0]  # Keep only the single label column as a 1-D array
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Download the Sensorless Drive Diagnosis file and build the requested split.

     Args:
         root: Directory under which a ``self.name`` sub-folder holds the file.
         split: Selector forwarded to ``split_normalize_sequence``.
         validation_size: Validation share forwarded to ``split_normalize_sequence``.
     """
     csv_name = 'data.csv'
     data_dir = os.path.join(root, self.name)
     download_file(
         'http://archive.ics.uci.edu/ml/machine-learning-databases/00325/Sensorless_drive_diagnosis.txt',
         data_dir, csv_name)

     # Space-separated file without a header row; column 48 is used as the label.
     frame = pd.read_csv(os.path.join(data_dir, csv_name), header=None, sep=' ')
     label_columns = [48]
     label_encode_df_(frame, label_columns[0])
     self.x, self.y = split_normalize_sequence(
         frame, label_columns, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Download the Letter Recognition file and build the requested split.

     Args:
         root: Directory under which a ``self.name`` sub-folder holds the file.
         split: Selector forwarded to ``split_normalize_sequence``.
         validation_size: Validation share forwarded to ``split_normalize_sequence``.
     """
     csv_name = 'data.csv'
     data_dir = os.path.join(root, self.name)
     download_file(
         'http://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data',
         data_dir, csv_name)

     # Comma-separated file without a header row; column 0 is used as the label.
     frame = pd.read_csv(os.path.join(data_dir, csv_name), header=None)
     label_columns = [0]
     label_encode_df_(frame, label_columns[0])
     self.x, self.y = split_normalize_sequence(
         frame, label_columns, validation_size, split, self.type_)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Download the Bank Marketing archive and build the requested split.

     Args:
         root: Directory under which a ``self.name`` sub-folder holds the data.
         split: Selector forwarded to ``split_normalize_sequence``.
         validation_size: Validation share forwarded to ``split_normalize_sequence``.
     """
     data_dir = os.path.join(root, self.name)
     download_unzip(
         'http://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip',
         data_dir)

     # Semicolon-separated file with a header row; column 'y' is used as the label.
     csv_path = os.path.join(data_dir, 'bank-additional', 'bank-additional-full.csv')
     frame = pd.read_csv(csv_path, sep=';')
     label_columns = ['y']

     # Every column except the label is one-hot encoded; the label is label-encoded.
     one_hot_encode_df_(frame, skip_columns=label_columns)
     label_encode_df_(frame, label_columns[0])
     self.x, self.y = split_normalize_sequence(
         frame, label_columns, validation_size, split, self.type_)
Ejemplo n.º 7
0
    def test_normalize_classification(self):
        """Check that normalization skips the label column and standardizes ``age``."""
        frame = self.df

        # The helpers are called for their effect on the frame; their return
        # values are not used.
        clean_na_(frame)
        label_encode_df_(frame, 'sex')
        normalize_df_(frame, skip_column='sex')

        # The skipped label column must not be centered: its mean stays above 0.
        label_mean = frame.sex.mean()
        self.assertGreater(label_mean, 0)

        # A normalized feature column has mean 0 ...
        feature_mean = frame.age.mean()
        self.assertAlmostEqual(feature_mean, 0, delta=1e-6)

        # ... and standard deviation 1.
        feature_std = frame.age.std()
        self.assertAlmostEqual(feature_std, 1, delta=1e-6)
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Download the Avila archive and build the requested split.

     Args:
         root: Directory under which a ``self.name`` sub-folder holds the data.
         split: Selector passed to ``get_split`` to choose the part to keep.
         validation_size: Share of the training file passed to
             ``split_classification_df`` for the validation part.
     """
     data_dir = os.path.join(root, self.name)
     download_unzip(
         'http://archive.ics.uci.edu/ml/machine-learning-databases/00459/avila.zip',
         data_dir)

     # Both files are read without a header row; column 10 is used as the label.
     train_valid = pd.read_csv(os.path.join(data_dir, 'avila', 'avila-tr.txt'), header=None)
     test = pd.read_csv(os.path.join(data_dir, 'avila', 'avila-ts.txt'), header=None)
     label_columns = [10]
     label = label_columns[0]

     # Assumes encoding will be identical for train/test
     label_encode_df_([train_valid, test], label)

     fractions = [1 - validation_size, validation_size]
     train, valid = split_classification_df(train_valid, fractions, label)
     normalize_df_(train, other_dfs=[valid, test], skip_column=label)
     chosen = get_split(train, valid, test, split)
     self.x, labels = xy_split(chosen, label_columns)
     self.y = labels[:, 0]
 def __init__(self, root, split=TRAIN, validation_size=0.2):
     """Download the Statlog Satimage files and build the requested split.

     Train and test rows are concatenated so the label column is encoded
     over both files at once, then separated again by index.

     Args:
         root: Directory under which a ``self.name`` sub-folder holds the files.
         split: Selector passed to ``get_split`` to choose the part to keep.
         validation_size: Share of the training file passed to
             ``split_classification_df`` for the validation part.
     """
     data_dir = os.path.join(root, self.name)
     train_name, test_name = 'train.csv', 'test.csv'
     download_file(
         'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn',
         data_dir, train_name)
     download_file(
         'http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst',
         data_dir, test_name)

     # Space-separated files without a header row.
     train_valid = pd.read_csv(os.path.join(data_dir, train_name), sep=' ', header=None)
     test = pd.read_csv(os.path.join(data_dir, test_name), sep=' ', header=None)

     # Move the test index past the train rows so each part can be looked up
     # by index again once the combined frame has been encoded.
     test.index += len(train_valid)
     combined = pd.concat([train_valid, test])
     label_columns = [36]
     label = label_columns[0]
     label_encode_df_(combined, label)
     train_valid = combined.loc[train_valid.index, :]
     test = combined.loc[test.index, :]

     fractions = [1 - validation_size, validation_size]
     train, valid = split_classification_df(train_valid, fractions, label)
     normalize_df_(train, other_dfs=[valid, test], skip_column=label)
     chosen = get_split(train, valid, test, split)
     self.x, labels = xy_split(chosen, label_columns)
     self.y = labels[:, 0]