def test_categorical_indexer():
    """Smoke-test categorical_indexer on a tiny hand-built DataManager.

    Builds a 3x4 train set, 3x4 validation set, and 1x4 test set with two
    categorical columns, runs the indexer, and prints the transformed
    matrices for manual inspection.
    """
    manager = DataManager()
    # Column roles for the 4 features below.
    manager.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]
    manager.train_X = np.array(
        [["a", 1, "python", 4.5], ["b", 2, "c++", 6.8], ["c", 10, "java", 4.8]]
    )
    manager.val_X = np.array(
        [["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8], ["d", 10, "python", 4.8]]
    )
    manager.test_X = np.array([["a", 1, "scala", 4.5]])
    manager = categorical_indexer(manager)
    print(manager.feature_types)
    print(manager.train_X)
    print("----------------------------")
    print(manager.val_X)
    print("----------------------------")
    print(manager.test_X)
def test_impute_dm():
    """Smoke-test impute_dm: plant missing-value markers and print the result.

    Marks two categorical cells with the sentinel "???" and writes np.nan
    into two cells of the string-typed arrays (NOTE(review): assigning
    np.nan into a unicode array stores the string "nan" — presumably that
    is the missing-value form impute_dm expects; confirm against impute_dm).
    """
    train_x = np.array([["a", 1, "python", 4.5], ["b", 2, "c++", 6.8], ["c", 10, "java", 4.8]])
    valid_x = np.array([["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8], ["d", 10, "python", 4.8]])
    test_x = np.array([["a", 1, "scala", 4.5]])
    # Sentinel markers that impute_dm is told to treat as missing.
    train_x[2][0] = "???"
    train_x[2][2] = "???"
    valid_x[0][1] = np.nan
    test_x[0][-1] = np.nan
    dm = DataManager()
    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]
    # np.object was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `object` is the documented drop-in replacement.
    dm.train_X = train_x.astype(object)
    dm.val_X = valid_x.astype(object)
    dm.test_X = test_x.astype(object)
    dm = impute_dm(dm, "???")
    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
def one_hot(dm: DataManager) -> DataManager:
    """
    Convert the categorical features to float with one-hot encoding.

    The encoder is fit on the concatenation of train/val/test so all splits
    share one vocabulary; the result is split back into the original splits.
    One-hot columns are placed first, followed by the remaining features.

    :param dm: DataManager with feature_types set and train/val/test loaded
    :return: the same DataManager, mutated in place with encoded matrices
    :raises ValueError: if the train split is missing
    """
    feature_types = dm.feature_types
    categorical_index = [
        i for i, t in enumerate(feature_types) if t == "Categorical"
    ]
    other_index = [
        i for i, t in enumerate(feature_types) if t != "Categorical"
    ]
    encoder = OneHotEncoder(handle_unknown="ignore")
    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()
    # Check for None BEFORE taking len(); the original called len(train_x)
    # first, which turned a missing train set into a TypeError instead of
    # the intended ValueError.
    if train_x is None:
        raise ValueError("train_x has no value!!!")
    train_size = len(train_x)
    valid_size = 0
    test_size = 0
    # Stack whichever optional splits exist. Handling val and test
    # independently fixes the original bug where a test set was silently
    # dropped whenever the validation set was absent.
    parts = [train_x]
    if valid_x is not None:
        valid_size = len(valid_x)
        parts.append(valid_x)
    if test_x is not None:
        test_size = len(test_x)
        parts.append(test_x)
    x = np.concatenate(parts) if len(parts) > 1 else train_x
    categorical_x = x[:, categorical_index]
    other_x = x[:, other_index]
    encoder.fit(categorical_x)
    categorical_x = encoder.transform(categorical_x).toarray()
    categorical_features = ["One-Hot"] * categorical_x.shape[1]
    other_features = [feature_types[i] for i in other_index]
    # np.float was removed in NumPy 1.24; the builtin `float` is equivalent.
    x = np.hstack((categorical_x, other_x)).astype(float)
    dm.feature_types = np.concatenate((categorical_features, other_features))
    train_x, valid_x, test_x = _split_data(x, train_size, valid_size, test_size)
    # Restore None for splits that were absent on input.
    if valid_size == 0:
        valid_x = None
    if test_size == 0:
        test_x = None
    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x
    return dm
dm = normalize(dm) print("after normalize rescale\n") print(dm.train_X) print(dm.val_X) print(dm.test_X) print(dm.feature_types) if __name__ == '__main__': np.random.seed(19941125) dm = DataManager() dm.train_X = np.random.rand(5, 5) dm.val_X = np.random.rand(3, 5) dm.test_X = np.random.rand(2, 5) dm.feature_types = ["Discrete", "One-Hot", "Float", "Float", "Categorical"] print("Original data......\n") print(dm.train_X) print(dm.val_X) print(dm.test_X) print(dm.feature_types) print("start test MinMaxScaler.......\n") test_minmax(dm) print("start test StandardScaler......\n") test_standard(dm)