Beispiel #1
0
def test_categorical_indexer():
    """Run ``categorical_indexer`` on a tiny mixed-type data set and print the result.

    This is a manual smoke test: nothing is asserted, the transformed
    feature types and the three feature matrices are only printed.
    """
    divider = "----------------------------"
    column_kinds = ["Categorical", "Discrete", "Categorical", "Float"]

    train_rows = np.array([
        ["a", 1, "python", 4.5],
        ["b", 2, "c++", 6.8],
        ["c", 10, "java", 4.8],
    ])
    # The validation rows contain categories ("d", "scala") that never occur
    # in the training rows.
    valid_rows = np.array([
        ["a", 1, "scala", 4.5],
        ["c", 2, "c++", 6.8],
        ["d", 10, "python", 4.8],
    ])
    test_rows = np.array([["a", 1, "scala", 4.5]])

    manager = DataManager()
    manager.feature_types = column_kinds
    manager.train_X = train_rows
    manager.val_X = valid_rows
    manager.test_X = test_rows

    indexed = categorical_indexer(manager)

    print(indexed.feature_types)
    print(indexed.train_X)
    print(divider)
    print(indexed.val_X)
    print(divider)
    print(indexed.test_X)
Beispiel #2
0
def test_impute_dm():
    """Run ``impute_dm`` on a tiny data set with missing entries and print the result.

    This is a manual smoke test: nothing is asserted, the feature types and
    the three feature matrices returned by ``impute_dm`` are only printed.
    """
    train_x = np.array([["a", 1, "python", 4.5], ["b", 2, "c++", 6.8],
                        ["c", 10, "java", 4.8]])

    valid_x = np.array([["a", 1, "scala", 4.5], ["c", 2, "c++", 6.8],
                        ["d", 10, "python", 4.8]])

    test_x = np.array([["a", 1, "scala", 4.5]])

    # Missing categorical values are marked with the "???" placeholder that
    # is later handed to impute_dm.
    train_x[2][0] = "???"
    train_x[2][2] = "???"
    # NOTE(review): the arrays above mix strings and numbers, so NumPy builds
    # them with a string dtype; these assignments therefore store the text
    # "nan" rather than a float NaN. Presumably impute_dm copes with that --
    # confirm against its implementation.
    valid_x[0][1] = np.nan
    test_x[0][-1] = np.nan

    dm = DataManager()

    dm.feature_types = ["Categorical", "Discrete", "Categorical", "Float"]

    # The ``np.object`` alias was removed in NumPy 1.24; the builtin ``object``
    # selects the same dtype on every NumPy version.
    dm.train_X = train_x.astype(object)
    dm.val_X = valid_x.astype(object)
    dm.test_X = test_x.astype(object)

    dm = impute_dm(dm, "???")

    print(dm.feature_types)
    print(dm.train_X)
    print("----------------------------")
    print(dm.val_X)
    print("----------------------------")
    print(dm.test_X)
Beispiel #3
0
def one_hot(dm: DataManager) -> DataManager:
    """
    Convert the categorical features to float with one-hot encoding.

    All available splits are stacked so the encoder is fitted on every
    category that occurs in any of them. The columns typed "Categorical" are
    replaced by their one-hot expansion, which is placed first and followed
    by the remaining columns in their original order. The stacked matrix is
    then cast to float and cut back into the individual splits.

    :param dm: data manager providing ``get_train``/``get_val``/``get_test``
        and ``feature_types``; it is updated in place.
    :return: the same ``dm`` with ``train_X``, ``val_X`` and ``test_X``
        replaced by the encoded float arrays (``None`` for absent or empty
        splits) and ``feature_types`` rewritten as "One-Hot" entries followed
        by the untouched feature types.
    :raises ValueError: if the data manager has no training features.
    """
    feature_types = dm.feature_types
    categorical_index = [
        i for i, kind in enumerate(feature_types) if kind == "Categorical"
    ]
    other_index = [
        i for i, kind in enumerate(feature_types) if kind != "Categorical"
    ]

    (train_x, _), (valid_x, _), (test_x, _) = (dm.get_train(), dm.get_val(),
                                               dm.get_test())

    # Validate before len() is taken, otherwise a missing training set would
    # surface as a TypeError instead of the intended ValueError.
    if train_x is None:
        raise ValueError("train_x has no value!!!")

    train_size = len(train_x)
    valid_size = 0 if valid_x is None else len(valid_x)
    test_size = 0 if test_x is None else len(test_x)

    # Stack whichever splits exist, always in train / valid / test order, so
    # a test split is still encoded when there is no validation split.
    # NOTE(review): this relies on _split_data cutting the rows cumulatively
    # by the three sizes (valid_size may be 0 here) -- confirm against its
    # implementation.
    parts = [part for part in (train_x, valid_x, test_x) if part is not None]
    x = np.concatenate(parts) if len(parts) > 1 else train_x

    categorical_x = x[:, categorical_index]
    other_x = x[:, other_index]

    # handle_unknown="ignore": categories not seen during fit are encoded as
    # all-zero rows instead of raising.
    encoder = OneHotEncoder(handle_unknown="ignore")
    encoder.fit(categorical_x)
    categorical_x = encoder.transform(categorical_x).toarray()

    categorical_features = ["One-Hot"] * categorical_x.shape[1]
    other_features = [feature_types[i] for i in other_index]

    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 dtype on every NumPy version.
    x = np.hstack((categorical_x, other_x)).astype(float)
    dm.feature_types = np.concatenate((categorical_features, other_features))

    train_x, valid_x, test_x = _split_data(x, train_size, valid_size,
                                           test_size)
    # Absent (or empty) splits are reported as None rather than as whatever
    # _split_data returns for a zero-sized split.
    if valid_size == 0:
        valid_x = None
    if test_size == 0:
        test_x = None

    dm.train_X = train_x
    dm.val_X = valid_x
    dm.test_X = test_x

    return dm
Beispiel #4
0
    dm = normalize(dm)

    print("after normalize rescale\n")

    print(dm.train_X)
    print(dm.val_X)
    print(dm.test_X)
    print(dm.feature_types)


if __name__ == "__main__":
    # Script entry point: build a random data set and run both scaler demos.
    # The fixed seed makes the random matrices below reproducible.
    np.random.seed(19941125)

    dm = DataManager()
    # The draws stay in train / val / test order because every call advances
    # the global NumPy random state.
    dm.train_X = np.random.rand(5, 5)
    dm.val_X = np.random.rand(3, 5)
    dm.test_X = np.random.rand(2, 5)
    dm.feature_types = ["Discrete", "One-Hot", "Float", "Float", "Categorical"]

    print("Original data......\n")
    # One value per line, exactly as four consecutive print calls would emit.
    print(dm.train_X, dm.val_X, dm.test_X, dm.feature_types, sep="\n")

    print("start test MinMaxScaler.......\n")
    test_minmax(dm)

    # The same manager object is handed to the second demo.
    print("start test StandardScaler......\n")
    test_standard(dm)