Exemple #1
0
def test_CdfScaler_not_inf(distribution, output_distribution):
    """Check that ``CdfScaler`` produces no null or infinite values.

    The scaler is fitted on the single ``"parch"`` column of the module-level
    ``dfX`` frame, and the result must keep the input's shape and exact type.

    Parameters
    ----------
    distribution
        Forwarded to ``CdfScaler(distribution=...)``.
    output_distribution
        Forwarded to ``CdfScaler(output_distribution=...)``.

    NOTE(review): both arguments are presumably supplied by a pytest
    parametrization that is not visible in this excerpt -- confirm.
    """
    features = dfX[["parch"]]

    # The input itself must be clean, otherwise the output checks are moot.
    assert features.isnull().sum().sum() == 0
    assert (np.abs(features) == np.inf).sum().sum() == 0

    transformed = CdfScaler(
        distribution=distribution,
        output_distribution=output_distribution,
    ).fit_transform(features)

    # No missing value and no +/- infinity may appear in the result.
    assert transformed.isnull().sum().sum() == 0
    assert (np.abs(transformed) == np.inf).sum().sum() == 0

    # The container must be preserved: same shape, exactly the same type.
    assert transformed.shape == features.shape
    assert type(transformed) == type(features)
Exemple #2
0
def test_CdfScaler_fit_vs_fit_transform():
    """Check that ``fit_transform`` matches ``fit`` followed by ``transform``.

    Two identically configured ``CdfScaler`` instances (kernel distribution,
    uniform output, same ``random_state``) are run on the same random matrix;
    the largest absolute element-wise difference must not exceed ``10**-5``.
    """
    np.random.seed(123)
    data = np.random.randn(2000, 10)

    # Both scalers share exactly the same configuration.
    scaler_kwargs = {
        "distribution": "kernel",
        "output_distribution": "uniform",
        "random_state": 123,
    }

    # One-step path.
    one_step_scaler = CdfScaler(**scaler_kwargs)
    one_step_result = one_step_scaler.fit_transform(data)

    # Two-step path.
    two_step_scaler = CdfScaler(**scaler_kwargs)
    two_step_scaler.fit(data)
    two_step_result = two_step_scaler.transform(data)

    tolerance = 10 ** (-5)
    largest_gap = np.abs(one_step_result - two_step_result).max()
    assert largest_gap <= tolerance
Exemple #3
0
def test_CdfScaler():
    """Exercise ``_CdfScaler`` and ``CdfScaler`` on array, DataFrame and sparse input.

    For every combination of class, output distribution and input
    distribution, a fresh scaler is fitted and applied to each container,
    and the test checks that:

    * the output keeps the input's shape (and column names / sparsity
      structure where applicable);
    * with a ``"uniform"`` output and a distribution other than ``"none"``,
      all output values lie in ``[0, 1]``;
    * with distribution ``"none"`` the dense and DataFrame data come back
      unchanged;
    * the transformed dense and DataFrame data contain no null value;
    * the input object is not modified by ``fit`` / ``transform``.

    A final section checks which per-column distributions ``"auto-param"``
    selects on a small mixed DataFrame.
    """
    np.random.seed(123)

    # Array: strictly positive values (exponential of standard-normal draws).
    X = np.exp(np.random.randn(1000, 10))
    Xc = X.copy()  # reference copy, used to detect in-place modification

    # DataFrame: same values as X, with named columns.
    dfX = pd.DataFrame(X, columns=["col_%d" % j for j in range(X.shape[1])])
    dfXc = dfX.copy()

    # Sparse Array: entries where an independent standard-normal draw is
    # <= 1 are zeroed before conversion to CSC format.
    Xsp = np.random.randn(1000, 10)
    Xsp[np.random.randn(1000, 10) <= 1] = 0
    Xsp = sps.csc_matrix(Xsp)

    Xspc = Xsp.copy()

    for klass in (_CdfScaler, CdfScaler):
        for output_distribution in ("uniform", "normal"):
            for distribution in ("normal", "auto-kernel", "auto-param",
                                 "auto-nonparam", "kernel", "none", "rank"):

                # Array
                scaler = klass(distribution=distribution,
                               output_distribution=output_distribution)
                scaler.fit(X)

                Xs = scaler.transform(X)
                assert Xs.shape == X.shape

                if distribution != "none" and output_distribution == "uniform":
                    assert Xs.min() >= 0
                    assert Xs.max() <= 1

                elif distribution == "none":
                    assert (Xs == X).all()

                assert (X == Xc).all()  # verify that X didn't change
                # Check the *result* for nulls, as the DataFrame branch does
                # (the input X can never contain any).
                assert not pd.isnull(Xs).any()

                # DataFrame
                scaler = klass(distribution=distribution,
                               output_distribution=output_distribution)
                scaler.fit(dfX)

                dfXs = scaler.transform(dfX)
                assert dfXs.shape == dfX.shape
                assert list(dfXs.columns) == list(dfX.columns)

                if distribution != "none" and output_distribution == "uniform":
                    assert dfXs.min().min() >= 0
                    assert dfXs.max().max() <= 1

                elif distribution == "none":
                    # Same check as the array branch: "none" leaves the
                    # data unchanged.
                    assert (dfXs == dfX).all().all()

                assert (dfXc == dfX).all().all()  # dfX didn't change
                assert not dfXs.isnull().any().any()

                # Sparse Array
                scaler = klass(distribution=distribution,
                               output_distribution=output_distribution)
                scaler.fit(Xsp)
                Xsps = scaler.transform(Xsp)
                assert isinstance(Xsps, sps.csc_matrix)
                # The sparsity structure must be preserved exactly.
                assert (Xsps.indices == Xsp.indices).all()
                assert (Xsps.indptr == Xsp.indptr).all()
                if distribution != "none" and output_distribution == "uniform":
                    assert Xsps.data.min() >= 0
                    assert Xsps.data.max() <= 1

                if distribution != "none":
                    # At least one stored value must actually be transformed.
                    assert (Xsps.data != Xsp.data).any()

                assert not (Xspc != Xsp
                            ).todense().any()  # X didn't change in the process

    # Mixed DataFrame: standard-normal draws, their exponential, uniform
    # draws on [0, 1) and a 0/1 integer column, in that column order.
    dfX = pd.DataFrame({
        "A": np.random.randn(1000),
        "B": np.exp(np.random.randn(1000)),
        "C": np.random.rand(1000),
        "D": np.random.randint(0, 2, size=1000),
    }).loc[:, ["A", "B", "C", "D"]]

    dfXc = dfX.copy()

    scaler = CdfScaler(distribution="auto-param")
    scaler.fit(dfX)
    dfXs = scaler.transform(dfX)

    # Column "D" must be passed through untouched.
    assert (dfXs["D"] == dfXc["D"]).all()
    assert dfXs.min().min() >= 0
    assert dfXs.max().max() <= 1

    # One selected distribution per column, in column order.
    assert scaler._model.distributions == ["normal", "gamma", "beta", "none"]