def test_CdfScaler_not_inf(distribution, output_distribution):
    """Check that CdfScaler produces neither missing nor infinite values.

    The scaler is fitted and applied in one step on the single-column
    ``parch`` frame taken from the module-level ``dfX``; the result must be
    free of NaN / +-inf and keep the shape and container type of the input.

    Parameters
    ----------
    distribution
        Forwarded to ``CdfScaler(distribution=...)``.
    output_distribution
        Forwarded to ``CdfScaler(output_distribution=...)``.
    """
    parch = dfX[["parch"]]

    # Pre-conditions: the input itself must be clean, otherwise the checks
    # on the output below would not say anything about the scaler.
    n_missing_in = parch.isnull().sum().sum()
    assert n_missing_in == 0
    n_infinite_in = (np.abs(parch) == np.inf).sum().sum()
    assert n_infinite_in == 0

    transformed = CdfScaler(
        distribution=distribution,
        output_distribution=output_distribution,
    ).fit_transform(parch)

    # Post-conditions: still no NaN and no +-inf after scaling.
    n_missing_out = transformed.isnull().sum().sum()
    assert n_missing_out == 0
    n_infinite_out = (np.abs(transformed) == np.inf).sum().sum()
    assert n_infinite_out == 0

    # Same shape and exactly the same container type as the input.
    assert transformed.shape == parch.shape
    assert type(transformed) is type(parch)
def test_CdfScaler_fit_vs_fit_transform():
    """Check that ``fit_transform`` agrees with ``fit`` followed by ``transform``.

    Two scalers built with identical arguments (kernel distribution, uniform
    output, ``random_state=123``) are run on the same seeded random matrix;
    the largest absolute difference between both results must not exceed
    ``10 ** -5``.
    """
    np.random.seed(123)
    data = np.random.randn(2000, 10)

    # Both scalers share exactly the same configuration.
    scaler_kwargs = {
        "distribution": "kernel",
        "output_distribution": "uniform",
        "random_state": 123,
    }

    # Path 1: single call.
    one_step_scaler = CdfScaler(**scaler_kwargs)
    one_step = one_step_scaler.fit_transform(data)

    # Path 2: fit first, transform afterwards.
    two_step_scaler = CdfScaler(**scaler_kwargs)
    two_step_scaler.fit(data)
    two_step = two_step_scaler.transform(data)

    largest_gap = np.abs(one_step - two_step).max()
    assert largest_gap <= 10 ** (-5)
def test_CdfScaler():
    """Exercise ``_CdfScaler`` and ``CdfScaler`` on array, DataFrame and sparse input.

    For both classes and every combination of ``output_distribution`` and
    ``distribution`` listed below, the scaler is fitted and applied on:

    * a dense positive array (``exp`` of standard normal draws),
    * the same data wrapped in a DataFrame,
    * a sparse CSC matrix.

    Each time the test verifies that the output has the expected
    shape / columns / sparsity pattern, that a ``"uniform"`` output stays in
    ``[0, 1]``, that ``distribution="none"`` leaves dense values untouched, and
    that the input object was not modified.

    A final section checks ``distribution="auto-param"`` on a DataFrame with
    four differently distributed columns and the distributions selected by the
    fitted model.
    """
    np.random.seed(123)

    # NOTE: the order of the random draws below is kept stable on purpose so
    # that the generated data does not change for a given seed.

    # Array
    X = np.exp(np.random.randn(1000, 10))
    Xc = X.copy()

    # DataFrame
    dfX = pd.DataFrame(X, columns=["col_%d" % j for j in range(X.shape[1])])
    dfXc = dfX.copy()

    # Sparse Array
    Xsp = np.random.randn(1000, 10)
    Xsp[np.random.randn(1000, 10) <= 1] = 0
    Xsp = sps.csc_matrix(Xsp)
    Xspc = Xsp.copy()

    for klass in (_CdfScaler, CdfScaler):
        for output_distribution in ("uniform", "normal"):
            for distribution in (
                "normal",
                "auto-kernel",
                "auto-param",
                "auto-nonparam",
                "kernel",
                "none",
                "rank",
            ):
                # Only a real distribution mapped to a uniform output is
                # expected to land in [0, 1].
                bounded_output = distribution != "none" and output_distribution == "uniform"

                # Array
                scaler = klass(distribution=distribution, output_distribution=output_distribution)
                scaler.fit(X)
                Xs = scaler.transform(X)

                assert Xs.shape == X.shape
                if bounded_output:
                    assert Xs.min() >= 0
                    assert Xs.max() <= 1
                elif distribution == "none":
                    assert (Xs == X).all()

                assert (X == Xc).all()  # verify that X didn't change
                # Check the *output* for missing values, as the DataFrame
                # section does (the original checked the input X instead).
                assert not pd.isnull(Xs).any()

                # DataFrame
                scaler = klass(distribution=distribution, output_distribution=output_distribution)
                scaler.fit(dfX)
                dfXs = scaler.transform(dfX)

                assert dfXs.shape == dfX.shape
                assert list(dfXs.columns) == list(dfX.columns)
                if bounded_output:
                    assert dfXs.min().min() >= 0
                    assert dfXs.max().max() <= 1
                elif distribution == "none":
                    # The original had a bare expression here (no ``assert``)
                    # under a plain ``else``; identity can only hold for
                    # distribution == "none", as in the array section.
                    assert (dfXs == dfX).all().all()

                assert (dfXc == dfX).all().all()  # dfX didn't change
                assert not dfXs.isnull().any().any()

                # Sparse Array
                scaler = klass(distribution=distribution, output_distribution=output_distribution)
                scaler.fit(Xsp)
                Xsps = scaler.transform(Xsp)

                # The sparsity structure must be preserved exactly.
                assert isinstance(Xsps, sps.csc_matrix)
                assert (Xsps.indices == Xsp.indices).all()
                assert (Xsps.indptr == Xsp.indptr).all()
                if bounded_output:
                    assert Xsps.data.min() >= 0
                    assert Xsps.data.max() <= 1

                if distribution != "none":
                    # Stored values must actually have been transformed.
                    assert (Xsps.data != Xsp.data).any()

                assert not (Xspc != Xsp).todense().any()  # X didn't change in the process

    # "auto-param" on columns drawn from different laws.
    dfX = pd.DataFrame(
        {
            "A": np.random.randn(1000),
            "B": np.exp(np.random.randn(1000)),
            "C": np.random.rand(1000),
            "D": np.random.randint(0, 2, size=1000),
        }
    ).loc[:, ["A", "B", "C", "D"]]  # explicit column order
    dfXc = dfX.copy()

    scaler = CdfScaler(distribution="auto-param")
    scaler.fit(dfX)
    dfXs = scaler.transform(dfX)

    # Column "D" (0/1 integers) must come out unchanged.
    assert (dfXs["D"] == dfXc["D"]).all()
    assert dfXs.min().min() >= 0
    assert dfXs.max().max() <= 1

    assert scaler._model.distributions == ["normal", "gamma", "beta", "none"]