def test_make_column_selector():
    """Compare the `cu*` and `sk*` column transformers driven by selectors.

    A small mixed-dtype frame is transformed with three selector-based
    steps (dtype exclusion, dtype inclusion and a name pattern); the two
    outputs must be close and the `cu*` output must keep the input's type.
    """
    pandas_frame = pdDataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'rating': [5, 3, 4, 5],
        'temperature': [21., 21., 24., 28.],
    })
    cudf_frame = cudf.from_pandas(pandas_frame)

    cu_steps = [
        ("ohe", cuOneHotEncoder(),
         cu_make_column_selector(dtype_exclude=np.number)),
        ("scaler", cuStandardScaler(),
         cu_make_column_selector(dtype_include=np.integer)),
        ("normalizer", cuNormalizer(),
         cu_make_column_selector(pattern="temp")),
    ]
    cu_transformer = cuColumnTransformer(cu_steps, remainder='drop')
    cu_result = cu_transformer.fit_transform(cudf_frame)

    sk_steps = [
        ("ohe", skOneHotEncoder(),
         sk_make_column_selector(dtype_exclude=np.number)),
        ("scaler", skStandardScaler(),
         sk_make_column_selector(dtype_include=np.integer)),
        ("normalizer", skNormalizer(),
         sk_make_column_selector(pattern="temp")),
    ]
    sk_transformer = skColumnTransformer(sk_steps, remainder='drop')
    sk_result = sk_transformer.fit_transform(pandas_frame)

    assert_allclose(cu_result, sk_result)
    assert type(cu_result) == type(cudf_frame)
def test_maxabs_scaler_sparse(failure_logger,
                              sparse_clf_dataset):  # noqa: F811
    """Compare the `cu*` MaxAbsScaler with the `sk*` one on sparse input.

    Both the forward transform and its inverse are checked against the
    reference, and sparseness of the input must carry through each step.
    """
    X_np, X = sparse_clf_dataset

    scaler = cuMaxAbsScaler(copy=True)
    # Fit exactly once: a second fit_transform whose result is discarded
    # only repeats the same work on the same data.
    t_X = scaler.fit_transform(X)
    r_X = scaler.inverse_transform(t_X)

    # Only sparseness is asserted here, not exact container type equality
    # (NOTE(review): presumably the sparse class may legitimately differ
    # between input and output -- confirm).
    if cpx.scipy.sparse.issparse(X):
        assert cpx.scipy.sparse.issparse(t_X)
    if scipy.sparse.issparse(X):
        assert scipy.sparse.issparse(t_X)
    if cpx.scipy.sparse.issparse(t_X):
        assert cpx.scipy.sparse.issparse(r_X)
    if scipy.sparse.issparse(t_X):
        assert scipy.sparse.issparse(r_X)

    scaler = skMaxAbsScaler(copy=True)
    sk_t_X = scaler.fit_transform(X_np)
    sk_r_X = scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
def test_imputer_sparse(sparse_int_dataset, strategy,  # noqa: F811
                        missing_values):
    """Compare the `cu*` SimpleImputer with the `sk*` one on sparse input.

    CSR inputs are skipped. When ``missing_values`` is NaN, the same
    randomly chosen stored entries are set to NaN on both sides first.
    """
    X_np, X = sparse_int_dataset
    if X.format == 'csr':
        pytest.skip("Skipping CSR matrices")

    reference_csc = X_np.tocsc()

    if np.isnan(missing_values):
        # NaN is the missing marker: overwrite a tenth of the stored
        # entries, at identical positions in both matrices.
        stored = X.nnz
        nan_positions = np.random.choice(stored,
                                         int(stored * 0.1),
                                         replace=False)
        reference_csc.data[nan_positions] = np.nan
        X = X.copy()
        X.data[nan_positions] = np.nan

    fill_value = np.random.randint(10, size=1)[0]
    imputer_kwargs = dict(copy=True,
                          missing_values=missing_values,
                          strategy=strategy,
                          fill_value=fill_value)

    cu_imputed = cuSimpleImputer(**imputer_kwargs).fit_transform(X)
    assert type(cu_imputed) == type(X)

    sk_imputed = skSimpleImputer(**imputer_kwargs).fit_transform(
        reference_csc)

    assert_allclose(cu_imputed, sk_imputed)
def test_make_column_transformer(clf_dataset, remainder):  # noqa: F811
    """Compare `cu*` and `sk*` ``make_column_transformer`` on dense data.

    Columns are addressed by position, except that for DataFrame inputs
    the `cu*` side addresses them by the names ``'c<index>'``.
    """
    X_np, X = clf_dataset

    sk_selec1 = [0, 2]
    sk_selec2 = [1, 3]

    if isinstance(X, (pdDataFrame, cuDataFrame)):
        # DataFrame columns are selected by label rather than position.
        cu_selec1 = [f'c{idx}' for idx in sk_selec1]
        cu_selec2 = [f'c{idx}' for idx in sk_selec2]
    else:
        cu_selec1, cu_selec2 = sk_selec1, sk_selec2

    cu_transformer = cu_make_column_transformer(
        (cuStandardScaler(), cu_selec1),
        (cuNormalizer(), cu_selec2),
        remainder=remainder)

    cu_fit_out = cu_transformer.fit_transform(X)
    cu_out = cu_transformer.transform(X)
    assert type(cu_out) == type(X)

    sk_transformer = sk_make_column_transformer(
        (skStandardScaler(), sk_selec1),
        (skNormalizer(), sk_selec2),
        remainder=remainder)
    sk_out = sk_transformer.fit_transform(X_np)

    assert_allclose(cu_fit_out, sk_out)
    assert_allclose(cu_out, sk_out)
def test_missing_indicator(failure_logger, int_dataset,  # noqa: F811
                           missing_values, features):
    """Compare the `cu*` MissingIndicator with the `sk*` one.

    The dataset variant is picked from ``missing_values`` (0, 1, or
    anything else for the NaN-filled one). Both ``fit_transform`` and
    ``fit`` followed by ``transform`` are checked.
    """
    zero_filled, one_filled, nan_filled = int_dataset
    selected = (zero_filled if missing_values == 0
                else one_filled if missing_values == 1
                else nan_filled)
    X_np, X = selected

    indicator_kwargs = dict(missing_values=missing_values,
                            features=features)

    cu_indicator = cuMissingIndicator(**indicator_kwargs)
    cu_fit_out = cu_indicator.fit_transform(X)
    assert type(cu_fit_out) == type(X)
    cu_indicator.fit(X)
    cu_out = cu_indicator.transform(X)
    assert type(cu_out) == type(X)

    sk_indicator = skMissingIndicator(**indicator_kwargs)
    sk_fit_out = sk_indicator.fit_transform(X_np)
    sk_indicator.fit(X_np)
    sk_out = sk_indicator.transform(X_np)

    assert_allclose(cu_fit_out, sk_fit_out)
    assert_allclose(cu_out, sk_out)
def test_make_column_transformer_sparse(sparse_clf_dataset,  # noqa: F811
                                        remainder, sparse_threshold):
    """Compare `cu*` and `sk*` ``make_column_transformer`` on sparse data.

    CSC inputs are marked as expected failures. The output type is only
    compared with the input type when the computed density is below
    ``sparse_threshold``.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        pytest.xfail()

    # NOTE(review): for SciPy-style sparse matrices ``size`` reports the
    # number of stored entries, which would make this ratio 1 and the type
    # assertion below unreachable -- confirm the intended denominator.
    dataset_density = X.nnz / X.size

    shared_kwargs = dict(remainder=remainder,
                         sparse_threshold=sparse_threshold)

    cu_transformer = cu_make_column_transformer(
        (cuStandardScaler(with_mean=False), [0, 2]),
        (cuNormalizer(), [1, 3]),
        **shared_kwargs)

    cu_fit_out = cu_transformer.fit_transform(X)
    cu_out = cu_transformer.transform(X)
    if dataset_density < sparse_threshold:
        # Below the threshold the sparse input is expected to come back
        # in the same sparse container; otherwise no type check is made.
        assert type(cu_out) == type(X)

    sk_transformer = sk_make_column_transformer(
        (skStandardScaler(with_mean=False), [0, 2]),
        (skNormalizer(), [1, 3]),
        **shared_kwargs)
    sk_out = sk_transformer.fit_transform(X_np)

    assert_allclose(cu_fit_out, sk_out)
    assert_allclose(cu_out, sk_out)
def test_robust_scaler_sparse(sparse_clf_dataset,  # noqa: F811
                              with_scaling, quantile_range):
    """Compare the `cu*` RobustScaler with the `sk*` one on sparse input.

    The `cu*` input is converted to CSC when it is in another format, and
    centering is disabled on both sides.
    """
    X_np, X = sparse_clf_dataset

    if X.format != 'csc':
        X = X.tocsc()

    scaler_kwargs = dict(with_centering=False,
                         with_scaling=with_scaling,
                         quantile_range=quantile_range,
                         copy=True)

    cu_scaler = cuRobustScaler(**scaler_kwargs)
    cu_scaled = cu_scaler.fit_transform(X)
    cu_restored = cu_scaler.inverse_transform(cu_scaled)
    assert type(cu_scaled) == type(X)
    assert type(cu_restored) == type(cu_scaled)

    sk_scaler = skRobustScaler(**scaler_kwargs)
    sk_scaled = sk_scaler.fit_transform(X_np)
    sk_restored = sk_scaler.inverse_transform(sk_scaled)

    assert_allclose(cu_scaled, sk_scaled)
    assert_allclose(cu_restored, sk_restored)
def test_robust_scaler(clf_dataset, with_centering,  # noqa: F811
                       with_scaling, quantile_range):
    """Compare the `cu*` RobustScaler with the `sk*` one on dense input.

    Both the transform and its inverse are compared, and the `cu*`
    outputs must keep the type of their inputs.
    """
    X_np, X = clf_dataset

    scaler_kwargs = dict(with_centering=with_centering,
                         with_scaling=with_scaling,
                         quantile_range=quantile_range,
                         copy=True)

    cu_scaler = cuRobustScaler(**scaler_kwargs)
    cu_scaled = cu_scaler.fit_transform(X)
    cu_restored = cu_scaler.inverse_transform(cu_scaled)
    assert type(cu_scaled) == type(X)
    assert type(cu_restored) == type(cu_scaled)

    sk_scaler = skRobustScaler(**scaler_kwargs)
    sk_scaled = sk_scaler.fit_transform(X_np)
    sk_restored = sk_scaler.inverse_transform(sk_scaled)

    assert_allclose(cu_scaled, sk_scaled)
    assert_allclose(cu_restored, sk_restored)
def test_poly_features(clf_dataset, degree,  # noqa: F811
                       interaction_only, include_bias, order):
    """Compare the `cu*` PolynomialFeatures with the `sk*` one.

    For NumPy outputs the memory layout requested through ``order`` is
    also verified. Values are compared with loose tolerances (0.1).
    """
    X_np, X = clf_dataset

    poly_kwargs = dict(degree=degree,
                       order=order,
                       interaction_only=interaction_only,
                       include_bias=include_bias)

    cu_expanded = cuPolynomialFeatures(**poly_kwargs).fit_transform(X)
    assert type(X) == type(cu_expanded)

    if isinstance(cu_expanded, np.ndarray):
        # Only 'C' and 'F' have a contiguity flag to verify.
        if order == 'C':
            expected_flag = 'C_CONTIGUOUS'
        elif order == 'F':
            expected_flag = 'F_CONTIGUOUS'
        else:
            expected_flag = None
        if expected_flag is not None:
            assert cu_expanded.flags[expected_flag]

    sk_expanded = skPolynomialFeatures(**poly_kwargs).fit_transform(X_np)

    assert_allclose(cu_expanded, sk_expanded, rtol=0.1, atol=0.1)
def test_standard_scaler_sparse(failure_logger,
                                sparse_clf_dataset,  # noqa: F811
                                with_std):
    """Compare the `cu*` StandardScaler with the `sk*` one on sparse input.

    Centering is disabled on both sides. The forward transform and its
    inverse are checked against the reference, and sparseness of the input
    must carry through each step.
    """
    X_np, X = sparse_clf_dataset

    scaler = cuStandardScaler(with_mean=False, with_std=with_std, copy=True)
    # Fit exactly once: a second fit_transform whose result is discarded
    # only repeats the same work on the same data.
    t_X = scaler.fit_transform(X)
    r_X = scaler.inverse_transform(t_X)

    # Only sparseness is asserted here, not exact container type equality
    # (NOTE(review): presumably the sparse class may legitimately differ
    # between input and output -- confirm).
    if cpx.scipy.sparse.issparse(X):
        assert cpx.scipy.sparse.issparse(t_X)
    if scipy.sparse.issparse(X):
        assert scipy.sparse.issparse(t_X)
    if cpx.scipy.sparse.issparse(t_X):
        assert cpx.scipy.sparse.issparse(r_X)
    if scipy.sparse.issparse(t_X):
        assert scipy.sparse.issparse(r_X)

    scaler = skStandardScaler(copy=True, with_mean=False, with_std=with_std)
    sk_t_X = scaler.fit_transform(X_np)
    sk_r_X = scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
def test_robust_scale_sparse(sparse_clf_dataset,  # noqa: F811
                             axis, with_scaling, quantile_range):
    """Compare the `cu*` ``robust_scale`` with the `sk*` one on sparse input.

    The `cu*` input is converted to CSC for ``axis == 0`` and to CSR for
    ``axis == 1`` when it is not already in that format.
    """
    X_np, X = sparse_clf_dataset

    if axis == 0:
        if X.format != 'csc':
            X = X.tocsc()
    elif axis == 1:
        if X.format != 'csr':
            X = X.tocsr()

    scale_kwargs = dict(axis=axis,
                        with_centering=False,
                        with_scaling=with_scaling,
                        quantile_range=quantile_range,
                        copy=True)

    cu_scaled = cu_robust_scale(X, **scale_kwargs)
    assert type(cu_scaled) == type(X)

    sk_scaled = sk_robust_scale(X_np, **scale_kwargs)

    assert_allclose(cu_scaled, sk_scaled)
def test_add_dummy_feature_sparse(sparse_clf_dataset, value):  # noqa: F811
    """Compare `cu*` and `sk*` ``add_dummy_feature`` on sparse input.

    NOTE(review): another function with this name is defined in this
    file; a later definition rebinds the name -- confirm both are meant
    to run.
    """
    X_np, X = sparse_clf_dataset

    cu_augmented = cu_add_dummy_feature(X, value=value)
    assert type(cu_augmented) == type(X)

    sk_augmented = sk_add_dummy_feature(X_np, value=value)

    assert_allclose(cu_augmented, sk_augmented)
def test_inplace_csr_row_normalize_l2(failure_logger, sparse_random_dataset):
    """Check in-place L2 row normalisation of a CSR matrix.

    The matrix is normalised in place and compared with the `sk*`
    ``normalize`` applied row-wise to the dense reference. Non-CSR
    inputs are skipped.
    """
    dense_reference, _, _, csr_matrix = sparse_random_dataset

    if csr_matrix.format != 'csr':
        pytest.skip('Skip non CSR matrices')

    inplace_csr_row_normalize_l2(csr_matrix)
    expected = sk_normalize(dense_reference, norm='l2', axis=1)

    assert_allclose(csr_matrix, expected)
def test_binarize_sparse(sparse_clf_dataset, threshold):  # noqa: F811
    """Compare `cu*` and `sk*` ``binarize`` on sparse input.

    NOTE(review): another function with this name is defined in this
    file; a later definition rebinds the name -- confirm both are meant
    to run.
    """
    X_np, X = sparse_clf_dataset
    binarize_kwargs = dict(threshold=threshold, copy=True)

    cu_binary = cu_binarize(X, **binarize_kwargs)
    assert type(cu_binary) == type(X)

    sk_binary = sk_binarize(X_np, **binarize_kwargs)

    assert_allclose(cu_binary, sk_binary)
def test_scale_sparse(sparse_clf_dataset, with_std):  # noqa: F811
    """Compare `cu*` and `sk*` ``scale`` on sparse input without centering."""
    X_np, X = sparse_clf_dataset
    scale_kwargs = dict(copy=True, with_mean=False, with_std=with_std)

    cu_scaled = cu_scale(X, **scale_kwargs)
    assert type(cu_scaled) == type(X)

    sk_scaled = sk_scale(X_np, **scale_kwargs)

    assert_allclose(cu_scaled, sk_scaled)
def test_binarize(failure_logger, clf_dataset, threshold):  # noqa: F811
    """Compare `cu*` and `sk*` ``binarize`` on dense input."""
    X_np, X = clf_dataset
    binarize_kwargs = dict(threshold=threshold, copy=True)

    cu_binary = cu_binarize(X, **binarize_kwargs)
    assert type(cu_binary) == type(X)

    sk_binary = sk_binarize(X_np, **binarize_kwargs)

    assert_allclose(cu_binary, sk_binary)
def test_maxabs_scale(failure_logger, clf_dataset, axis):  # noqa: F811
    """Compare `cu*` and `sk*` ``maxabs_scale`` along ``axis``."""
    X_np, X = clf_dataset

    cu_scaled = cu_maxabs_scale(X, axis=axis)
    assert type(cu_scaled) == type(X)

    sk_scaled = sk_maxabs_scale(X_np, axis=axis)

    assert_allclose(cu_scaled, sk_scaled)
def test_minmax_scale(clf_dataset):  # noqa: F811
    """Compare `cu*` and `sk*` ``minmax_scale`` with default arguments.

    NOTE(review): another function with this name is defined in this
    file; a later definition rebinds the name -- confirm both are meant
    to run.
    """
    X_np, X = clf_dataset

    cu_scaled = cu_minmax_scale(X)
    assert type(cu_scaled) == type(X)

    sk_scaled = sk_minmax_scale(X_np)

    assert_allclose(cu_scaled, sk_scaled)
def test_minmax_scale(failure_logger, clf_dataset,  # noqa: F811
                      axis, feature_range):
    """Compare `cu*` and `sk*` ``minmax_scale`` for a range and an axis.

    NOTE(review): another function with this name is also defined in
    this file -- confirm both are meant to run.
    """
    X_np, X = clf_dataset
    scale_kwargs = dict(feature_range=feature_range, axis=axis)

    cu_scaled = cu_minmax_scale(X, **scale_kwargs)
    assert type(cu_scaled) == type(X)

    sk_scaled = sk_minmax_scale(X_np, **scale_kwargs)

    assert_allclose(cu_scaled, sk_scaled)
def test_row_norms(failure_logger, sparse_random_dataset, square):
    """Compare `cu*` and `sk*` ``row_norms`` on dense and sparse input.

    The fixture yields ``(X_np, X, X_sparse_np, X_sparse)``. In both
    halves the `cu*` function receives the non-``_np`` object and the
    `sk*` reference receives its ``_np`` counterpart.
    """
    X_np, X, X_sparse_np, X_sparse = sparse_random_dataset

    # Dense case. The arguments used to be swapped, so the `cu*` function
    # was given ``X_np`` and the `sk*` reference was given ``X``.
    cu_norms = cu_row_norms(X, squared=square)
    sk_norms = sk_row_norms(X_np, squared=square)
    assert_allclose(cu_norms, sk_norms)

    # Sparse case.
    cu_norms = cu_row_norms(X_sparse, squared=square)
    sk_norms = sk_row_norms(X_sparse_np, squared=square)
    assert_allclose(cu_norms, sk_norms)
def test_inplace_csr_row_scale(failure_logger, random_seed,
                               sparse_random_dataset):
    """Compare `cu*` and `sk*` in-place CSR row scaling.

    A seeded random vector of 100 factors is applied in place to both
    matrices, which must then be close. Non-CSR inputs are skipped.
    """
    _, _, sk_matrix, cu_matrix = sparse_random_dataset

    if cu_matrix.format != 'csr':
        pytest.skip()

    cp.random.seed(random_seed)
    row_factors = cp.random.rand(100)

    cu_inplace_csr_row_scale(cu_matrix, row_factors)
    # The reference takes the array returned by ``.get()`` on the factors.
    sk_inplace_csr_row_scale(sk_matrix, row_factors.get())

    assert_allclose(cu_matrix, sk_matrix)
def test_inplace_column_scale(failure_logger, random_seed,
                              sparse_random_dataset):
    """Compare `cu*` and `sk*` in-place column scaling of sparse matrices.

    A seeded random vector of 10 factors is applied in place to both
    sparse matrices. Calling the `cu*` function on the fixture's second
    element is expected to raise.
    """
    _, X, sk_matrix, cu_matrix = sparse_random_dataset

    cp.random.seed(random_seed)
    column_factors = cp.random.rand(10)

    cu_inplace_column_scale(cu_matrix, column_factors)
    sk_inplace_column_scale(sk_matrix, column_factors.get())

    assert_allclose(cu_matrix, sk_matrix)

    # NOTE(review): presumably X is the dense counterpart, which the
    # function rejects -- confirm against the fixture.
    with pytest.raises(Exception):
        cu_inplace_column_scale(X, column_factors)
def test_normalizer(clf_dataset, norm):  # noqa: F811
    """Compare the `cu*` Normalizer with the `sk*` one on dense input."""
    X_np, X = clf_dataset
    normalizer_kwargs = dict(norm=norm, copy=True)

    cu_normalized = cuNormalizer(**normalizer_kwargs).fit_transform(X)
    assert type(cu_normalized) == type(X)

    sk_normalized = skNormalizer(**normalizer_kwargs).fit_transform(X_np)

    assert_allclose(cu_normalized, sk_normalized)
def test_binarizer_sparse(sparse_clf_dataset, threshold):  # noqa: F811
    """Compare the `cu*` Binarizer with the `sk*` one on sparse input."""
    X_np, X = sparse_clf_dataset
    binarizer_kwargs = dict(threshold=threshold, copy=True)

    cu_binary = cuBinarizer(**binarizer_kwargs).fit_transform(X)
    assert type(cu_binary) == type(X)

    sk_binary = skBinarizer(**binarizer_kwargs).fit_transform(X_np)

    assert_allclose(cu_binary, sk_binary)
def test_normalize_sparse(sparse_clf_dataset, norm):  # noqa: F811
    """Compare `cu*` and `sk*` ``normalize`` on sparse input.

    The axis follows the storage format: 0 for CSC, 1 otherwise.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        axis = 0
    else:
        axis = 1
    normalize_kwargs = dict(axis=axis, norm=norm)

    cu_normalized = cu_normalize(X, **normalize_kwargs)
    assert type(cu_normalized) == type(X)

    sk_normalized = sk_normalize(X_np, **normalize_kwargs)

    assert_allclose(cu_normalized, sk_normalized)
def test_csc_mean_variance_axis0(failure_logger, sparse_random_dataset):
    """Check ``csc_mean_variance_axis0`` against NumPy NaN-aware statistics.

    The means and variances returned for the CSC matrix are compared with
    ``np.nanmean`` / ``np.nanvar`` over axis 0 of the dense reference.
    Non-CSC inputs are skipped.
    """
    dense_reference, _, _, csc_matrix = sparse_random_dataset

    if csc_matrix.format != 'csc':
        pytest.skip('Skip non CSC matrices')

    means, variances = csc_mean_variance_axis0(csc_matrix)

    expected_means = np.nanmean(dense_reference, axis=0)
    expected_variances = np.nanvar(dense_reference, axis=0)

    assert_allclose(means, expected_means)
    assert_allclose(variances, expected_variances)
def test_scale(failure_logger, clf_dataset, axis,  # noqa: F811
               with_mean, with_std):
    """Compare `cu*` and `sk*` ``scale`` on dense input."""
    X_np, X = clf_dataset
    scale_kwargs = dict(axis=axis,
                        with_mean=with_mean,
                        with_std=with_std,
                        copy=True)

    cu_scaled = cu_scale(X, **scale_kwargs)
    assert type(cu_scaled) == type(X)

    sk_scaled = sk_scale(X_np, **scale_kwargs)

    assert_allclose(cu_scaled, sk_scaled)
def test_add_dummy_feature_sparse(sparse_dataset_with_coo,  # noqa: F811
                                  value):
    """Compare `cu*` and `sk*` ``add_dummy_feature`` on sparse input.

    Sparseness of the input must carry over to the `cu*` output, and the
    values must be close to the reference.

    NOTE(review): another function with this name is also defined in
    this file -- confirm both are meant to run.
    """
    X_np, X = sparse_dataset_with_coo

    t_X = cu_add_dummy_feature(X, value=value)

    # Only sparseness is asserted here, not exact container type equality.
    # The check goes through ``cpx.scipy.sparse`` like the other sparse
    # tests in this file, instead of the ``cp.sparse`` alias.
    if cpx.scipy.sparse.issparse(X):
        assert cpx.scipy.sparse.issparse(t_X)
    if scipy.sparse.issparse(X):
        assert scipy.sparse.issparse(t_X)

    sk_t_X = sk_add_dummy_feature(X_np, value=value)

    assert_allclose(t_X, sk_t_X)
def test_binarize_sparse(failure_logger, sparse_clf_dataset,  # noqa: F811
                         threshold):
    """Compare `cu*` and `sk*` ``binarize`` on sparse input.

    Sparseness of the input must carry over to the `cu*` output; exact
    type equality is not asserted.

    NOTE(review): another function with this name is also defined in
    this file -- confirm both are meant to run.
    """
    X_np, X = sparse_clf_dataset
    binarize_kwargs = dict(threshold=threshold, copy=True)

    cu_binary = cu_binarize(X, **binarize_kwargs)

    if cpx.scipy.sparse.issparse(X):
        assert cpx.scipy.sparse.issparse(cu_binary)
    if scipy.sparse.issparse(X):
        assert scipy.sparse.issparse(cu_binary)

    sk_binary = sk_binarize(X_np, **binarize_kwargs)

    assert_allclose(cu_binary, sk_binary)
def test_normalizer_sparse(sparse_clf_dataset, norm):  # noqa: F811
    """Compare the `cu*` Normalizer with the `sk*` one on sparse input.

    CSC inputs are skipped.
    """
    X_np, X = sparse_clf_dataset

    if X.format == 'csc':
        pytest.skip("Skipping CSC matrices")

    normalizer_kwargs = dict(norm=norm, copy=True)

    cu_normalized = cuNormalizer(**normalizer_kwargs).fit_transform(X)
    assert type(cu_normalized) == type(X)

    sk_normalized = skNormalizer(**normalizer_kwargs).fit_transform(X_np)

    assert_allclose(cu_normalized, sk_normalized)