def test_column_transformer(clf_dataset, remainder,  # noqa: F811
                            transformer_weights):
    """Compare cuColumnTransformer with skColumnTransformer.

    A scaler is applied to columns 0 and 2 and a normalizer to columns
    1 and 3. Both ``fit_transform`` and ``transform`` results of the cu
    transformer must be close to the sk ``fit_transform`` result, and
    ``transform`` must return the same type as its input.
    """
    sk_input, cu_input = clf_dataset

    sk_scaled_cols, sk_normed_cols = [0, 2], [1, 3]
    if isinstance(cu_input, (pdDataFrame, cuDataFrame)):
        # DataFrame inputs are selected by label; presumably the fixture
        # names its columns 'c<i>' -- verify against the fixture.
        cu_scaled_cols = ['c' + str(idx) for idx in sk_scaled_cols]
        cu_normed_cols = ['c' + str(idx) for idx in sk_normed_cols]
    else:
        cu_scaled_cols, cu_normed_cols = sk_scaled_cols, sk_normed_cols

    common_kwargs = dict(remainder=remainder,
                         transformer_weights=transformer_weights)

    cu_steps = [("scaler", cuStandardScaler(), cu_scaled_cols),
                ("normalizer", cuNormalizer(), cu_normed_cols)]
    cu_ct = cuColumnTransformer(cu_steps, **common_kwargs)
    cu_fit_out = cu_ct.fit_transform(cu_input)
    cu_out = cu_ct.transform(cu_input)
    assert type(cu_out) == type(cu_input)

    sk_steps = [("scaler", skStandardScaler(), sk_scaled_cols),
                ("normalizer", skNormalizer(), sk_normed_cols)]
    sk_ct = skColumnTransformer(sk_steps, **common_kwargs)
    sk_out = sk_ct.fit_transform(sk_input)

    assert_allclose(cu_fit_out, sk_out)
    assert_allclose(cu_out, sk_out)
def test_standard_scaler_sparse(failure_logger,
                                sparse_clf_dataset,  # noqa: F811
                                with_std):
    """Compare cuStandardScaler with skStandardScaler on sparse input.

    Both scalers are built with ``with_mean=False`` and ``copy=True``.
    The transformed data and its inverse transform must be close to the
    sk results. ``failure_logger`` is requested as a fixture only and is
    not used in the body.
    """
    X_np, X = sparse_clf_dataset

    scaler = cuStandardScaler(with_mean=False, with_std=with_std, copy=True)
    # A single fit is sufficient: inverse_transform relies on the state
    # learned by this call.
    t_X = scaler.fit_transform(X)
    r_X = scaler.inverse_transform(t_X)

    # Exact type equality is not asserted here. Instead, sparseness must
    # be preserved within the same family (cupyx or scipy) at each step:
    # input -> transformed, then transformed -> inverse-transformed.
    sparse_checks = (cpx.scipy.sparse.issparse, scipy.sparse.issparse)
    for before, after in ((X, t_X), (t_X, r_X)):
        for issparse in sparse_checks:
            if issparse(before):
                assert issparse(after)

    scaler = skStandardScaler(copy=True, with_mean=False, with_std=with_std)
    sk_t_X = scaler.fit_transform(X_np)
    sk_r_X = scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
def test_maxabs_scaler_sparse(failure_logger,
                              sparse_clf_dataset):  # noqa: F811
    """Compare cuMaxAbsScaler with skMaxAbsScaler on sparse input.

    The transformed data and its inverse transform must be close to the
    sk results. ``failure_logger`` is requested as a fixture only and is
    not used in the body.
    """
    X_np, X = sparse_clf_dataset

    scaler = cuMaxAbsScaler(copy=True)
    # A single fit is sufficient: inverse_transform relies on the state
    # learned by this call.
    t_X = scaler.fit_transform(X)
    r_X = scaler.inverse_transform(t_X)

    # Exact type equality is not asserted here. Instead, sparseness must
    # be preserved within the same family (cupyx or scipy) at each step:
    # input -> transformed, then transformed -> inverse-transformed.
    sparse_checks = (cpx.scipy.sparse.issparse, scipy.sparse.issparse)
    for before, after in ((X, t_X), (t_X, r_X)):
        for issparse in sparse_checks:
            if issparse(before):
                assert issparse(after)

    scaler = skMaxAbsScaler(copy=True)
    sk_t_X = scaler.fit_transform(X_np)
    sk_r_X = scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
def test_poly_features(failure_logger, clf_dataset, degree,  # noqa: F811
                       interaction_only, include_bias, order):
    """Compare cuPolynomialFeatures with skPolynomialFeatures.

    Checks the output type, the memory layout when the result is a NumPy
    array, the transformed values (``rtol=0.1, atol=0.1``) and the
    feature names. ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    poly_kwargs = dict(degree=degree, order=order,
                       interaction_only=interaction_only,
                       include_bias=include_bias)

    cu_poly = cuPolynomialFeatures(**poly_kwargs)
    cu_out = cu_poly.fit_transform(cu_input)
    assert type(cu_input) == type(cu_out)
    cu_names = cu_poly.get_feature_names()

    # Layout is only inspected for NumPy outputs, and only for the two
    # orders that have a matching contiguity flag.
    if isinstance(cu_out, np.ndarray):
        flag_by_order = {'C': 'C_CONTIGUOUS', 'F': 'F_CONTIGUOUS'}
        layout_flag = flag_by_order.get(order)
        if layout_flag is not None:
            assert cu_out.flags[layout_flag]

    sk_poly = skPolynomialFeatures(**poly_kwargs)
    sk_out = sk_poly.fit_transform(sk_input)
    sk_names = sk_poly.get_feature_names()

    assert_allclose(cu_out, sk_out, rtol=0.1, atol=0.1)
    assert sk_names == cu_names
def test_robust_scale(failure_logger, clf_dataset,  # noqa: F811
                      with_centering, axis, with_scaling, quantile_range):
    """Compare cu_robust_scale with sk_robust_scale.

    Both functions receive identical keyword arguments; the cu result
    must have the input's type and be close to the sk result.
    ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    scale_kwargs = dict(axis=axis,
                        with_centering=with_centering,
                        with_scaling=with_scaling,
                        quantile_range=quantile_range,
                        copy=True)

    cu_out = cu_robust_scale(cu_input, **scale_kwargs)
    assert type(cu_out) == type(cu_input)

    sk_out = sk_robust_scale(sk_input, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
def test_missing_indicator(failure_logger, int_dataset,  # noqa: F811
                           missing_values, features):
    """Compare cuMissingIndicator with skMissingIndicator.

    The dataset variant is chosen from ``missing_values`` (0, 1, or
    anything else). Both the ``fit_transform`` path and the separate
    ``fit`` + ``transform`` path are compared against the sk results.
    ``failure_logger`` is not used in the body.
    """
    zero_filled, one_filled, nan_filled = int_dataset

    # Pick the dataset pair whose placeholder matches `missing_values`.
    if missing_values == 0:
        chosen = zero_filled
    elif missing_values == 1:
        chosen = one_filled
    else:
        chosen = nan_filled
    sk_input, cu_input = chosen

    indicator_kwargs = dict(missing_values=missing_values,
                            features=features)

    cu_indicator = cuMissingIndicator(**indicator_kwargs)
    cu_fit_out = cu_indicator.fit_transform(cu_input)
    assert type(cu_fit_out) == type(cu_input)
    cu_indicator.fit(cu_input)
    cu_out = cu_indicator.transform(cu_input)
    assert type(cu_out) == type(cu_input)

    sk_indicator = skMissingIndicator(**indicator_kwargs)
    sk_fit_out = sk_indicator.fit_transform(sk_input)
    sk_indicator.fit(sk_input)
    sk_out = sk_indicator.transform(sk_input)

    assert_allclose(cu_fit_out, sk_fit_out)
    assert_allclose(cu_out, sk_out)
def test_robust_scaler(failure_logger, clf_dataset,  # noqa: F811
                       with_centering, with_scaling, quantile_range):
    """Compare cuRobustScaler with skRobustScaler.

    Checks that transform and inverse transform keep the input type and
    that both results are close to the sk results. ``failure_logger``
    is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    scaler_kwargs = dict(with_centering=with_centering,
                         with_scaling=with_scaling,
                         quantile_range=quantile_range,
                         copy=True)

    cu_scaler = cuRobustScaler(**scaler_kwargs)
    cu_out = cu_scaler.fit_transform(cu_input)
    cu_restored = cu_scaler.inverse_transform(cu_out)
    assert type(cu_out) == type(cu_input)
    assert type(cu_restored) == type(cu_out)

    sk_scaler = skRobustScaler(**scaler_kwargs)
    sk_out = sk_scaler.fit_transform(sk_input)
    sk_restored = sk_scaler.inverse_transform(sk_out)

    assert_allclose(cu_out, sk_out)
    assert_allclose(cu_restored, sk_restored)
def test_make_column_transformer(clf_dataset, remainder):  # noqa: F811
    """Compare cu_make_column_transformer with sk_make_column_transformer.

    A scaler is applied to columns 0 and 2 and a normalizer to columns
    1 and 3. Both ``fit_transform`` and ``transform`` results of the cu
    transformer must be close to the sk ``fit_transform`` result.
    """
    sk_input, cu_input = clf_dataset

    sk_scaled_cols, sk_normed_cols = [0, 2], [1, 3]
    if isinstance(cu_input, (pdDataFrame, cuDataFrame)):
        # DataFrame inputs are selected by label; presumably the fixture
        # names its columns 'c<i>' -- verify against the fixture.
        cu_scaled_cols = ['c' + str(idx) for idx in sk_scaled_cols]
        cu_normed_cols = ['c' + str(idx) for idx in sk_normed_cols]
    else:
        cu_scaled_cols, cu_normed_cols = sk_scaled_cols, sk_normed_cols

    cu_ct = cu_make_column_transformer(
        (cuStandardScaler(), cu_scaled_cols),
        (cuNormalizer(), cu_normed_cols),
        remainder=remainder)
    cu_fit_out = cu_ct.fit_transform(cu_input)
    cu_out = cu_ct.transform(cu_input)
    assert type(cu_out) == type(cu_input)

    sk_ct = sk_make_column_transformer(
        (skStandardScaler(), sk_scaled_cols),
        (skNormalizer(), sk_normed_cols),
        remainder=remainder)
    sk_out = sk_ct.fit_transform(sk_input)

    assert_allclose(cu_fit_out, sk_out)
    assert_allclose(cu_out, sk_out)
def test_make_column_transformer_sparse(sparse_clf_dataset,  # noqa: F811
                                        remainder, sparse_threshold):
    """Compare the cu and sk ``make_column_transformer`` on sparse input.

    Inputs in 'csc' format are marked as expected failures. The output
    type is only asserted when the computed density is below
    ``sparse_threshold``; the values are always compared against the sk
    result.
    """
    sk_input, cu_input = sparse_clf_dataset

    if cu_input.format == 'csc':
        pytest.xfail()

    # NOTE(review): for scipy/cupyx sparse matrices `.size` appears to
    # report the stored-element count, which would make this ratio 1.0
    # -- confirm whether a shape-based density was intended.
    density = cu_input.nnz / cu_input.size

    cu_ct = cu_make_column_transformer(
        (cuStandardScaler(with_mean=False), [0, 2]),
        (cuNormalizer(), [1, 3]),
        remainder=remainder,
        sparse_threshold=sparse_threshold)
    cu_fit_out = cu_ct.fit_transform(cu_input)
    cu_out = cu_ct.transform(cu_input)

    if density < sparse_threshold:
        # Below the threshold the output type must match the input
        # type; otherwise no type check is made.
        assert type(cu_out) == type(cu_input)

    sk_ct = sk_make_column_transformer(
        (skStandardScaler(with_mean=False), [0, 2]),
        (skNormalizer(), [1, 3]),
        remainder=remainder,
        sparse_threshold=sparse_threshold)
    sk_out = sk_ct.fit_transform(sk_input)

    assert_allclose(cu_fit_out, sk_out)
    assert_allclose(cu_out, sk_out)
def test_robust_scale_sparse(failure_logger,
                             sparse_clf_dataset,  # noqa: F811
                             axis, with_scaling, quantile_range):
    """Compare cu_robust_scale with sk_robust_scale on sparse input.

    The cu input is converted to CSC for ``axis == 0`` and to CSR for
    ``axis == 1`` when it is not already in that format. Centering is
    disabled for both implementations. ``failure_logger`` is not used
    in the body.
    """
    sk_input, cu_input = sparse_clf_dataset

    if axis == 0 and cu_input.format != 'csc':
        cu_input = cu_input.tocsc()
    elif axis == 1 and cu_input.format != 'csr':
        cu_input = cu_input.tocsr()

    scale_kwargs = dict(axis=axis,
                        with_centering=False,
                        with_scaling=with_scaling,
                        quantile_range=quantile_range,
                        copy=True)

    cu_out = cu_robust_scale(cu_input, **scale_kwargs)

    # Exact type equality is not asserted; sparseness must be preserved
    # within the same family (cupyx or scipy).
    for issparse in (cpx.scipy.sparse.issparse, scipy.sparse.issparse):
        if issparse(cu_input):
            assert issparse(cu_out)

    sk_out = sk_robust_scale(sk_input, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
def test_imputer_sparse(sparse_imputer_dataset,  # noqa: F811
                        strategy):
    """Compare cuSimpleImputer with skSimpleImputer on sparse input.

    CSR inputs are skipped. Both imputers receive the same randomly
    drawn ``fill_value``.
    """
    missing_values, sk_input, cu_input = sparse_imputer_dataset

    if cu_input.format == 'csr':
        pytest.skip("Skipping CSR matrices")

    # NOTE(review): drawn without seeding in this function, so the
    # value may differ between runs -- confirm this is acceptable.
    fill_value = np.random.randint(10, size=1)[0]

    imputer_kwargs = dict(copy=True, missing_values=missing_values,
                          strategy=strategy, fill_value=fill_value)

    cu_imputer = cuSimpleImputer(**imputer_kwargs)
    cu_out = cu_imputer.fit_transform(cu_input)

    # Exact type equality is not asserted; sparseness must be preserved
    # within the same family (cupyx or scipy).
    for issparse in (cpx.scipy.sparse.issparse, scipy.sparse.issparse):
        if issparse(cu_input):
            assert issparse(cu_out)

    sk_imputer = skSimpleImputer(**imputer_kwargs)
    sk_out = sk_imputer.fit_transform(sk_input)

    assert_allclose(cu_out, sk_out)
def test_imputer(failure_logger, random_seed, int_dataset,  # noqa: F811
                 strategy, missing_values, add_indicator):
    """Compare cuSimpleImputer with skSimpleImputer.

    The dataset variant is chosen from ``missing_values`` (0, 1, or
    anything else). NumPy's global RNG is seeded with ``random_seed``
    before ``fill_value`` is drawn. ``failure_logger`` is not used in
    the body.
    """
    zero_filled, one_filled, nan_filled = int_dataset

    # Pick the dataset pair whose placeholder matches `missing_values`.
    if missing_values == 0:
        chosen = zero_filled
    elif missing_values == 1:
        chosen = one_filled
    else:
        chosen = nan_filled
    sk_input, cu_input = chosen

    np.random.seed(random_seed)
    fill_value = np.random.randint(10, size=1)[0]

    imputer_kwargs = dict(copy=True, missing_values=missing_values,
                          strategy=strategy, fill_value=fill_value,
                          add_indicator=add_indicator)

    cu_imputer = cuSimpleImputer(**imputer_kwargs)
    cu_out = cu_imputer.fit_transform(cu_input)
    assert type(cu_out) == type(cu_input)

    sk_imputer = skSimpleImputer(**imputer_kwargs)
    sk_out = sk_imputer.fit_transform(sk_input)

    assert_allclose(cu_out, sk_out)
def test_make_column_selector():
    """Compare cu_make_column_selector with sk_make_column_selector.

    Columns of a small three-column frame are routed to a one-hot
    encoder (non-numeric dtypes), a scaler (integer dtypes) and a
    normalizer (names matching "temp"); remaining columns are dropped.
    """
    pandas_frame = pdDataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'rating': [5, 3, 4, 5],
        'temperature': [21., 21., 24., 28.]
    })
    cudf_frame = cudf.from_pandas(pandas_frame)

    cu_steps = [
        ("ohe", cuOneHotEncoder(),
         cu_make_column_selector(dtype_exclude=np.number)),
        ("scaler", cuStandardScaler(),
         cu_make_column_selector(dtype_include=np.integer)),
        ("normalizer", cuNormalizer(),
         cu_make_column_selector(pattern="temp")),
    ]
    cu_ct = cuColumnTransformer(cu_steps, remainder='drop')
    cu_out = cu_ct.fit_transform(cudf_frame)

    sk_steps = [
        ("ohe", skOneHotEncoder(),
         sk_make_column_selector(dtype_exclude=np.number)),
        ("scaler", skStandardScaler(),
         sk_make_column_selector(dtype_include=np.integer)),
        ("normalizer", skNormalizer(),
         sk_make_column_selector(pattern="temp")),
    ]
    sk_ct = skColumnTransformer(sk_steps, remainder='drop')
    sk_out = sk_ct.fit_transform(pandas_frame)

    assert_allclose(cu_out, sk_out)
    assert type(cu_out) == type(cudf_frame)
def test_normalize(failure_logger, clf_dataset, axis, norm,  # noqa: F811
                   return_norm):
    """Compare cu_normalize with sk_normalize.

    When ``return_norm`` is truthy both functions return a pair and the
    norms are compared as well. ``failure_logger`` is not used in the
    body.
    """
    sk_input, cu_input = clf_dataset

    norm_kwargs = dict(axis=axis, norm=norm, return_norm=return_norm)

    if not return_norm:
        cu_out = cu_normalize(cu_input, **norm_kwargs)
        sk_out = sk_normalize(sk_input, **norm_kwargs)
    else:
        cu_out, cu_norms = cu_normalize(cu_input, **norm_kwargs)
        sk_out, sk_norms = sk_normalize(sk_input, **norm_kwargs)
        assert_allclose(cu_norms, sk_norms)

    assert type(cu_out) == type(cu_input)
    assert_allclose(cu_out, sk_out)
def test_inplace_csr_row_normalize_l2(failure_logger, sparse_random_dataset):
    """Compare inplace_csr_row_normalize_l2 with sk_normalize (l2, axis=1).

    Only CSR inputs are exercised; other formats are skipped.
    ``failure_logger`` is not used in the body.
    """
    dense_ref, _, _, sparse_X = sparse_random_dataset

    if sparse_X.format != 'csr':
        pytest.skip('Skip non CSR matrices')

    # The helper's return value is ignored; the matrix itself is
    # compared afterwards.
    inplace_csr_row_normalize_l2(sparse_X)

    expected = sk_normalize(dense_ref, norm='l2', axis=1)

    assert_allclose(sparse_X, expected)
def test_add_dummy_feature(failure_logger, clf_dataset, value):  # noqa: F811
    """Compare cu_add_dummy_feature with sk_add_dummy_feature.

    The cu result must have the input's type and be close to the sk
    result. ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    cu_out = cu_add_dummy_feature(cu_input, value=value)
    assert type(cu_out) == type(cu_input)

    sk_out = sk_add_dummy_feature(sk_input, value=value)

    assert_allclose(cu_out, sk_out)
def test_maxabs_scale(failure_logger, clf_dataset, axis):  # noqa: F811
    """Compare cu_maxabs_scale with sk_maxabs_scale.

    The cu result must have the input's type and be close to the sk
    result. ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    cu_out = cu_maxabs_scale(cu_input, axis=axis)
    assert type(cu_out) == type(cu_input)

    sk_out = sk_maxabs_scale(sk_input, axis=axis)

    assert_allclose(cu_out, sk_out)
def test_binarize(failure_logger, clf_dataset, threshold):  # noqa: F811
    """Compare cu_binarize with sk_binarize.

    The cu result must have the input's type and be close to the sk
    result. ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    binarize_kwargs = dict(threshold=threshold, copy=True)

    cu_out = cu_binarize(cu_input, **binarize_kwargs)
    assert type(cu_out) == type(cu_input)

    sk_out = sk_binarize(sk_input, **binarize_kwargs)

    assert_allclose(cu_out, sk_out)
def test_normalizer(failure_logger, clf_dataset, norm):  # noqa: F811
    """Compare cuNormalizer with skNormalizer.

    The cu result must have the input's type and be close to the sk
    result. ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    normalizer_kwargs = dict(norm=norm, copy=True)

    cu_normalizer = cuNormalizer(**normalizer_kwargs)
    cu_out = cu_normalizer.fit_transform(cu_input)
    assert type(cu_out) == type(cu_input)

    sk_normalizer = skNormalizer(**normalizer_kwargs)
    sk_out = sk_normalizer.fit_transform(sk_input)

    assert_allclose(cu_out, sk_out)
def test_binarizer(failure_logger, clf_dataset, threshold):  # noqa: F811
    """Compare cuBinarizer with skBinarizer.

    The cu result must have the input's type and be close to the sk
    result. ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    binarizer_kwargs = dict(threshold=threshold, copy=True)

    cu_binarizer = cuBinarizer(**binarizer_kwargs)
    cu_out = cu_binarizer.fit_transform(cu_input)
    assert type(cu_out) == type(cu_input)

    sk_binarizer = skBinarizer(**binarizer_kwargs)
    sk_out = sk_binarizer.fit_transform(sk_input)

    assert_allclose(cu_out, sk_out)
def test_csc_mean_variance_axis0(failure_logger, sparse_random_dataset):
    """Compare csc_mean_variance_axis0 with NumPy's nanmean/nanvar.

    Only CSC inputs are exercised; other formats are skipped.
    ``failure_logger`` is not used in the body.
    """
    dense_ref, _, _, sparse_X = sparse_random_dataset

    if sparse_X.format != 'csc':
        pytest.skip('Skip non CSC matrices')

    cu_means, cu_variances = csc_mean_variance_axis0(sparse_X)

    # Reference statistics along axis 0, ignoring NaNs.
    expected_means = np.nanmean(dense_ref, axis=0)
    expected_variances = np.nanvar(dense_ref, axis=0)

    assert_allclose(cu_means, expected_means)
    assert_allclose(cu_variances, expected_variances)
def test_minmax_scale(failure_logger, clf_dataset,  # noqa: F811
                      axis, feature_range):
    """Compare cu_minmax_scale with sk_minmax_scale.

    The cu result must have the input's type and be close to the sk
    result. ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = clf_dataset

    scale_kwargs = dict(feature_range=feature_range, axis=axis)

    cu_out = cu_minmax_scale(cu_input, **scale_kwargs)
    assert type(cu_out) == type(cu_input)

    sk_out = sk_minmax_scale(sk_input, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
def test_add_dummy_feature_sparse(failure_logger,
                                  sparse_dataset_with_coo,  # noqa: F811
                                  value):
    """Compare cu_add_dummy_feature with sk_add_dummy_feature (sparse).

    ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = sparse_dataset_with_coo

    cu_out = cu_add_dummy_feature(cu_input, value=value)

    # Exact type equality is not asserted; sparseness must be preserved
    # within the same family (cupyx or scipy).
    for issparse in (cpx.scipy.sparse.issparse, scipy.sparse.issparse):
        if issparse(cu_input):
            assert issparse(cu_out)

    sk_out = sk_add_dummy_feature(sk_input, value=value)

    assert_allclose(cu_out, sk_out)
def test_binarize_sparse(failure_logger,
                         sparse_clf_dataset,  # noqa: F811
                         threshold):
    """Compare cu_binarize with sk_binarize on sparse input.

    ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = sparse_clf_dataset

    binarize_kwargs = dict(threshold=threshold, copy=True)

    cu_out = cu_binarize(cu_input, **binarize_kwargs)

    # Exact type equality is not asserted; sparseness must be preserved
    # within the same family (cupyx or scipy).
    for issparse in (cpx.scipy.sparse.issparse, scipy.sparse.issparse):
        if issparse(cu_input):
            assert issparse(cu_out)

    sk_out = sk_binarize(sk_input, **binarize_kwargs)

    assert_allclose(cu_out, sk_out)
def test_scale_sparse(failure_logger,
                      sparse_clf_dataset,  # noqa: F811
                      with_std):
    """Compare cu_scale with sk_scale on sparse input.

    Both calls use ``with_mean=False`` and ``copy=True``.
    ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = sparse_clf_dataset

    scale_kwargs = dict(with_mean=False, with_std=with_std, copy=True)

    cu_out = cu_scale(cu_input, **scale_kwargs)

    # Exact type equality is not asserted; sparseness must be preserved
    # within the same family (cupyx or scipy).
    for issparse in (cpx.scipy.sparse.issparse, scipy.sparse.issparse):
        if issparse(cu_input):
            assert issparse(cu_out)

    sk_out = sk_scale(sk_input, **scale_kwargs)

    assert_allclose(cu_out, sk_out)
def test_maxabs_scaler(failure_logger, clf_dataset):  # noqa: F811
    """Compare cuMaxAbsScaler with skMaxAbsScaler.

    Checks that transform and inverse transform keep the input type and
    that both results are close to the sk results. ``failure_logger``
    is requested as a fixture only and is not used in the body.
    """
    X_np, X = clf_dataset

    scaler = cuMaxAbsScaler(copy=True)
    # A single fit is sufficient: inverse_transform relies on the state
    # learned by this call.
    t_X = scaler.fit_transform(X)
    r_X = scaler.inverse_transform(t_X)
    assert type(t_X) == type(X)
    assert type(r_X) == type(t_X)

    scaler = skMaxAbsScaler(copy=True)
    sk_t_X = scaler.fit_transform(X_np)
    sk_r_X = scaler.inverse_transform(sk_t_X)

    assert_allclose(t_X, sk_t_X)
    assert_allclose(r_X, sk_r_X)
def test_binarizer_sparse(failure_logger,
                          sparse_clf_dataset,  # noqa: F811
                          threshold):
    """Compare cuBinarizer with skBinarizer on sparse input.

    ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = sparse_clf_dataset

    binarizer_kwargs = dict(threshold=threshold, copy=True)

    cu_binarizer = cuBinarizer(**binarizer_kwargs)
    cu_out = cu_binarizer.fit_transform(cu_input)

    # Exact type equality is not asserted; sparseness must be preserved
    # within the same family (cupyx or scipy).
    for issparse in (cpx.scipy.sparse.issparse, scipy.sparse.issparse):
        if issparse(cu_input):
            assert issparse(cu_out)

    sk_binarizer = skBinarizer(**binarizer_kwargs)
    sk_out = sk_binarizer.fit_transform(sk_input)

    assert_allclose(cu_out, sk_out)
def test_normalize_sparse(failure_logger,
                          sparse_clf_dataset,  # noqa: F811
                          norm):
    """Compare cu_normalize with sk_normalize on sparse input.

    The axis follows the input format: 0 for 'csc', 1 otherwise.
    ``failure_logger`` is not used in the body.
    """
    sk_input, cu_input = sparse_clf_dataset

    if cu_input.format == 'csc':
        axis = 0
    else:
        axis = 1

    norm_kwargs = dict(axis=axis, norm=norm)

    cu_out = cu_normalize(cu_input, **norm_kwargs)

    # Exact type equality is not asserted; sparseness must be preserved
    # within the same family (cupyx or scipy).
    for issparse in (cpx.scipy.sparse.issparse, scipy.sparse.issparse):
        if issparse(cu_input):
            assert issparse(cu_out)

    sk_out = sk_normalize(sk_input, **norm_kwargs)

    assert_allclose(cu_out, sk_out)
def test_function_transformer_sparse(sparse_clf_dataset):  # noqa: F811
    """Compare cuFunctionTransformer with skFunctionTransformer (sparse).

    The forward function doubles its input and the inverse halves it.
    Both cu outputs must be sparse (cupyx or scipy) and close to the sk
    results.
    """
    sk_input, cu_input = sparse_clf_dataset

    def _is_any_sparse(matrix):
        # True when `matrix` is a cupyx or a scipy sparse matrix.
        return (cpx.scipy.sparse.issparse(matrix)
                or scipy.sparse.issparse(matrix))

    cu_ft = cuFunctionTransformer(func=lambda x: x * 2,
                                  inverse_func=lambda x: x / 2,
                                  accept_sparse=True)
    cu_out = cu_ft.fit_transform(cu_input)
    cu_restored = cu_ft.inverse_transform(cu_out)
    assert _is_any_sparse(cu_out)
    assert _is_any_sparse(cu_restored)

    sk_ft = skFunctionTransformer(func=lambda x: x * 2,
                                  inverse_func=lambda x: x / 2,
                                  accept_sparse=True)
    sk_out = sk_ft.fit_transform(sk_input)
    sk_restored = sk_ft.inverse_transform(sk_out)

    assert_allclose(cu_out, sk_out)
    assert_allclose(cu_restored, sk_restored)
def test_function_transformer(clf_dataset):  # noqa: F811
    """Compare cuFunctionTransformer with skFunctionTransformer.

    The cu transformer uses ``cp.exp``/``cp.log`` and the sk one uses
    ``np.exp``/``np.log``, both with ``check_inverse=False``. Transform
    and inverse transform must keep the input type and be close to the
    sk results.
    """
    sk_input, cu_input = clf_dataset

    cu_ft = cuFunctionTransformer(func=cp.exp,
                                  inverse_func=cp.log,
                                  check_inverse=False)
    cu_out = cu_ft.fit_transform(cu_input)
    cu_restored = cu_ft.inverse_transform(cu_out)
    assert type(cu_out) == type(cu_input)
    assert type(cu_restored) == type(cu_out)

    sk_ft = skFunctionTransformer(func=np.exp,
                                  inverse_func=np.log,
                                  check_inverse=False)
    sk_out = sk_ft.fit_transform(sk_input)
    sk_restored = sk_ft.inverse_transform(sk_out)

    assert_allclose(cu_out, sk_out)
    assert_allclose(cu_restored, sk_restored)