from cuml.linear_model import MBSGDClassifier as cumlMBSGClassifier from cuml.testing.utils import unit_param, quality_param, stress_param from sklearn.linear_model import SGDClassifier from cuml.datasets import make_classification from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split @pytest.fixture(scope="module", params=[ unit_param([500, 20, 10, np.float32]), unit_param([500, 20, 10, np.float64]), quality_param([5000, 100, 50, np.float32]), quality_param([5000, 100, 50, np.float64]), stress_param([500000, 1000, 500, np.float32]), stress_param([500000, 1000, 500, np.float64]), ], ids=[ '500-20-10-f32', '500-20-10-f64', '5000-100-50-f32', '5000-100-50-f64', '500000-1000-500-f32', '500000-1000-500-f64' ]) def make_dataset(request): nrows, ncols, n_info, datatype = request.param X, y = make_classification(n_samples=nrows, n_informative=n_info, n_features=ncols, random_state=10) X = X.astype(datatype) y = y.astype(datatype)
except TimeoutError: pytest.skip(f"sklearn did not finish within {t} seconds.") @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize( "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"]) @pytest.mark.parametrize("dims", [ unit_param((3, 1)), unit_param((100, 1)), unit_param((1000, 10)), unit_param((100, 100)), unit_param((100, 300)), quality_param((10000, 10)), quality_param((10000, 50)), stress_param((100000, 1000)) ]) def test_regression_basic(datatype, loss, dims): run_regression(datatype, loss, 0, dims) @pytest.mark.parametrize( "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"]) @pytest.mark.parametrize("epsilon", [0, 0.001, 0.1]) @pytest.mark.parametrize("dims", [ quality_param((10000, 10)), quality_param((10000, 50)), quality_param((10000, 500)) ]) def test_regression_eps(loss, epsilon, dims): run_regression(np.float32, loss, epsilon, dims)
# NOTE(review): the two statements below are the tail of a test that starts
# before this excerpt. They are reproduced unchanged; the indentation of the
# enclosing definition cannot be recovered from here.
ref = cp.array([[2, 0, 0], [0, 0, 1], [1, 0, 2]])
cp.testing.assert_array_equal(cm, ref)


@pytest.mark.mg
@pytest.mark.parametrize('chunks', ['auto', 2, 1])
def test_confusion_matrix_binary(client, chunks):
    """Check the four raveled confusion-matrix cells for a fixed binary case.

    The labels are wrapped in dask arrays with the parametrized ``chunks``
    value, and the unpacked ``tn, fp, fn, tp`` are compared against the
    hard-coded reference ``[0, 2, 1, 1]``.
    """
    labels = da.from_array(cp.array([0, 1, 0, 1]), chunks=chunks)
    predictions = da.from_array(cp.array([1, 1, 1, 0]), chunks=chunks)
    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()
    expected = cp.array([0, 2, 1, 1])
    observed = cp.array([tn, fp, fn, tp])
    cp.testing.assert_array_equal(expected, observed)


@pytest.mark.mg
@pytest.mark.parametrize('n_samples', [50, 3000, stress_param(500000)])
@pytest.mark.parametrize('dtype', [np.int32, np.int64])
@pytest.mark.parametrize('problem_type', ['binary', 'multiclass'])
def test_confusion_matrix_random(n_samples, dtype, problem_type, client):
    """Compare ``confusion_matrix`` with ``sk_confusion_matrix`` on random labels.

    Labels are drawn through ``generate_random_labels`` with integers in
    ``[0, 2)`` for the binary case and ``[0, 1000)`` otherwise. The first two
    returned arrays are wrapped in dask arrays for ``confusion_matrix``; the
    ``np_*`` arrays feed the reference implementation.
    """
    if problem_type == 'binary':
        upper_range = 2
    else:
        upper_range = 1000

    def draw_labels(rng):
        # Same generator expression the test hands to generate_random_labels.
        return rng.randint(0, upper_range, n_samples).astype(dtype)

    y_true, y_pred, np_y_true, np_y_pred = generate_random_labels(
        draw_labels, as_cupy=True)
    dask_true = da.from_array(y_true)
    dask_pred = da.from_array(y_pred)
    cm = confusion_matrix(dask_true, dask_pred)
    ref = sk_confusion_matrix(np_y_true, np_y_pred)
    cp.testing.assert_array_almost_equal(ref, cm, decimal=4)
cu_y_pred = cuml_kmeans.fit_predict(X) cu_score = adjusted_rand_score(cu_y_pred, y) kmeans = cluster.KMeans(random_state=random_state, n_clusters=params['n_clusters']) sk_y_pred = kmeans.fit_predict(X) sk_score = adjusted_rand_score(sk_y_pred, y) assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2 @pytest.mark.parametrize('name', dataset_names) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) def test_kmeans_sklearn_comparison_default(name, nrows, random_state): default_base = { 'quantile': .3, 'eps': .3, 'damping': .9, 'preference': -200, 'n_neighbors': 10, 'n_clusters': 3 } pat = get_pattern(name, nrows) params = default_base.copy() params.update(pat[1])
import pytest import dask.array as da import numpy as np import cupy as cp from cuml.dask.datasets.blobs import make_blobs from cuml.dask.common.input_utils import DistributedDataHandler from cuml.testing.utils import unit_param, quality_param, stress_param from cuml.dask.common.part_utils import _extract_partitions @pytest.mark.parametrize('nrows', [unit_param(1e3), quality_param(1e5), stress_param(1e6)]) @pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('centers', [10]) @pytest.mark.parametrize("cluster_std", [0.1]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("nparts", [unit_param(1), unit_param(7), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize("order", ['F', 'C']) def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts,
assert array_equal(result["rf_res"], pickled_model.predict(X_test)) # Confirm no crash from score pickled_model.score(X_test, np.zeros(X_test.shape[0]), predict_model="GPU") pickle_save_load(tmpdir, create_mod, assert_model) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('keys', regression_models.keys()) @pytest.mark.parametrize( 'data_size', [unit_param([500, 20, 10]), stress_param([500000, 1000, 500])]) @pytest.mark.parametrize('fit_intercept', [True, False]) def test_regressor_pickle(tmpdir, datatype, keys, data_size, fit_intercept): if data_size[0] == 500000 and datatype == np.float64 and \ ("LogisticRegression" in keys or "Ridge" in keys) and \ pytest.max_gpu_memory < 32: if pytest.adapt_stress_test: data_size[0] = data_size[0] * pytest.max_gpu_memory // 640 data_size[1] = data_size[1] * pytest.max_gpu_memory // 640 data_size[2] = data_size[2] * pytest.max_gpu_memory // 640 else: pytest.skip("Insufficient GPU memory for this test." "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'") result = {} def create_mod():
n_classes=num_classes, random_state=0, ) X = X.astype(datatype) y = y.astype(np.int32) X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8, random_state=10 ) return X_train, X_test, y_train, y_test @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("algorithm", ["eig", "svd"]) @pytest.mark.parametrize( "nrows", [unit_param(1000), quality_param(5000), stress_param(500000)] ) @pytest.mark.parametrize( "column_info", [ unit_param([20, 10]), quality_param([100, 50]), stress_param([1000, 500]) ], ) def test_linear_regression_model(datatype, algorithm, nrows, column_info): if algorithm == "svd" and nrows > 46340: pytest.skip("svd solver is not supported for the data that has more" "than 46340 rows or columns if you are using CUDA version" "10.x")
quality_param, stress_param from sklearn import datasets from sklearn.datasets import make_multilabel_classification from sklearn.decomposition import PCA as skPCA from sklearn.datasets import make_blobs from cuml.common.exceptions import NotFittedError @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('input_type', ['ndarray']) @pytest.mark.parametrize('use_handle', [True, False]) @pytest.mark.parametrize( 'name', [unit_param(None), quality_param('digits'), stress_param('blobs')]) def test_pca_fit(datatype, input_type, name, use_handle): if name == 'blobs': pytest.skip('fails when using blobs dataset') X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0) elif name == 'digits': X, _ = datasets.load_digits(return_X_y=True) else: X, Y = make_multilabel_classification(n_samples=500, n_classes=2, n_labels=1, allow_unlabeled=False, random_state=1)
cuSVC.fit(X_train, y_train) sklSVC = svm.SVC(**params) sklSVC.fit(X_train, y_train) compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True) @pytest.mark.parametrize('params', [ {'kernel': 'linear', 'C': 1}, {'kernel': 'rbf', 'C': 1, 'gamma': 1}, {'kernel': 'poly', 'C': 1, 'gamma': 1}, ]) @pytest.mark.parametrize('dataset', ['classification2', 'gaussian', 'blobs']) @pytest.mark.parametrize('n_rows', [3, unit_param(100), quality_param(1000), stress_param(5000)]) @pytest.mark.parametrize('n_cols', [2, unit_param(100), quality_param(1000), stress_param(1000)]) def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols): if (params['kernel'] == 'linear' and dataset in ['gaussian', 'classification2'] and n_rows > 1000 and n_cols >= 1000): # linear kernel will not fit the gaussian dataset, but takes very long return X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols) # Default to numpy for testing with cuml.using_output_type("numpy"): cuSVC = cu_svm.SVC(**params) cuSVC.fit(X_train, y_train)
X = dask_cudf.from_cudf(X, npartitions=2) Y_ohe = cp.array([[0., 0., 1., 0.], [0., 1., 0., 1.]]) Y_ohe = da.from_array(Y_ohe) enc = OneHotEncoder(handle_unknown='ignore') enc = enc.fit(X) df = enc.inverse_transform(Y_ohe) ref = DataFrame({'chars': [None, 'b'], 'int': [0, 2]}) assert_frame_equal(df.compute().to_pandas(), ref.to_pandas()) @pytest.mark.mg @pytest.mark.parametrize('drop', [None, 'first']) @pytest.mark.parametrize('as_array', [True, False], ids=['cupy', 'cudf']) @pytest.mark.parametrize('sparse', [True, False], ids=['sparse', 'dense']) @pytest.mark.parametrize("n_samples", [10, 1000, stress_param(50000)]) def test_onehot_random_inputs(client, drop, as_array, sparse, n_samples): X, ary = generate_inputs_from_categories(n_samples=n_samples, as_array=as_array) if as_array: dX = da.from_array(X) else: dX = dask_cudf.from_cudf(X, npartitions=1) enc = OneHotEncoder(sparse=sparse, drop=drop, categories='auto') sk_enc = SkOneHotEncoder(sparse=sparse, drop=drop, categories='auto') ohe = enc.fit_transform(dX) ref = sk_enc.fit_transform(ary) if sparse: cp.testing.assert_array_equal(ohe.compute().toarray(), ref.toarray()) else:
""" X = dataset.data tsne = TSNE(n_components=2, random_state=1, n_neighbors=DEFAULT_N_NEIGHBORS, learning_rate_method='none', method=method, min_grad_norm=1e-12, perplexity=DEFAULT_PERPLEXITY) Y = tsne.fit_transform(X) validate_embedding(X, Y) @pytest.mark.parametrize('nrows', [stress_param(2400000)]) @pytest.mark.parametrize('ncols', [stress_param(225)]) @pytest.mark.parametrize('method', ['fft', 'barnes_hut']) def test_tsne_large(nrows, ncols, method): """ This tests how TSNE handles large input """ X, y = make_blobs(n_samples=nrows, centers=8, n_features=ncols, random_state=1).astype(np.float32) tsne = TSNE(random_state=1, exaggeration_iter=1, n_iter=2, method=method,
@pytest.fixture(
    scope="session",
    params=[
        unit_param(
            {"n_samples": 350, "n_features": 20, "n_informative": 10}
        ),
        quality_param(
            {"n_samples": 5000, "n_features": 200, "n_informative": 80}
        ),
        stress_param(
            {"n_samples": 500000, "n_features": 400, "n_informative": 180}
        ),
    ],
)
def small_clf(request):
    """Session-scoped two-class dataset built with ``make_classification``.

    ``request.param`` is one of the dictionaries listed in ``params`` and
    supplies ``n_samples``, ``n_features`` and ``n_informative``. The seed
    (``random_state=123``), ``n_classes=2`` and ``n_clusters_per_class=1``
    are fixed.

    Returns the ``(X, y)`` pair produced by ``make_classification``.
    """
    sizes = request.param
    features, labels = make_classification(
        n_samples=sizes["n_samples"],
        n_features=sizes["n_features"],
        n_clusters_per_class=1,
        n_informative=sizes["n_informative"],
        random_state=123,
        n_classes=2,
    )
    return features, labels
from cuml.testing.utils import get_pattern, unit_param, \ quality_param, stress_param, array_equal, assert_dbscan_equal from sklearn.cluster import DBSCAN as skDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import pairwise_distances from sklearn.preprocessing import StandardScaler @pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None]) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('use_handle', [True, False]) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'ncols', [unit_param(20), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('out_dtype', [ unit_param("int32"), unit_param(np.int32), unit_param("int64"), unit_param(np.int64), quality_param("int32"), stress_param("int32") ]) def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch, out_dtype): if nrows == 500000 and pytest.max_gpu_memory < 32:
X_train_df, = dask_utils.persist_across_workers(c, [X_train_df], workers=list(workers)) return X_train_df def _scale_rows(client, nrows): workers = list(client.scheduler_info()['workers'].keys()) n_workers = len(workers) return n_workers * nrows @pytest.mark.parametrize( "nrows", [unit_param(300), quality_param(1e6), stress_param(5e8)]) @pytest.mark.parametrize("ncols", [10, 30]) @pytest.mark.parametrize( "nclusters", [unit_param(5), quality_param(10), stress_param(15)]) @pytest.mark.parametrize( "n_neighbors", [unit_param(10), quality_param(4), stress_param(100)]) @pytest.mark.parametrize( "n_parts", [unit_param(1), unit_param(5), quality_param(7), stress_param(50)])
# Table of test cases: each entry pairs an 8-element parameter tuple with a
# dataset descriptor (the ``test_*`` names are defined elsewhere in the file).
# Entries wrapped in ``stress_param`` pass the same two items as arguments.
# NOTE(review): the tuple presumably reads (p, d, q, P, D, Q, s, k), matching
# the key layout documented for ``lazy_ref_fit`` below -- confirm.
test_data = [
    # ((1, 0, 1, 0, 0, 0, 0, 1), test_101c),
    ((0, 0, 2, 0, 0, 0, 0, 1), test_002c),
    ((0, 1, 0, 0, 0, 0, 0, 1), test_010c),
    ((1, 1, 0, 0, 0, 0, 0, 0), test_110),
    ((0, 1, 1, 0, 0, 0, 0, 1), test_011c),
    ((0, 1, 1, 0, 0, 0, 0, 1), test_011c_exog),
    ((1, 2, 1, 0, 0, 0, 0, 1), test_121c),
    ((1, 1, 1, 0, 0, 0, 0, 1), test_111c_missing),
    ((1, 0, 1, 1, 1, 1, 4, 0), test_101_111_4),
    ((5, 1, 0, 0, 0, 0, 0, 0), test_510),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c_missing),
    ((1, 1, 1, 2, 0, 0, 4, 1), test_111_200_4c_missing_exog),
    ((1, 1, 2, 0, 1, 2, 4, 0), test_112_012_4),
    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12),
    stress_param((1, 1, 1, 1, 1, 1, 12, 0), test_111_111_12_missing),
    stress_param((1, 0, 1, 1, 1, 1, 12, 1), test_111_111_12c_missing_exog),
]

# Dictionary for lazy-loading of datasets
# (name, dtype) -> (pandas dataframe, cuDF dataframe)
lazy_data = {}

# Dictionary for lazy-evaluation of reference fits
# (p, d, q, P, D, Q, s, k, name, dtype) -> SARIMAXResults
lazy_ref_fit = {}


def extract_order(tup):
    """Extract the order from a tuple of parameters"""
n_samples = np_array.shape[0] n_samples_per_part = int(n_samples / n_parts) chunks = [n_samples_per_part] * n_parts chunks[-1] += n_samples % n_samples_per_part chunks = tuple(chunks) return da.from_array(np_array, chunks=(chunks, -1)) @pytest.fixture( scope="module", params=[ unit_param({'n_samples': 3000, 'n_features': 30, 'n_classes': 5, 'n_targets': 2}), quality_param({'n_samples': 8000, 'n_features': 35, 'n_classes': 12, 'n_targets': 3}), stress_param({'n_samples': 20000, 'n_features': 40, 'n_classes': 12, 'n_targets': 4}) ]) def dataset(request): X, y = make_multilabel_classification( n_samples=int(request.param['n_samples'] * 1.2), n_features=request.param['n_features'], n_classes=request.param['n_classes'], n_labels=request.param['n_classes'], length=request.param['n_targets']) new_x = [] new_y = [] for i in range(y.shape[0]): a = np.argwhere(y[i] == 1)[:, 0] if len(a) >= request.param['n_targets']: new_x.append(i) np.random.shuffle(a)
from cuml.testing.utils import get_pattern, unit_param, \ quality_param, stress_param, array_equal, assert_dbscan_equal from sklearn.cluster import DBSCAN as skDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import pairwise_distances from sklearn.preprocessing import StandardScaler @pytest.mark.mg @pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None]) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'ncols', [unit_param(20), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('out_dtype', [ unit_param("int32"), unit_param(np.int32), unit_param("int64"), unit_param(np.int64), quality_param("int32"), stress_param("int32") ]) def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype, client): from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN
from cuml import TruncatedSVD as cuTSVD from cuml.testing.utils import get_handle from cuml.testing.utils import array_equal, unit_param, \ quality_param, stress_param from sklearn.datasets import make_blobs from sklearn.decomposition import TruncatedSVD as skTSVD from sklearn.utils import check_random_state @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('use_handle', [True, False]) @pytest.mark.parametrize( 'name', [unit_param(None), quality_param('random'), stress_param('blobs')]) def test_tsvd_fit(datatype, name, use_handle): if name == 'blobs': X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0) elif name == 'random': pytest.skip('fails when using random dataset ' 'used by sklearn for testing') shape = 5000, 100 rng = check_random_state(42) X = rng.randint(-100, 20, np.product(shape)).reshape(shape) else: n, p = 500, 5 rng = np.random.RandomState(0)
from cuml.testing.utils import quality_param from cuml.testing.utils import stress_param import dask.array as da from cuml.metrics import adjusted_rand_score from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score from cuml.dask.common.dask_arr_utils import to_dask_cudf @pytest.mark.mg @pytest.mark.parametrize( "nrows", [unit_param(1e3), quality_param(1e5), stress_param(5e6)]) @pytest.mark.parametrize("ncols", [10, 30]) @pytest.mark.parametrize( "nclusters", [unit_param(5), quality_param(10), stress_param(50)]) @pytest.mark.parametrize( "n_parts", [unit_param(None), quality_param(7), stress_param(50)]) @pytest.mark.parametrize("delayed_predict", [True, False]) @pytest.mark.parametrize("input_type", ["dataframe", "array"]) def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict, input_type, client): from cuml.dask.cluster import KMeans as cumlKMeans
from cuml.dask.linear_model import ElasticNet from cuml.dask.linear_model import Lasso from cuml.metrics import r2_score from cuml.testing.utils import unit_param, quality_param, stress_param import numpy as np @pytest.mark.mg @pytest.mark.parametrize('dtype', [np.float32, np.float64]) @pytest.mark.parametrize('alpha', [0.001]) @pytest.mark.parametrize('algorithm', ['cyclic', 'random']) @pytest.mark.parametrize( 'nrows', [unit_param(50), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize('column_info', [ unit_param([20, 10]), quality_param([100, 50]), stress_param([1000, 500]) ]) @pytest.mark.parametrize( 'n_parts', [unit_param(4), quality_param(32), stress_param(64)]) @pytest.mark.parametrize("delayed", [True, False]) def test_lasso(dtype, alpha, algorithm, nrows, column_info, n_parts, delayed, client): ncols, n_info = column_info X, y = make_regression(n_samples=nrows,
import pytest import numpy as np from cuml.testing.utils import array_equal, \ unit_param, stress_param import cupy as cp from cuml.dask.common.dask_arr_utils import to_dask_cudf @pytest.mark.mg @pytest.mark.parametrize( "data_info", [unit_param([1000, 20, 30]), stress_param([int(9e6), 5000, 30])]) @pytest.mark.parametrize("input_type", ["dataframe", "array"]) def test_pca_fit(data_info, input_type, client): nrows, ncols, n_parts = data_info if nrows == int(9e6) and pytest.max_gpu_memory < 48: if pytest.adapt_stress_test: nrows = nrows * pytest.max_gpu_memory // 256 ncols = ncols * pytest.max_gpu_memory // 256 else: pytest.skip("Insufficient GPU memory for this test." "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'") from cuml.dask.decomposition import TruncatedSVD as daskTPCA from sklearn.decomposition import TruncatedSVD