import cudf def generate_dask_array(np_array, n_parts): n_samples = np_array.shape[0] n_samples_per_part = int(n_samples / n_parts) chunks = [n_samples_per_part] * n_parts chunks[-1] += n_samples % n_samples_per_part chunks = tuple(chunks) return da.from_array(np_array, chunks=(chunks, -1)) @pytest.fixture( scope="module", params=[ unit_param({'n_samples': 3000, 'n_features': 30, 'n_classes': 5, 'n_targets': 2}), quality_param({'n_samples': 8000, 'n_features': 35, 'n_classes': 12, 'n_targets': 3}), stress_param({'n_samples': 20000, 'n_features': 40, 'n_classes': 12, 'n_targets': 4}) ]) def dataset(request): X, y = make_multilabel_classification( n_samples=int(request.param['n_samples'] * 1.2), n_features=request.param['n_features'], n_classes=request.param['n_classes'], n_labels=request.param['n_classes'], length=request.param['n_targets']) new_x = [] new_y = [] for i in range(y.shape[0]):
max_iter=skit, dual=skdual) skm.fit(X_train, y_train) return skm.score(X_test, y_test) sks = with_timeout(timeout=t, target=run_sklearn) good_enough(cus, sks, nrows) except TimeoutError: pytest.skip(f"sklearn did not finish within {t} seconds.") @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize( "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"]) @pytest.mark.parametrize("dims", [ unit_param((3, 1)), unit_param((100, 1)), unit_param((1000, 10)), unit_param((100, 100)), unit_param((100, 300)), quality_param((10000, 10)), quality_param((10000, 50)), stress_param((100000, 1000)) ]) def test_regression_basic(datatype, loss, dims): run_regression(datatype, loss, 0, dims) @pytest.mark.parametrize( "loss", ["epsilon_insensitive", "squared_epsilon_insensitive"]) @pytest.mark.parametrize("epsilon", [0, 0.001, 0.1])
    X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions)
    # Persist the partitions onto the given workers so later collective
    # operations find the data where they expect it.
    X_train_df, = dask_utils.persist_across_workers(c,
                                                    [X_train_df],
                                                    workers=list(workers))
    return X_train_df


def _scale_rows(client, nrows):
    # Scale the requested row count by the number of Dask workers so the
    # per-worker workload stays constant as the cluster size grows.
    workers = list(client.scheduler_info()['workers'].keys())
    n_workers = len(workers)
    return n_workers * nrows


@pytest.mark.parametrize(
    "nrows", [unit_param(300), quality_param(1e6), stress_param(5e8)])
@pytest.mark.parametrize("ncols", [10, 30])
@pytest.mark.parametrize(
    "nclusters", [unit_param(5), quality_param(10), stress_param(15)])
@pytest.mark.parametrize(
    "n_neighbors", [unit_param(10), quality_param(4), stress_param(100)])
@pytest.mark.parametrize(
    "n_parts", [unit_param(1), unit_param(5), quality_param(7),
from cuml import Lasso as cuLasso from cuml.linear_model import ElasticNet as cuElasticNet from cuml.metrics import r2_score from cuml.testing.utils import unit_param, quality_param, stress_param from sklearn.linear_model import Lasso, ElasticNet from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('X_type', ['ndarray']) @pytest.mark.parametrize('alpha', [0.1, 0.001]) @pytest.mark.parametrize('algorithm', ['cyclic', 'random']) @pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize('column_info', [unit_param([20, 10]), quality_param([100, 50]), stress_param([1000, 500])]) @pytest.mark.filterwarnings("ignore:Objective did not converge::sklearn[.*]") def test_lasso(datatype, X_type, alpha, algorithm, nrows, column_info): ncols, n_info = column_info X, y = make_regression(n_samples=nrows, n_features=ncols, n_informative=n_info, random_state=0) X = X.astype(datatype) y = y.astype(datatype) X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0) cu_lasso = cuLasso(alpha=np.array([alpha]), fit_intercept=True,
import numpy as np import pytest import cupy as cp from cuml.linear_model import MBSGDClassifier as cumlMBSGClassifier from cuml.testing.utils import unit_param, quality_param, stress_param from sklearn.linear_model import SGDClassifier from cuml.datasets import make_classification from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split @pytest.fixture(scope="module", params=[ unit_param([500, 20, 10, np.float32]), unit_param([500, 20, 10, np.float64]), quality_param([5000, 100, 50, np.float32]), quality_param([5000, 100, 50, np.float64]), stress_param([500000, 1000, 500, np.float32]), stress_param([500000, 1000, 500, np.float64]), ], ids=[ '500-20-10-f32', '500-20-10-f64', '5000-100-50-f32', '5000-100-50-f64', '500000-1000-500-f32', '500000-1000-500-f64' ]) def make_dataset(request): nrows, ncols, n_info, datatype = request.param X, y = make_classification(n_samples=nrows, n_informative=n_info,
import pytest import dask.array as da import numpy as np import cupy as cp from cuml.dask.datasets.blobs import make_blobs from cuml.dask.common.input_utils import DistributedDataHandler from cuml.testing.utils import unit_param, quality_param, stress_param from cuml.dask.common.part_utils import _extract_partitions @pytest.mark.parametrize('nrows', [unit_param(1e3), quality_param(1e5), stress_param(1e6)]) @pytest.mark.parametrize('ncols', [unit_param(10), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('centers', [10]) @pytest.mark.parametrize("cluster_std", [0.1]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) @pytest.mark.parametrize("nparts", [unit_param(1), unit_param(7), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize("order", ['F', 'C']) def test_make_blobs(nrows, ncols, centers, cluster_std, dtype,
def make_dataset(datatype, nrows, ncols, n_info):
    # Build a synthetic regression problem and return an 80/20 split.
    # NOTE(review): y_test is deliberately dropped — only train data plus
    # the test features are returned. Confirm no caller needs y_test.
    # NOTE(review): train_test_split has no random_state here, so the
    # split differs between runs — verify this is intended.
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=0)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size=0.8)
    return X_train, y_train, X_test


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('key', rf_models.keys())
@pytest.mark.parametrize('nrows', [unit_param(500)])
@pytest.mark.parametrize('ncols', [unit_param(16)])
@pytest.mark.parametrize('n_info', [unit_param(7)])
@pytest.mark.parametrize('n_classes', [unit_param(2), unit_param(5)])
def test_rf_regression_pickle(tmpdir, datatype, nrows, ncols, n_info,
                              n_classes, key):
    result = {}
    # float64 pickling is unsupported for this estimator; mark as
    # expected failure early instead of asserting later.
    if datatype == np.float64:
        pytest.xfail("Pickling is not supported for dataset with"
                     " dtype float64")

    def create_mod():
        if key == 'RandomForestRegressor':
            X_train, y_train, X_test = make_dataset(datatype, nrows,
                                                    ncols, n_info)
import pytest from cuml.testing.utils import get_pattern, unit_param, \ quality_param, stress_param, array_equal, assert_dbscan_equal from sklearn.cluster import DBSCAN as skDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import pairwise_distances from sklearn.preprocessing import StandardScaler @pytest.mark.mg @pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None]) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'ncols', [unit_param(20), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('out_dtype', [ unit_param("int32"), unit_param(np.int32), unit_param("int64"), unit_param(np.int64), quality_param("int32"), stress_param("int32") ]) def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
from cuml.testing.utils import unit_param from cuml.testing.utils import quality_param from cuml.testing.utils import stress_param import dask.array as da from cuml.metrics import adjusted_rand_score from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score from cuml.dask.common.dask_arr_utils import to_dask_cudf @pytest.mark.mg @pytest.mark.parametrize( "nrows", [unit_param(1e3), quality_param(1e5), stress_param(5e6)]) @pytest.mark.parametrize("ncols", [10, 30]) @pytest.mark.parametrize( "nclusters", [unit_param(5), quality_param(10), stress_param(50)]) @pytest.mark.parametrize( "n_parts", [unit_param(None), quality_param(7), stress_param(50)]) @pytest.mark.parametrize("delayed_predict", [True, False]) @pytest.mark.parametrize("input_type", ["dataframe", "array"]) def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict, input_type, client):
cuSVC = cu_svm.SVC(**params) cuSVC.fit(X_train, y_train) sklSVC = svm.SVC(**params) sklSVC.fit(X_train, y_train) compare_svm(cuSVC, sklSVC, X_train, y_train, cmp_decision_func=True) @pytest.mark.parametrize('params', [ {'kernel': 'linear', 'C': 1}, {'kernel': 'rbf', 'C': 1, 'gamma': 1}, {'kernel': 'poly', 'C': 1, 'gamma': 1}, ]) @pytest.mark.parametrize('dataset', ['classification2', 'gaussian', 'blobs']) @pytest.mark.parametrize('n_rows', [3, unit_param(100), quality_param(1000), stress_param(5000)]) @pytest.mark.parametrize('n_cols', [2, unit_param(100), quality_param(1000), stress_param(1000)]) def test_svm_skl_cmp_datasets(params, dataset, n_rows, n_cols): if (params['kernel'] == 'linear' and dataset in ['gaussian', 'classification2'] and n_rows > 1000 and n_cols >= 1000): # linear kernel will not fit the gaussian dataset, but takes very long return X_train, X_test, y_train, y_test = make_dataset(dataset, n_rows, n_cols) # Default to numpy for testing with cuml.using_output_type("numpy"): cuSVC = cu_svm.SVC(**params)
n_classes=num_classes, random_state=0, ) X = X.astype(datatype) y = y.astype(np.int32) X_train, X_test, y_train, y_test = train_test_split( X, y, train_size=0.8, random_state=10 ) return X_train, X_test, y_train, y_test @pytest.mark.parametrize("datatype", [np.float32, np.float64]) @pytest.mark.parametrize("algorithm", ["eig", "svd"]) @pytest.mark.parametrize( "nrows", [unit_param(1000), quality_param(5000), stress_param(500000)] ) @pytest.mark.parametrize( "column_info", [ unit_param([20, 10]), quality_param([100, 50]), stress_param([1000, 500]) ], ) def test_linear_regression_model(datatype, algorithm, nrows, column_info): if algorithm == "svd" and nrows > 46340: pytest.skip("svd solver is not supported for the data that has more" "than 46340 rows or columns if you are using CUDA version" "10.x")
from sklearn.neighbors import NearestNeighbors
import joblib
from cuml.common import logger
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.manifold import trustworthiness
from sklearn.metrics import adjusted_rand_score

# Named datasets exercised by the comparison tests in this module.
dataset_names = ['iris', 'digits', 'wine', 'blobs']


@pytest.mark.parametrize('nrows', [unit_param(500), quality_param(5000),
                                   stress_param(500000)])
@pytest.mark.parametrize('n_feats', [unit_param(20), quality_param(100),
                                     stress_param(1000)])
def test_blobs_cluster(nrows, n_feats):
    # Embedding well-separated blobs must keep the 5 clusters perfectly
    # separable: KMeans on the embedding has to recover the generated
    # labels exactly (adjusted Rand index == 1.0). The assertion is
    # skipped at the stress size (nrows == 500000), where perfect
    # recovery is not checked.
    data, labels = datasets.make_blobs(
        n_samples=nrows, n_features=n_feats, centers=5, random_state=0)
    embedding = cuUMAP().fit_transform(data, convert_dtype=True)

    if nrows < 500000:
        score = adjusted_rand_score(labels,
                                    KMeans(5).fit_predict(embedding))
        assert score == 1.0
make_classification, make_regression, load_iris, load_breast_cancer, \ load_boston from sklearn.model_selection import train_test_split import treelite pytestmark = pytest.mark.filterwarnings("ignore: For reproducible results(.*)" "::cuml[.*]") @pytest.fixture( scope="session", params=[ unit_param({ "n_samples": 350, "n_features": 20, "n_informative": 10 }), quality_param({ "n_samples": 5000, "n_features": 200, "n_informative": 80 }), stress_param({ "n_samples": 500000, "n_features": 400, "n_informative": 180 }), ], ) def small_clf(request):
from cuml.testing.utils import get_handle from cuml import DBSCAN as cuDBSCAN from cuml.testing.utils import get_pattern, unit_param, \ quality_param, stress_param, array_equal, assert_dbscan_equal from sklearn.cluster import DBSCAN as skDBSCAN from sklearn.datasets import make_blobs from sklearn.metrics import pairwise_distances from sklearn.preprocessing import StandardScaler @pytest.mark.parametrize('max_mbytes_per_batch', [1e3, None]) @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('use_handle', [True, False]) @pytest.mark.parametrize( 'nrows', [unit_param(500), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize( 'ncols', [unit_param(20), quality_param(100), stress_param(1000)]) @pytest.mark.parametrize('out_dtype', [ unit_param("int32"), unit_param(np.int32), unit_param("int64"), unit_param(np.int64), quality_param("int32"), stress_param("int32") ]) def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
from cuml import PCA as cuPCA from cuml.testing.utils import get_handle, array_equal, unit_param, \ quality_param, stress_param from sklearn import datasets from sklearn.datasets import make_multilabel_classification from sklearn.decomposition import PCA as skPCA from sklearn.datasets import make_blobs from cuml.common.exceptions import NotFittedError @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('input_type', ['ndarray']) @pytest.mark.parametrize('use_handle', [True, False]) @pytest.mark.parametrize( 'name', [unit_param(None), quality_param('digits'), stress_param('blobs')]) def test_pca_fit(datatype, input_type, name, use_handle): if name == 'blobs': pytest.skip('fails when using blobs dataset') X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0) elif name == 'digits': X, _ = datasets.load_digits(return_X_y=True) else: X, Y = make_multilabel_classification(n_samples=500, n_classes=2, n_labels=1,
from test_linear_model import make_regression_dataset  # noqa: E402


def normalize_data(X, y):
    """Center *y* and mean-center / scale the columns of *X*.

    Each column of X is divided by sqrt(n_samples * var(column)); columns
    with zero variance keep a scale of 1 to avoid division by zero.

    Returns
    -------
    tuple
        (X_normalized, y_centered, x_mean, x_scale, y_mean)
    """
    target_mean = np.mean(y)
    centered_y = y - target_mean
    feature_means = np.mean(X, axis=0)
    feature_scales = np.sqrt(np.var(X, axis=0) * X.shape[0])
    # Zero-variance columns would divide by zero; leave them unscaled.
    feature_scales[feature_scales == 0] = 1
    standardized_X = (X - feature_means) / feature_scales
    return (standardized_X, centered_y, feature_means, feature_scales,
            target_mean)


@pytest.mark.parametrize("datatype", [np.float32, np.float64])
@pytest.mark.parametrize(
    "nrows", [unit_param(500), quality_param(5000), stress_param(90000)])
@pytest.mark.parametrize(
    "column_info",
    [
        unit_param([1, 1]),
        unit_param([20, 10]),
        quality_param([100, 50]),
        stress_param([1000, 500])
    ],
)
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize("precompute", [True, False, 'precompute'])
def test_lars_model(datatype, nrows, column_info, precompute, normalize):
    ncols, n_info = column_info
import pytest from cuml import TruncatedSVD as cuTSVD from cuml.testing.utils import get_handle from cuml.testing.utils import array_equal, unit_param, \ quality_param, stress_param from sklearn.datasets import make_blobs from sklearn.decomposition import TruncatedSVD as skTSVD from sklearn.utils import check_random_state @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('use_handle', [True, False]) @pytest.mark.parametrize( 'name', [unit_param(None), quality_param('random'), stress_param('blobs')]) def test_tsvd_fit(datatype, name, use_handle): if name == 'blobs': X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0) elif name == 'random': pytest.skip('fails when using random dataset ' 'used by sklearn for testing') shape = 5000, 100 rng = check_random_state(42) X = rng.randint(-100, 20, np.product(shape)).reshape(shape) else:
cluster_std=cluster_std, shuffle=False, random_state=0) cuml_kmeans = cuml.KMeans(init="k-means||", n_clusters=nclusters, random_state=random_state, output_type='numpy') preds = cuml_kmeans.fit_predict(X) assert adjusted_rand_score(cp.asnumpy(preds), cp.asnumpy(y)) >= 0.99 @pytest.mark.parametrize('name', dataset_names) @pytest.mark.parametrize('nrows', [unit_param(1000), quality_param(5000)]) def test_kmeans_sklearn_comparison(name, nrows, random_state): default_base = { 'quantile': .3, 'eps': .3, 'damping': .9, 'preference': -200, 'n_neighbors': 10, 'n_clusters': 3 } pat = get_pattern(name, nrows) params = default_base.copy() params.update(pat[1])
from cuml.dask.datasets import make_regression from cuml.dask.linear_model import ElasticNet from cuml.dask.linear_model import Lasso from cuml.metrics import r2_score from cuml.testing.utils import unit_param, quality_param, stress_param import numpy as np @pytest.mark.mg @pytest.mark.parametrize('dtype', [np.float32, np.float64]) @pytest.mark.parametrize('alpha', [0.001]) @pytest.mark.parametrize('algorithm', ['cyclic', 'random']) @pytest.mark.parametrize( 'nrows', [unit_param(50), quality_param(5000), stress_param(500000)]) @pytest.mark.parametrize('column_info', [ unit_param([20, 10]), quality_param([100, 50]), stress_param([1000, 500]) ]) @pytest.mark.parametrize( 'n_parts', [unit_param(4), quality_param(32), stress_param(64)]) @pytest.mark.parametrize("delayed", [True, False]) def test_lasso(dtype, alpha, algorithm, nrows, column_info, n_parts, delayed, client): ncols, n_info = column_info
# import pytest import numpy as np from cuml.testing.utils import array_equal, \ unit_param, stress_param import cupy as cp from cuml.dask.common.dask_arr_utils import to_dask_cudf @pytest.mark.mg @pytest.mark.parametrize( "data_info", [unit_param([1000, 20, 30]), stress_param([int(9e6), 5000, 30])]) @pytest.mark.parametrize("input_type", ["dataframe", "array"]) def test_pca_fit(data_info, input_type, client): nrows, ncols, n_parts = data_info if nrows == int(9e6) and pytest.max_gpu_memory < 48: if pytest.adapt_stress_test: nrows = nrows * pytest.max_gpu_memory // 256 ncols = ncols * pytest.max_gpu_memory // 256 else: pytest.skip("Insufficient GPU memory for this test." "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'") from cuml.dask.decomposition import TruncatedSVD as daskTPCA from sklearn.decomposition import TruncatedSVD