def test_to_dask_df(dtype, nparts, cluster):
    """Check that ``to_dask_df`` keeps the shape and dtypes of its input.

    The blobs produced by ``make_blobs`` are converted with ``to_dask_df``;
    both the originals and the converted objects are computed locally and
    compared on ``shape`` and on their unique ``dtypes``.
    """
    dask_client = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs
        from cuml.dask.common.dask_df_utils import to_dask_df

        features, labels = make_blobs(1e3, 25, n_parts=nparts, dtype=dtype)

        features_df = to_dask_df(features)
        labels_df = to_dask_df(labels)

        # Converted objects are materialized first, then the sources.
        converted = (features_df.compute(), labels_df.compute())
        sources = (features.compute(), labels.compute())

        for before, after in zip(sources, converted):
            assert before.shape == after.shape

        for before, after in zip(sources, converted):
            assert before.dtypes.unique() == after.dtypes.unique()
    finally:
        dask_client.close()
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts,
                    cluster, order, output):
    """Validate partitioning, shape, dtype and label count of ``make_blobs``.

    The checks on the labels depend on ``output``: for ``'dataframe'`` the
    labels are expected as an ``(nrows, 1)`` frame, for ``'array'`` as a flat
    ``(nrows,)`` array.
    """
    dask_client = Client(cluster)
    try:
        blob_kwargs = dict(centers=centers,
                           cluster_std=cluster_std,
                           dtype=dtype,
                           n_parts=nparts,
                           output=output,
                           order=order)
        X, y = make_blobs(nrows, ncols, **blob_kwargs)

        # Both outputs must be split into the requested number of partitions.
        assert X.npartitions == nparts
        assert y.npartitions == nparts

        features = X.compute()
        targets = y.compute()

        assert features.shape == (nrows, ncols)

        if output == 'array':
            import cupy as cp

            assert len(cp.unique(targets)) == centers
            assert targets.dtype == dtype
            assert targets.shape == (nrows, )
        elif output == 'dataframe':
            assert len(targets[0].unique()) == centers
            assert features.dtypes.unique() == [dtype]
            assert targets.shape == (nrows, 1)
    finally:
        dask_client.close()
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts,
                    cluster, output):
    """Validate partitioning, shapes, dtype and label count of ``make_blobs``.

    Shapes are checked for every ``output``; the label-count and dtype checks
    are specific to the ``'dataframe'`` and ``'array'`` outputs.
    """
    dask_client = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(nrows, ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts,
                          output=output)

        for collection in (X, y):
            assert collection.npartitions == nparts

        X = X.compute()
        y = y.compute()

        assert X.shape == (nrows, ncols)
        assert y.shape == (nrows, 1)

        if output == 'array':
            import cupy as cp

            assert len(cp.unique(y)) == centers
            assert y.dtype == dtype
        elif output == 'dataframe':
            assert len(y[0].unique()) == centers
            assert X.dtypes.unique() == [dtype]
    finally:
        dask_client.close()
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client=None):
    """Smoke-test distributed PCA ``fit_transform`` on float32 blobs.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded to ``make_blobs`` to size and partition the input.
    client
        Optional Dask client. When ``None`` the test creates its own
        ``LocalCUDACluster`` and client and tears them down afterwards.
    """
    owns_cluster = client is None
    cluster = None
    try:
        if owns_cluster:
            cluster = LocalCUDACluster(threads_per_worker=1)
            client = Client(cluster)

        from cuml.dask.decomposition import PCA as daskPCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=1.5, verbose=False,
                               random_state=10, dtype=np.float32)
        wait(X_cudf)

        cupca = daskPCA(n_components=20, whiten=True)
        cupca.fit_transform(X_cudf)
    finally:
        # Release the resources this test created even when the body raises;
        # a client handed in by the caller is left untouched.
        if owns_cluster:
            if client is not None:
                client.close()
            if cluster is not None:
                cluster.close()
def test_getattr(client):
    """Exercise attribute access on Dask estimators before and after fit."""
    # Attribute stored directly on the distributed wrapper
    km = KMeans(client=client)

    # A fitted attribute must not be reachable before training
    with pytest.raises(AttributeError):
        km.cluster_centers_

    assert km.client is not None

    # Fitted attribute after training KMeans on a tiny blob data set
    blobs, _ = make_blobs(n_samples=5,
                          n_features=5,
                          centers=2,
                          n_parts=2,
                          cluster_std=0.01,
                          random_state=10)
    km.fit(blobs)

    assert km.cluster_centers_ is not None
    assert isinstance(km.cluster_centers_, cupy.ndarray)

    # Fitted attribute of a naive Bayes model trained on the text corpus
    corpus, targets = load_text_corpus(client)

    nb = MultinomialNB(client=client)
    nb.fit(corpus, targets)

    assert nb.feature_count_ is not None
    assert isinstance(nb.feature_count_, cupy.ndarray)
def test_score(nrows, ncols, nclusters, n_parts, input_type, client):
    """Compare the distributed KMeans score with the combined model's score.

    The model is fitted on blobs (optionally converted with
    ``to_dask_cudf``), and the score reported by the distributed model must
    be within ``1e-3`` of the score that the model returned by
    ``get_combined_model()`` gives on the computed training data.
    """
    from cuml.dask.datasets import make_blobs
    from cuml.dask.cluster import KMeans as cumlKMeans

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      shuffle=False,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
        y = y_train
    elif input_type == "array":
        X_train = X
        y_train = y

    model = cumlKMeans(init="k-means||",
                       n_clusters=nclusters,
                       random_state=10)
    model.fit(X_train)

    distributed_score = model.score(X_train)

    combined = model.get_combined_model()
    reference_score = combined.score(X_train.compute())

    assert abs(distributed_score - reference_score) < 1e-3
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts,
                    cluster):
    """Validate partitioning, shapes, label count and dtype of ``make_blobs``."""
    dask_client = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(nrows, ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts)

        for collection in (X, y):
            assert collection.npartitions == nparts

        features = X.compute()
        targets = y.compute()

        assert features.shape == (nrows, ncols)
        assert targets.shape == (nrows, 1)

        # One distinct label per requested center
        n_labels = len(targets[0].unique())
        assert n_labels == centers

        assert features.dtypes.unique() == [dtype]
    finally:
        dask_client.close()
def test_transform(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Check the shape of KMeans ``transform`` output and its argmin labels.

    Parameters
    ----------
    nrows, ncols, nclusters, n_parts
        Size, number of centers and partitioning of the generated blobs.
    input_type
        ``"dataframe"`` converts the blobs with ``to_dask_cudf`` before
        fitting; ``"array"`` uses the ``make_blobs`` output directly.
    cluster
        Cluster the Dask client connects to.
    """
    # Create the client before entering ``try`` so that a failed connection
    # surfaces as itself instead of an AttributeError from ``None.close()``.
    client = Client(cluster)
    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)
        y = y.astype('int64')

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            X_train, y_train = X, y
            labels = cp.squeeze(y_train.compute())

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)
        cumlModel.fit(X_train)

        xformed = cumlModel.transform(X_train).compute()
        if input_type == "dataframe":
            xformed = cp.array(xformed
                               if len(xformed.shape) == 1
                               else xformed.as_gpu_matrix())

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows,)]
        else:
            assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick manner of dealing with (nrows,) is not (nrows, 1)
        xformed_labels = cp.argmin(
            xformed.reshape((int(nrows), int(nclusters))), axis=1)

        # NOTE(review): this only asserts that the score is truthy
        # (non-zero), not that it equals 1.0 - confirm the intent.
        assert sk_adjusted_rand_score(cp.asnumpy(labels),
                                      cp.asnumpy(xformed_labels))
    finally:
        client.close()
def test_weighted_kmeans(nrows, ncols, nclusters, n_parts, client):
    """Check that heavily weighted samples end up as KMeans centroids.

    One sample per label receives a weight of ``5000.0`` while every other
    sample keeps a weight of ``0.00001``. After fitting with those weights,
    each heavy sample must lie within ``1.0`` (summed absolute difference) of
    its assigned center and every other sample more than ``1000.0`` away.

    Parameters
    ----------
    nrows, ncols, nclusters, n_parts
        Size, number of centers and partitioning of the generated blobs.
    client
        Dask client fixture; not referenced directly in the body.
    """
    from cuml.dask.cluster import KMeans as cumlKMeans
    from cuml.dask.datasets import make_blobs

    # Using fairly high variance between points in clusters
    cluster_std = 10000.0

    wt = cp.array([0.00001] * nrows)

    bound = nclusters * 100000

    # Open the space really large
    centers = cp.random.uniform(-bound, bound, size=(nclusters, ncols))

    X_cudf, y = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=centers,
                           n_parts=n_parts,
                           cluster_std=cluster_std,
                           shuffle=False,
                           verbose=False,
                           random_state=10)

    # Materialize the labels once; the original recomputed them on every
    # iteration of the loop below.
    y_local = cp.array(y.compute())

    # Choose one sample from each label and increase its weight
    for i in range(nclusters):
        wt[cp.argmax(y_local == i).item()] = 5000.0

    cumlModel = cumlKMeans(verbose=0, init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    chunk_parts = int(nrows / n_parts)
    sample_weights = da.from_array(wt, chunks=(chunk_parts, ))
    cumlModel.fit(X_cudf, sample_weight=sample_weights)

    X = X_cudf.compute()

    labels_ = cumlModel.predict(X_cudf).compute()
    cluster_centers_ = cumlModel.cluster_centers_

    for i in range(nrows):
        label = labels_[i]
        actual_center = cluster_centers_[label]

        diff = sum(abs(X[i] - actual_center))

        # The large weight should be the centroid
        if wt[i] > 1.0:
            assert diff < 1.0

        # Otherwise it should be pretty far away
        else:
            assert diff > 1000.0
def test_transform(nrows, ncols, nclusters, n_parts, cluster):
    """Check the shape of KMeans ``transform`` output and its argmin labels.

    Parameters
    ----------
    nrows, ncols, nclusters, n_parts
        Positional arguments forwarded to ``make_blobs``.
    cluster
        Cluster the Dask client connects to.
    """
    # Create the client before entering ``try`` so that a failed connection
    # surfaces as itself instead of an AttributeError from ``None.close()``.
    client = Client(cluster)
    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01, verbose=False,
                               shuffle=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)
        cumlModel.fit(X_cudf)

        labels = np.squeeze(y.compute().to_pandas().values)

        xformed = cumlModel.transform(X_cudf).compute()

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows, )]
        else:
            assert xformed.shape == (nrows, nclusters)

        xformed = cp.array(xformed
                           if len(xformed.shape) == 1
                           else xformed.as_gpu_matrix())

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick manner of dealing with (nrows,) is not (nrows, 1)
        xformed_labels = cp.argmin(
            xformed.reshape((int(nrows), int(nclusters))), axis=1)

        # NOTE(review): this only asserts that the score is truthy
        # (non-zero), not that it equals 1.0 - confirm the intent.
        assert adjusted_rand_score(labels,
                                   cp.squeeze(xformed_labels.get()))
    finally:
        client.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    cluster):
    """Fit and predict with distributed KMeans and expect a perfect score.

    Parameters
    ----------
    nrows, ncols, nclusters, n_parts
        Positional arguments forwarded to ``make_blobs``.
    delayed_predict
        Passed as the second positional argument of ``predict``.
    cluster
        Cluster the Dask client connects to.
    """
    # Create the client before entering ``try`` so that a failed connection
    # surfaces as itself instead of an AttributeError from ``None.close()``.
    client = Client(cluster)
    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01, verbose=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)
        cumlModel.fit(X_cudf)

        cumlLabels = cumlModel.predict(X_cudf, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        cumlPred = cp.array(cumlLabels.compute())

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = np.squeeze(y.compute().to_pandas().values)

        score = adjusted_rand_score(labels, cp.squeeze(cumlPred.get()))

        print(str(score))

        assert 1.0 == score
    finally:
        client.close()
def test_score(nrows, ncols, nclusters, n_parts, cluster):
    """Compare the KMeans score with a manually accumulated one.

    The expected score is the sum of squared distances between each sample
    and the center of its predicted label; its negation must be within
    ``SCORE_EPS`` of the value returned by ``score``.

    Parameters
    ----------
    nrows, ncols, nclusters, n_parts
        Positional arguments forwarded to ``make_blobs``.
    cluster
        Cluster the Dask client connects to.
    """
    # Create the client before entering ``try`` so that a failed connection
    # surfaces as itself instead of an AttributeError from ``None.close()``.
    client = Client(cluster)
    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01, verbose=False,
                               shuffle=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0, init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)
        cumlModel.fit(X_cudf)

        actual_score = cumlModel.score(X_cudf)

        X = cp.array(X_cudf.compute().as_gpu_matrix())

        predictions = cumlModel.predict(X_cudf).compute()
        predictions = cp.array(predictions)

        centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())

        expected_score = 0
        for idx, label in enumerate(predictions):
            sample = X[idx]
            # Named ``center`` so the loop no longer rebinds the ``y``
            # returned by make_blobs.
            center = centers[label]

            dist = np.sqrt(np.sum((sample - center)**2))
            expected_score += dist**2

        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS
    finally:
        client.close()
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):
    """Fit and predict with distributed KMeans and expect a perfect score.

    The predictions must be split into ``n_parts`` pieces (or one per worker
    when ``n_parts`` is ``None``), cover labels ``0`` to ``nclusters - 1``
    and reach an adjusted Rand score of exactly ``1.0``.
    """
    from cuml.dask.datasets import make_blobs
    from cuml.dask.cluster import KMeans as cumlKMeans

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
    elif input_type == "array":
        X_train = X
        y_train = y

    model = cumlKMeans(init="k-means||",
                       n_clusters=nclusters,
                       random_state=10)
    model.fit(X_train)

    predicted = model.predict(X_train, delayed=delayed_predict)

    worker_keys = list(client.has_what().keys())
    n_workers = len(worker_keys)

    # Verifying we are grouping partitions. This should be changed soon.
    parts_len = n_workers if n_parts is None else n_parts

    if input_type == "dataframe":
        assert predicted.npartitions == parts_len
        cumlPred = predicted.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        assert len(predicted.chunks[0]) == parts_len
        cumlPred = cp.array(predicted.compute())
        labels = cp.squeeze(y_train.compute())

    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)

    print(str(score))

    assert 1.0 == score
def test_pca_fit(nrows, ncols, n_parts, input_type, cluster):
    """Compare fitted attributes of distributed PCA against scikit-learn.

    Parameters
    ----------
    nrows, ncols, n_parts
        Size and partitioning of the generated blobs.
    input_type
        ``"dataframe"`` converts the blobs with ``to_dask_cudf`` before
        fitting; ``"array"`` uses the ``make_blobs`` output directly.
    cluster
        Cluster the Dask client connects to.
    """
    client = Client(cluster)
    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10, dtype=np.float32)

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        # Let a fit failure propagate: printing and continuing only hid the
        # real error behind a later NameError/AttributeError.
        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_train)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X_cpu)

        from cuml.test.utils import array_equal

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']

        for attr in all_attr:
            # The sign of the components is not compared.
            with_sign = attr != 'components_'
            # The former ``np.ndarray`` -> ``.as_matrix()`` conversion was
            # dropped: ndarray has no such method, so that branch could only
            # raise AttributeError.
            cuml_res = getattr(cupca, attr)
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
    finally:
        client.close()
def test_pca_fit(data_info, input_type, cluster):
    """Compare fitted attributes of distributed TruncatedSVD with sklearn.

    Parameters
    ----------
    data_info
        ``(nrows, ncols, n_parts)`` tuple describing the generated blobs.
    input_type
        ``"dataframe"`` converts the blobs with ``to_dask_cudf`` before
        fitting; ``"array"`` uses the ``make_blobs`` output directly.
    cluster
        Cluster the Dask client connects to.
    """
    client = Client(cluster)
    nrows, ncols, n_parts = data_info
    try:
        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10, dtype=np.float32)

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_train)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X_cpu)

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']
    finally:
        client.close()

    # As in the original, the comparisons run after the client is closed.
    for attr in all_attr:
        # The sign of the components is not compared.
        with_sign = attr != 'components_'
        # The former ``np.ndarray`` -> ``.as_matrix()`` conversion was
        # dropped: ndarray has no such method, so that branch could only
        # raise AttributeError.
        cuml_res = getattr(cutsvd, attr)
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            # Singular values are compared with a looser tolerance.
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_pca_fit(data_info, input_type, client):
    """Compare fitted attributes of distributed TruncatedSVD with sklearn.

    For the ``9e6``-row case, when ``pytest.max_gpu_memory`` is below 48 the
    data is either scaled down (``pytest.adapt_stress_test`` set) or the test
    is skipped.

    Parameters
    ----------
    data_info
        ``(nrows, ncols, n_parts)`` tuple describing the generated blobs.
    input_type
        ``"dataframe"`` converts the blobs with ``to_dask_cudf`` before
        fitting; ``"array"`` uses the ``make_blobs`` output directly.
    client
        Dask client fixture; not referenced directly in the body.
    """
    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            # A space was added between the two sentences; the adjacent
            # literals previously concatenated to "...test.Re-run...".
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD

    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows,
                      n_features=ncols,
                      centers=1,
                      n_parts=n_parts,
                      cluster_std=0.5,
                      random_state=10, dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = ['singular_values_', 'components_',
                'explained_variance_', 'explained_variance_ratio_']

    for attr in all_attr:
        # The sign of the components is not compared.
        with_sign = attr != 'components_'
        # The former ``np.ndarray`` -> ``.to_numpy()`` conversion was
        # dropped: ndarray has no such method, so that branch could only
        # raise AttributeError.
        cuml_res = getattr(cutsvd, attr)
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            # Singular values are compared with a looser tolerance.
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Compare fitted attributes of distributed TruncatedSVD with sklearn.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded to ``make_blobs`` to size and partition the input.
    client
        Optional Dask client. When ``None`` the test creates its own
        ``LocalCUDACluster`` and client and tears them down afterwards.
    """
    owns_cluster = client is None
    cluster = None
    try:
        if owns_cluster:
            cluster = LocalCUDACluster(threads_per_worker=1)
            client = Client(cluster)

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)

        wait(X_cudf)

        X = X_cudf.compute().to_pandas().values

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_cudf)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X)

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']
    finally:
        # Release the resources this test created even when the body raises;
        # a client handed in by the caller is left untouched.
        if owns_cluster:
            if client is not None:
                client.close()
            if cluster is not None:
                cluster.close()

    # As in the original, the comparisons run after the teardown.
    for attr in all_attr:
        # The sign of the components is not compared.
        with_sign = attr != 'components_'
        # The former ``np.ndarray`` -> ``.as_matrix()`` conversion was
        # dropped: ndarray has no such method, so that branch could only
        # raise AttributeError.
        cuml_res = getattr(cutsvd, attr)
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            # Singular values are compared with a looser tolerance.
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Compare fitted attributes of distributed PCA against scikit-learn.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded to ``make_blobs`` to size and partition the input.
    client
        Optional Dask client. When ``None`` the test creates its own
        ``LocalCUDACluster`` and client and tears them down afterwards.
    """
    owns_cluster = client is None
    cluster = None
    try:
        if owns_cluster:
            cluster = LocalCUDACluster(threads_per_worker=1)
            client = Client(cluster)

        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)

        wait(X_cudf)

        X = X_cudf.compute().to_pandas().values

        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_cudf)

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X)

        from cuml.test.utils import array_equal

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']
    finally:
        # Release the resources this test created even when the body raises;
        # a client handed in by the caller is left untouched.
        if owns_cluster:
            if client is not None:
                client.close()
            if cluster is not None:
                cluster.close()

    # As in the original, the comparisons run after the teardown.
    for attr in all_attr:
        # The sign of the components is not compared.
        with_sign = attr != 'components_'
        # The former ``np.ndarray`` -> ``.as_matrix()`` conversion was
        # dropped: ndarray has no such method, so that branch could only
        # raise AttributeError.
        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):
    """Fit and predict with distributed KMeans and expect a perfect score.

    The predictions must be grouped into ``n_parts`` partitions when that is
    smaller than the number of workers (otherwise one per worker), cover
    labels ``0`` to ``nclusters - 1`` and reach an adjusted Rand score of
    exactly ``1.0``.
    """
    client = Client(cluster)
    try:
        from sklearn.metrics import adjusted_rand_score
        from cuml.dask.datasets import make_blobs
        from cuml.dask.cluster import KMeans as cumlKMeans

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01,
                               verbose=True,
                               random_state=10)
        wait(X_cudf)

        model = cumlKMeans(verbose=1,
                           init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)
        model.fit(X_cudf)
        predicted = model.predict(X_cudf)

        worker_keys = list(client.has_what().keys())
        n_workers = len(worker_keys)

        # Verifying we are grouping partitions. This should be changed soon.
        grouped_by_parts = n_parts is not None and n_parts < n_workers
        expected_partitions = n_parts if grouped_by_parts else n_workers
        assert predicted.npartitions == expected_partitions

        cumlPred = predicted.compute().to_pandas().values

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = y.compute().to_pandas().values
        flat_labels = labels.reshape(labels.shape[0])

        score = adjusted_rand_score(flat_labels, cumlPred)

        assert 1.0 == score
    finally:
        client.close()
def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client):
    """Run PCA ``fit_transform`` without ``n_components`` (MG scenario).

    The computed result is expected to have ``nrows`` rows and 20 columns.
    """
    from cuml.dask.datasets import make_blobs
    from cuml.dask.decomposition import PCA as daskPCA

    blobs, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=1.5,
                          random_state=10,
                          dtype=np.float32)

    pca = daskPCA(whiten=False)
    transformed = pca.fit_transform(blobs).compute()

    assert transformed.shape[0] == nrows
    assert transformed.shape[1] == 20
def test_pca_fit_transform_fp32(data_info, client):
    """Smoke-test distributed TruncatedSVD ``fit_transform`` on float32 data.

    ``data_info`` is unpacked into ``(nrows, ncols, n_parts)``. Nothing is
    asserted; the test passes when ``fit_transform`` does not raise.
    """
    from cuml.dask.datasets import make_blobs
    from cuml.dask.decomposition import TruncatedSVD as daskTPCA

    n_samples, n_features, n_parts = data_info

    blobs, _ = make_blobs(n_samples=n_samples,
                          n_features=n_features,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=1.5,
                          random_state=10,
                          dtype=np.float32)

    tsvd = daskTPCA(n_components=20)
    tsvd.fit_transform(blobs)
def test_pca_fit_transform_fp64(nrows, ncols, n_parts, client):
    """Run distributed PCA ``fit_transform`` on float64 blobs.

    The computed result is expected to have ``nrows`` rows and 30 columns.
    """
    from cuml.dask.datasets import make_blobs
    from cuml.dask.decomposition import PCA as daskPCA

    blobs, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=1.5,
                          random_state=10,
                          dtype=np.float64)

    pca = daskPCA(n_components=30, whiten=False)
    transformed = pca.fit_transform(blobs).compute()

    assert transformed.shape[0] == nrows
    assert transformed.shape[1] == 30
def test_pca_fit(nrows, ncols, n_parts, cluster):
    """Compare fitted attributes of distributed PCA against scikit-learn.

    Parameters
    ----------
    nrows, ncols, n_parts
        Positional arguments forwarded to ``make_blobs``.
    cluster
        Cluster the Dask client connects to.
    """
    client = Client(cluster)
    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)

        wait(X_cudf)

        print(str(X_cudf.head(3)))

        # Let a fit failure propagate: printing and continuing only hid the
        # real error behind a later NameError/AttributeError.
        cupca = daskPCA(n_components=5, whiten=True)
        cupca.fit(X_cudf)

        X = X_cudf.compute().to_pandas().values

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X)

        from cuml.test.utils import array_equal

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']

        for attr in all_attr:
            # The sign of the components is not compared.
            with_sign = attr != 'components_'
            # The former ``np.ndarray`` -> ``.as_matrix()`` conversion was
            # dropped: ndarray has no such method, so that branch could only
            # raise AttributeError.
            cuml_res = getattr(cupca, attr)
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
    finally:
        client.close()
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, cluster):
    """Smoke-test distributed PCA ``fit_transform`` on float32 blobs.

    Nothing is asserted; the test passes when ``fit_transform`` does not
    raise. The client is closed afterwards in every case.
    """
    client = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs
        from cuml.dask.decomposition import PCA as daskPCA

        blobs, _ = make_blobs(nrows, ncols, 1, n_parts,
                              cluster_std=1.5,
                              verbose=False,
                              random_state=10,
                              dtype=np.float32)
        wait(blobs)

        pca = daskPCA(n_components=20, whiten=True)
        pca.fit_transform(blobs)
    finally:
        client.close()
def test_transform(nrows, ncols, nclusters, n_parts, cluster):
    """Check the shape of KMeans ``transform`` output and its argmin labels.

    The final assertion only requires the adjusted Rand score between the
    true labels and the argmin labels to be truthy.
    """
    client = Client(cluster)
    try:
        from sklearn.metrics import adjusted_rand_score
        from cuml.dask.datasets import make_blobs
        from cuml.dask.cluster import KMeans as cumlKMeans

        X_cudf, y = make_blobs(nrows, ncols, nclusters, n_parts,
                               cluster_std=0.01,
                               verbose=True,
                               random_state=10)
        wait(X_cudf)

        model = cumlKMeans(verbose=0,
                           init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)
        model.fit(X_cudf)

        raw_labels = y.compute().to_pandas().values
        labels = raw_labels.reshape(raw_labels.shape[0])

        distances = model.transform(X_cudf).compute()

        assert distances.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        distances_host = distances.to_pandas().to_numpy()
        xformed_labels = np.argmin(distances_host, axis=1)

        assert adjusted_rand_score(labels, xformed_labels)
    finally:
        client.close()
def test_getattr(cluster):
    """Exercise attribute access on Dask estimators before and after fit.

    Parameters
    ----------
    cluster
        Cluster the Dask client connects to; the client is closed at the end.
    """
    client = Client(cluster)

    try:
        # Test getattr on local param
        kmeans_model = KMeans(client=client)

        assert kmeans_model.client is not None

        # Test getattr on local_model param with a non-distributed model
        X, y = make_blobs(n_samples=5,
                          n_features=5,
                          centers=2,
                          n_parts=2,
                          cluster_std=0.01,
                          random_state=10)

        wait(X)

        kmeans_model.fit(X)

        assert kmeans_model.cluster_centers_ is not None
        # ``cupy.ndarray`` is the public name of the array class;
        # ``cupy.core`` is an internal module path.
        assert isinstance(kmeans_model.cluster_centers_, cupy.ndarray)

        # Test getattr on trained distributed model
        X, y = load_text_corpus(client)

        print(str(X.compute()))

        nb_model = MultinomialNB(client=client)
        nb_model.fit(X, y)

        assert nb_model.feature_count_ is not None
        assert isinstance(nb_model.feature_count_, cupy.ndarray)
    finally:
        client.close()
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, cluster):
    """Smoke-test distributed TruncatedSVD ``fit_transform`` on float32 data.

    Nothing is asserted; the test passes when ``fit_transform`` does not
    raise. The client is closed afterwards in every case.
    """
    client = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs
        from cuml.dask.decomposition import TruncatedSVD as daskTPCA

        blobs, _ = make_blobs(n_samples=nrows,
                              n_features=ncols,
                              centers=1,
                              n_parts=n_parts,
                              cluster_std=1.5,
                              verbose=False,
                              random_state=10,
                              dtype=np.float32)
        wait(blobs)

        tsvd = daskTPCA(n_components=20)
        tsvd.fit_transform(blobs)
    finally:
        client.close()
def test_pca_fit_transform_fp64(nrows, ncols, n_parts, cluster):
    """Smoke-test distributed PCA ``fit_transform`` on float64 blobs.

    Nothing is asserted; the test passes when ``fit_transform`` does not
    raise. The client is closed afterwards in every case.
    """
    client = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs
        from cuml.dask.decomposition import PCA as daskPCA

        blobs, _ = make_blobs(n_samples=nrows,
                              n_features=ncols,
                              centers=1,
                              n_parts=n_parts,
                              cluster_std=1.5,
                              random_state=10,
                              dtype=np.float64)
        wait(blobs)

        pca = daskPCA(n_components=30, whiten=False)
        pca.fit_transform(blobs)
    finally:
        client.close()
def test_getattr(cluster):
    """Exercise attribute access on Dask estimators before and after fit.

    Parameters
    ----------
    cluster
        Cluster the Dask client connects to; the client is closed at the end.
    """
    client = Client(cluster)

    # The original never closed the client; release it even on failure.
    try:
        # Test getattr on local param
        kmeans_model = KMeans(client=client)

        assert kmeans_model.client is not None

        # Test getattr on local_model param with a non-distributed model
        X_cudf, y = make_blobs(5, 5, 2, 2,
                               cluster_std=0.01, verbose=False,
                               random_state=10)

        wait(X_cudf)

        kmeans_model.fit(X_cudf)

        assert kmeans_model.cluster_centers_ is not None
        assert isinstance(kmeans_model.cluster_centers_, cudf.DataFrame)

        # Test getattr on trained distributed model
        X, y = load_text_corpus(client)

        print(str(X.compute()))

        nb_model = MultinomialNB(client=client)
        nb_model.fit(X, y)

        assert nb_model.feature_count_ is not None
        # ``cupy.ndarray`` is the public name of the array class;
        # ``cupy.core`` is an internal module path.
        assert isinstance(nb_model.feature_count_, cupy.ndarray)
    finally:
        client.close()
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Compare the KMeans score with a manually accumulated one.

    The expected score is the sum of squared distances between each sample
    and the center of its predicted label; its negation must be within
    ``SCORE_EPS`` of the value returned by ``score``.

    Parameters
    ----------
    nrows, ncols, nclusters, n_parts
        Size, number of centers and partitioning of the generated blobs.
    input_type
        ``"dataframe"`` converts the blobs with ``to_dask_cudf`` before
        fitting; ``"array"`` uses the ``make_blobs`` output directly.
    cluster
        Cluster the Dask client connects to.
    """
    # Create the client before entering ``try`` so that a failed connection
    # surfaces as itself instead of an AttributeError from ``None.close()``.
    client = Client(cluster)
    try:
        from cuml.dask.cluster import KMeans as cumlKMeans
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)

        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            y = y_train
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)
        cumlModel.fit(X_train)

        actual_score = cumlModel.score(X_train)

        predictions = cumlModel.predict(X_train).compute()

        if input_type == "dataframe":
            X = cp.array(X_train.compute().as_gpu_matrix())
            predictions = cp.array(predictions)
            centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())
        elif input_type == "array":
            X = X_train.compute()
            centers = cumlModel.cluster_centers_

        expected_score = 0
        for idx, label in enumerate(predictions):
            sample = X[idx]
            # Named ``center`` so the loop no longer rebinds the labels
            # held in ``y``.
            center = centers[label]

            dist = cp.sqrt(cp.sum((sample - center)**2))
            expected_score += dist**2

        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS
    finally:
        client.close()