def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client=None):
    """Smoke-test the Dask TruncatedSVD ``fit_transform`` on float32 blobs.

    The test makes no assertions on the result; it passes when
    ``fit_transform`` completes without raising.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded positionally to ``cuml.dask.datasets.make_blobs`` as its
        first, second and fourth arguments (the third is the literal ``1``).
    client : optional
        An already-connected client. When ``None``, a ``LocalCUDACluster``
        with one thread per worker and a ``Client`` are created here and are
        always closed again before the function exits.
    """
    owns_cluster = client is None
    cluster = None
    try:
        if owns_cluster:
            cluster = LocalCUDACluster(threads_per_worker=1)
            client = Client(cluster)

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=1.5, verbose=False,
                               random_state=10, dtype=np.float32)
        wait(X_cudf)

        cutsvd = daskTPCA(n_components=20)
        cutsvd.fit_transform(X_cudf)
    finally:
        # Tear down only what this test created, and do it even when the
        # body above raised, so a failing run does not leak the cluster.
        if owns_cluster:
            if client is not None:
                client.close()
            if cluster is not None:
                cluster.close()
def test_pca_fit(data_info, input_type, client):
    """Compare the Dask TruncatedSVD fit against scikit-learn's TruncatedSVD.

    Parameters
    ----------
    data_info : tuple
        ``(nrows, ncols, n_parts)`` used to generate the blob dataset.
    input_type : str
        ``"dataframe"`` to fit on ``to_dask_cudf(X)`` or ``"array"`` to fit
        on the generated collection directly.
    client
        Not used in the body; presumably a fixture that keeps a Dask client
        alive for the duration of the test (confirm against conftest).

    Raises
    ------
    ValueError
        If ``input_type`` is neither ``"dataframe"`` nor ``"array"``.
    """
    nrows, ncols, n_parts = data_info

    # The 9e6-row case is the stress configuration: on GPUs reporting less
    # than 48 (units as defined by pytest.max_gpu_memory) either scale the
    # problem down or skip it.
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            # A space was missing between the two concatenated literals.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD
    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows, n_features=ncols, centers=1,
                      n_parts=n_parts, cluster_std=0.5,
                      random_state=10, dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())
    else:
        # Previously an unknown value surfaced later as a NameError.
        raise ValueError(
            "input_type must be 'dataframe' or 'array', got %r"
            % (input_type,))

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = [
        'singular_values_', 'components_',
        'explained_variance_', 'explained_variance_ratio_',
    ]

    for attr in all_attr:
        # Sign is ignored for 'components_' only.
        with_sign = attr != 'components_'

        cuml_res = getattr(cutsvd, attr)
        # The original tested `type(...) == np.ndarray` before calling
        # .to_numpy(), which ndarray does not have. Convert only objects
        # that are not already NumPy arrays and actually expose to_numpy().
        if (not isinstance(cuml_res, np.ndarray)
                and hasattr(cuml_res, "to_numpy")):
            cuml_res = cuml_res.to_numpy()

        skl_res = getattr(sktsvd, attr)

        # Singular values are compared with a looser tolerance.
        tol = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tol, with_sign=with_sign)
def test_pca_fit(data_info, input_type, cluster):
    """Compare the Dask TruncatedSVD fit against scikit-learn's TruncatedSVD.

    A ``Client`` is opened on ``cluster`` for data generation and fitting
    and is closed before the fitted attributes are compared.

    Parameters
    ----------
    data_info : tuple
        ``(nrows, ncols, n_parts)`` used to generate the blob dataset.
    input_type : str
        ``"dataframe"`` to fit on ``to_dask_cudf(X)`` or ``"array"`` to fit
        on the generated collection directly.
    cluster
        Cluster object handed to ``Client``.

    Raises
    ------
    ValueError
        If ``input_type`` is neither ``"dataframe"`` nor ``"array"``.
    """
    client = Client(cluster)
    nrows, ncols, n_parts = data_info

    try:
        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD
        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows, n_features=ncols, centers=1,
                          n_parts=n_parts, cluster_std=0.5,
                          random_state=10, dtype=np.float32)
        wait(X)

        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())
        else:
            # Previously an unknown value surfaced later as a NameError.
            raise ValueError(
                "input_type must be 'dataframe' or 'array', got %r"
                % (input_type,))

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_train)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X_cpu)
    finally:
        client.close()

    all_attr = [
        'singular_values_', 'components_',
        'explained_variance_', 'explained_variance_ratio_',
    ]

    for attr in all_attr:
        # Sign is ignored for 'components_' only.
        with_sign = attr != 'components_'

        cuml_res = getattr(cutsvd, attr)
        # The original tested `type(...) == np.ndarray` before calling
        # .as_matrix(), which ndarray does not have. Convert only objects
        # that are not already NumPy arrays and actually expose as_matrix().
        if (not isinstance(cuml_res, np.ndarray)
                and hasattr(cuml_res, "as_matrix")):
            cuml_res = cuml_res.as_matrix()

        skl_res = getattr(sktsvd, attr)

        # Singular values are compared with a looser tolerance.
        tol = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tol, with_sign=with_sign)
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Compare the Dask TruncatedSVD fit against scikit-learn's TruncatedSVD.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded positionally to ``cuml.dask.datasets.make_blobs`` as its
        first, second and fourth arguments (the third is the literal ``1``).
    client : optional
        An already-connected client. When ``None``, a ``LocalCUDACluster``
        with one thread per worker and a ``Client`` are created here and are
        always closed again before the fitted attributes are compared.
    """
    owns_cluster = client is None
    cluster = None
    try:
        if owns_cluster:
            cluster = LocalCUDACluster(threads_per_worker=1)
            client = Client(cluster)

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)
        wait(X_cudf)

        # Host copy of the same data for the scikit-learn reference model.
        X = X_cudf.compute().to_pandas().values

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_cudf)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X)
    finally:
        # Tear down only what this test created, and do it even when the
        # body above raised, so a failing run does not leak the cluster.
        if owns_cluster:
            if client is not None:
                client.close()
            if cluster is not None:
                cluster.close()

    all_attr = [
        'singular_values_', 'components_',
        'explained_variance_', 'explained_variance_ratio_',
    ]

    for attr in all_attr:
        # Sign is ignored for 'components_' only.
        with_sign = attr != 'components_'

        cuml_res = getattr(cutsvd, attr)
        # The original tested `type(...) == np.ndarray` before calling
        # .as_matrix(), which ndarray does not have. Convert only objects
        # that are not already NumPy arrays and actually expose as_matrix().
        if (not isinstance(cuml_res, np.ndarray)
                and hasattr(cuml_res, "as_matrix")):
            cuml_res = cuml_res.as_matrix()

        skl_res = getattr(sktsvd, attr)

        # Singular values are compared with a looser tolerance.
        tol = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tol, with_sign=with_sign)
def test_pca_fit_transform_fp32(data_info, client):
    """Smoke-test the Dask TruncatedSVD ``fit_transform`` on float32 blobs.

    ``data_info`` is unpacked as ``(nrows, ncols, n_parts)``. ``client`` is
    not referenced in the body; presumably it is a fixture that keeps a Dask
    client alive (confirm against conftest). The test makes no assertions:
    it passes when ``fit_transform`` returns without raising.
    """
    n_samples, n_features, partitions = data_info

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from cuml.dask.datasets import make_blobs

    blob_options = dict(
        n_samples=n_samples,
        n_features=n_features,
        centers=1,
        n_parts=partitions,
        cluster_std=1.5,
        random_state=10,
        dtype=np.float32,
    )
    features, _labels = make_blobs(**blob_options)

    # 20 components requested; the result is intentionally discarded.
    daskTPCA(n_components=20).fit_transform(features)
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, cluster):
    """Smoke-test the Dask TruncatedSVD ``fit_transform`` on float32 blobs.

    Opens a ``Client`` on ``cluster``, generates a single-centre blob
    dataset of ``nrows`` x ``ncols`` split into ``n_parts`` partitions,
    waits for it, and runs ``fit_transform`` with 20 components. The client
    is closed whether or not the body raises. No assertions are made.
    """
    client = Client(cluster)
    try:
        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from cuml.dask.datasets import make_blobs

        blob_options = dict(
            n_samples=nrows,
            n_features=ncols,
            centers=1,
            n_parts=n_parts,
            cluster_std=1.5,
            verbose=False,
            random_state=10,
            dtype=np.float32,
        )
        features, _labels = make_blobs(**blob_options)
        wait(features)

        # The transformed output is intentionally discarded.
        daskTPCA(n_components=20).fit_transform(features)
    finally:
        client.close()