Exemple #1
0
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client=None):
    """Run ``fit_transform`` of the Dask TruncatedSVD on float32 blobs.

    The test has no assertions; it passes when the calls complete without
    raising.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded positionally to ``make_blobs`` as its first, second and
        fourth arguments.
    client : optional
        Existing client to run against. When ``None``, a
        ``LocalCUDACluster`` (one thread per worker) and a ``Client`` bound
        to it are created here and closed again before returning, even when
        the test body raises.
    """
    owns_cluster = client is None
    cluster = None
    if owns_cluster:
        cluster = LocalCUDACluster(threads_per_worker=1)

    try:
        if owns_cluster:
            # Created inside the try so that a failing Client() constructor
            # still gets the cluster shut down below.
            client = Client(cluster)

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows,
                               ncols,
                               1,
                               n_parts,
                               cluster_std=1.5,
                               verbose=False,
                               random_state=10,
                               dtype=np.float32)

        wait(X_cudf)

        cutsvd = daskTPCA(n_components=20)
        cutsvd.fit_transform(X_cudf)

    finally:
        # Only tear down resources this function created itself.
        if owns_cluster:
            try:
                if client is not None:
                    client.close()
            finally:
                cluster.close()
Exemple #2
0
def test_pca_fit(data_info, input_type, client):
    """Compare fitted Dask TruncatedSVD attributes with scikit-learn's.

    Parameters
    ----------
    data_info : tuple
        ``(nrows, ncols, n_parts)`` used to generate the blob dataset.
    input_type : str
        ``"dataframe"`` converts the generated data with ``to_dask_cudf``;
        ``"array"`` uses the output of ``make_blobs`` directly.
    client
        Not referenced in the body; presumably a fixture supplying the
        active Dask client -- confirm against the test configuration.

    Raises
    ------
    ValueError
        If ``input_type`` is neither ``"dataframe"`` nor ``"array"``.
    """
    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            # The two literals are concatenated, so the separating space
            # has to be part of the first one.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD

    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows,
                      n_features=ncols,
                      centers=1,
                      n_parts=n_parts,
                      cluster_std=0.5,
                      random_state=10,
                      dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())
    else:
        # Fail with a clear message instead of a NameError on X_train.
        raise ValueError("Unsupported input_type: %r" % (input_type,))

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    for attr in all_attr:
        # The sign is ignored only when comparing the components.
        with_sign = attr != 'components_'
        cuml_res = getattr(cutsvd, attr)
        # numpy arrays have no ``to_numpy``; convert only objects that are
        # not already numpy arrays and actually provide the method.
        if (not isinstance(cuml_res, np.ndarray)
                and hasattr(cuml_res, 'to_numpy')):
            cuml_res = cuml_res.to_numpy()
        skl_res = getattr(sktsvd, attr)
        # Singular values are compared with a looser tolerance argument.
        tol = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tol, with_sign=with_sign)
0
def test_pca_fit(data_info, input_type, cluster):
    """Compare fitted Dask TruncatedSVD attributes with scikit-learn's.

    A ``Client`` is opened on ``cluster`` for the data generation and the
    two fits, and is closed again before the attributes are compared.

    Parameters
    ----------
    data_info : tuple
        ``(nrows, ncols, n_parts)`` used to generate the blob dataset.
    input_type : str
        ``"dataframe"`` converts the generated data with ``to_dask_cudf``;
        ``"array"`` uses the output of ``make_blobs`` directly.
    cluster
        Cluster object handed to ``Client``.

    Raises
    ------
    ValueError
        If ``input_type`` is neither ``"dataframe"`` nor ``"array"``.
    """
    client = Client(cluster)
    nrows, ncols, n_parts = data_info

    try:

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10,
                          dtype=np.float32)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())
        else:
            # Fail with a clear message instead of a NameError on X_train.
            raise ValueError("Unsupported input_type: %r" % (input_type,))

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_train)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X_cpu)

    finally:
        client.close()

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    for attr in all_attr:
        # The sign is ignored only when comparing the components.
        with_sign = attr != 'components_'
        cuml_res = getattr(cutsvd, attr)
        # numpy arrays have no ``as_matrix``; convert only objects that are
        # not already numpy arrays and actually provide the method.
        if (not isinstance(cuml_res, np.ndarray)
                and hasattr(cuml_res, 'as_matrix')):
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(sktsvd, attr)
        # Singular values are compared with a looser tolerance argument.
        tol = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tol, with_sign=with_sign)
Exemple #4
0
def test_pca_fit(nrows, ncols, n_parts, client=None):
    """Compare fitted Dask TruncatedSVD attributes with scikit-learn's.

    Parameters
    ----------
    nrows, ncols, n_parts
        Forwarded positionally to ``make_blobs`` as its first, second and
        fourth arguments.
    client : optional
        Existing client to run against. When ``None``, a
        ``LocalCUDACluster`` (one thread per worker) and a ``Client`` bound
        to it are created here and closed again once both models are
        fitted, even when fitting raises.
    """
    owns_cluster = client is None
    cluster = None
    if owns_cluster:
        cluster = LocalCUDACluster(threads_per_worker=1)

    try:
        if owns_cluster:
            # Created inside the try so that a failing Client() constructor
            # still gets the cluster shut down below.
            client = Client(cluster)

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows,
                               ncols,
                               1,
                               n_parts,
                               cluster_std=0.5,
                               verbose=False,
                               random_state=10,
                               dtype=np.float32)

        wait(X_cudf)

        # Host copy of the data for the scikit-learn reference fit.
        X = X_cudf.compute().to_pandas().values

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_cudf)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X)

    finally:
        # Only tear down resources this function created itself.
        if owns_cluster:
            try:
                if client is not None:
                    client.close()
            finally:
                cluster.close()

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    for attr in all_attr:
        # The sign is ignored only when comparing the components.
        with_sign = attr != 'components_'
        cuml_res = getattr(cutsvd, attr)
        # numpy arrays have no ``as_matrix``; convert only objects that are
        # not already numpy arrays and actually provide the method.
        if (not isinstance(cuml_res, np.ndarray)
                and hasattr(cuml_res, 'as_matrix')):
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(sktsvd, attr)
        # Singular values are compared with a looser tolerance argument.
        tol = 1 if attr == 'singular_values_' else 1e-1
        assert array_equal(cuml_res, skl_res, tol, with_sign=with_sign)
Exemple #5
0
def test_pca_fit_transform_fp32(data_info, client):
    """Run ``fit_transform`` of the Dask TruncatedSVD on float32 blobs.

    The test has no assertions; it passes when the calls complete without
    raising.

    Parameters
    ----------
    data_info : tuple
        ``(nrows, ncols, n_parts)`` used to generate the blob dataset.
    client
        Not referenced in the body; presumably a fixture supplying the
        active Dask client -- confirm against the test configuration.
    """
    n_samples, n_features, n_partitions = data_info
    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from cuml.dask.datasets import make_blobs

    # Single-centre float32 blobs with a fixed seed.
    blob_options = dict(
        n_samples=n_samples,
        n_features=n_features,
        centers=1,
        n_parts=n_partitions,
        cluster_std=1.5,
        random_state=10,
        dtype=np.float32,
    )
    blobs, _labels = make_blobs(**blob_options)

    model = daskTPCA(n_components=20)
    model.fit_transform(blobs)
Exemple #6
0
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, cluster):
    """Run ``fit_transform`` of the Dask TruncatedSVD on float32 blobs.

    A ``Client`` is opened on ``cluster`` for the duration of the test and
    closed in all cases. The test has no assertions; it passes when the
    calls complete without raising.

    Parameters
    ----------
    nrows, ncols, n_parts
        Passed to ``make_blobs`` as ``n_samples``, ``n_features`` and
        ``n_parts``.
    cluster
        Cluster object handed to ``Client``.
    """
    dask_client = Client(cluster)

    try:
        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from cuml.dask.datasets import make_blobs

        # Single-centre float32 blobs with a fixed seed.
        blob_options = dict(
            n_samples=nrows,
            n_features=ncols,
            centers=1,
            n_parts=n_parts,
            cluster_std=1.5,
            verbose=False,
            random_state=10,
            dtype=np.float32,
        )
        blobs, _labels = make_blobs(**blob_options)

        wait(blobs)

        model = daskTPCA(n_components=20)
        model.fit_transform(blobs)

    finally:
        dask_client.close()