Example #1
def test_to_dask_df(dtype, nparts, cluster):

    c = Client(cluster)

    try:

        from cuml.dask.common.dask_df_utils import to_dask_df
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(1e3, 25, n_parts=nparts, dtype=dtype)

        X_df = to_dask_df(X)
        y_df = to_dask_df(y)

        X_df_local = X_df.compute()
        y_df_local = y_df.compute()

        X_local = X.compute()
        y_local = y.compute()

        assert X_local.shape == X_df_local.shape
        assert y_local.shape == y_df_local.shape

        assert X_local.dtypes.unique() == X_df_local.dtypes.unique()
        assert y_local.dtypes.unique() == y_df_local.dtypes.unique()

    finally:
        c.close()
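Note: these examples are test functions excerpted from cuML's Dask test suite, so they omit the module-level imports and the pytest fixtures (cluster, client, nrows, dtype, and so on) that supply their arguments. A minimal preamble they might assume is sketched below; the fixture bodies, the to_dask_cudf import location, and the SCORE_EPS value are assumptions rather than verbatim source.

import cupy
import cupy as cp                  # the snippets use both spellings
import cudf
import dask.array as da
import numpy as np
import pytest
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import adjusted_rand_score as sk_adjusted_rand_score
from cuml.dask.cluster import KMeans
from cuml.dask.naive_bayes import MultinomialNB
from cuml.dask.datasets import make_blobs
from cuml.test.utils import array_equal
from cuml.dask.common import to_dask_cudf   # assumed import location

SCORE_EPS = 1e-2                   # hypothetical tolerance constant


@pytest.fixture
def cluster():
    c = LocalCUDACluster(threads_per_worker=1)
    yield c
    c.close()


@pytest.fixture
def client(cluster):
    c = Client(cluster)
    yield c
    c.close()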
Example #2
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts, cluster,
                    order, output):

    c = Client(cluster)
    try:
        X, y = make_blobs(nrows,
                          ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts,
                          output=output,
                          order=order)

        assert X.npartitions == nparts
        assert y.npartitions == nparts

        X_local = X.compute()
        y_local = y.compute()

        assert X_local.shape == (nrows, ncols)

        if output == 'dataframe':
            assert len(y_local[0].unique()) == centers
            assert X_local.dtypes.unique() == [dtype]
            assert y_local.shape == (nrows, 1)

        elif output == 'array':
            import cupy as cp
            assert len(cp.unique(y_local)) == centers
            assert y_local.dtype == dtype
            assert y_local.shape == (nrows, )

    finally:
        c.close()
Example #3
def test_make_blobs(nrows, ncols, centers, cluster_std, dtype, nparts, cluster,
                    output):

    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(nrows,
                          ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts,
                          output=output)

        assert X.npartitions == nparts
        assert y.npartitions == nparts

        X = X.compute()
        y = y.compute()

        assert X.shape == (nrows, ncols)
        assert y.shape == (nrows, 1)

        if output == 'dataframe':
            assert len(y[0].unique()) == centers
            assert X.dtypes.unique() == [dtype]

        elif output == 'array':
            import cupy as cp
            assert len(cp.unique(y)) == centers
            assert y.dtype == dtype

    finally:
        c.close()
Example #4
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, client=None):

    owns_cluster = False
    if client is None:
        owns_cluster = True
        cluster = LocalCUDACluster(threads_per_worker=1)
        client = Client(cluster)

    from cuml.dask.decomposition import PCA as daskPCA
    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(nrows,
                           ncols,
                           1,
                           n_parts,
                           cluster_std=1.5,
                           verbose=False,
                           random_state=10,
                           dtype=np.float32)

    wait(X_cudf)

    cupca = daskPCA(n_components=20, whiten=True)
    cupca.fit_transform(X_cudf)

    if owns_cluster:
        client.close()
        cluster.close()
Example #5
def test_getattr(client):

    # Test getattr on local param
    kmeans_model = KMeans(client=client)

    # Test AttributeError
    with pytest.raises(AttributeError):
        kmeans_model.cluster_centers_

    assert kmeans_model.client is not None

    # Test getattr on local_model param with a non-distributed model

    X, y = make_blobs(n_samples=5,
                      n_features=5,
                      centers=2,
                      n_parts=2,
                      cluster_std=0.01,
                      random_state=10)

    kmeans_model.fit(X)

    assert kmeans_model.cluster_centers_ is not None
    assert isinstance(kmeans_model.cluster_centers_, cupy.ndarray)

    # Test getattr on trained distributed model

    X, y = load_text_corpus(client)

    nb_model = MultinomialNB(client=client)
    nb_model.fit(X, y)

    assert nb_model.feature_count_ is not None
    assert isinstance(nb_model.feature_count_, cupy.ndarray)
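
The load_text_corpus helper used above is a cuML test utility whose definition is not shown. A hypothetical stand-in with matching semantics, fabricating a distributed bag-of-words matrix instead of loading a real corpus, might look like this (the name, sizes, and chunking are illustrative; cp and da come from the preamble sketched after Example #1):

def load_text_corpus_stub(client, n_docs=100, vocab_size=50, n_classes=2):
    # Fabricate a GPU-backed count matrix and integer labels, then wrap
    # them as Dask arrays so MultinomialNB.fit can consume them.
    counts = cp.random.poisson(1.0, size=(n_docs, vocab_size)).astype(cp.float32)
    labels = cp.random.randint(0, n_classes, size=n_docs)
    X = da.from_array(counts, chunks=(n_docs // 2, vocab_size), asarray=False)
    y = da.from_array(labels, chunks=(n_docs // 2,), asarray=False)
    return X, y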
Example #6
def test_score(nrows, ncols, nclusters, n_parts, input_type, client):

    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      shuffle=False,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
        y = y_train
    elif input_type == "array":
        X_train, y_train = X, y

    cumlModel = cumlKMeans(init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    cumlModel.fit(X_train)

    actual_score = cumlModel.score(X_train)

    local_model = cumlModel.get_combined_model()
    expected_score = local_model.score(X_train.compute())

    assert abs(actual_score - expected_score) < 1e-3
Example #7
def test_make_blobs(nrows,
                    ncols,
                    centers,
                    cluster_std,
                    dtype,
                    nparts,
                    cluster):

    c = Client(cluster)
    try:
        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(nrows, ncols,
                          centers=centers,
                          cluster_std=cluster_std,
                          dtype=dtype,
                          n_parts=nparts)

        assert X.npartitions == nparts
        assert y.npartitions == nparts

        X = X.compute()
        y = y.compute()

        assert X.shape == (nrows, ncols)
        assert y.shape == (nrows, 1)

        assert len(y[0].unique()) == centers

        assert X.dtypes.unique() == [dtype]

    finally:
        c.close()
Example #8
def test_transform(nrows, ncols, nclusters, n_parts, input_type, cluster):

    client = None

    try:

        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)
        y = y.astype('int64')

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            X_train, y_train = X, y
            labels = cp.squeeze(y_train.compute())

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        xformed = cumlModel.transform(X_train).compute()
        if input_type == "dataframe":
            xformed = cp.array(xformed
                               if len(xformed.shape) == 1
                               else xformed.as_gpu_matrix())

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows,)]
        else:
            assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick way of reconciling (nrows,) with (nrows, 1)
        xformed_labels = cp.argmin(xformed.reshape((int(nrows),
                                                    int(nclusters))), axis=1)

        assert sk_adjusted_rand_score(cp.asnumpy(labels),
                                      cp.asnumpy(xformed_labels))

    finally:
        client.close()
Example #9
def test_weighted_kmeans(nrows, ncols, nclusters, n_parts, client):
    cluster_std = 10000.0
    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    # Using fairly high variance between points in clusters
    wt = cp.array([0.00001 for j in range(nrows)])

    bound = nclusters * 100000

    # Open the space really large
    centers = cp.random.uniform(-bound, bound, size=(nclusters, ncols))

    X_cudf, y = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=centers,
                           n_parts=n_parts,
                           cluster_std=cluster_std,
                           shuffle=False,
                           verbose=False,
                           random_state=10)

    # Choose one sample from each label and increase its weight
    for i in range(nclusters):
        wt[cp.argmax(cp.array(y.compute()) == i).item()] = 5000.0

    cumlModel = cumlKMeans(verbose=0,
                           init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    chunk_parts = int(nrows / n_parts)
    sample_weights = da.from_array(wt, chunks=(chunk_parts, ))
    cumlModel.fit(X_cudf, sample_weight=sample_weights)

    X = X_cudf.compute()

    labels_ = cumlModel.predict(X_cudf).compute()
    cluster_centers_ = cumlModel.cluster_centers_

    for i in range(nrows):

        label = labels_[i]
        actual_center = cluster_centers_[label]

        diff = sum(abs(X[i] - actual_center))

        # The large weight should be the centroid
        if wt[i] > 1.0:
            assert diff < 1.0

        # Otherwise it should be pretty far away
        else:
            assert diff > 1000.0
Example #10
def test_transform(nrows, ncols, nclusters, n_parts, cluster):

    client = None

    try:

        client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               shuffle=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)

        labels = np.squeeze(y.compute().to_pandas().values)

        xformed = cumlModel.transform(X_cudf).compute()

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows, )]
        else:
            assert xformed.shape == (nrows, nclusters)

        xformed = cp.array(xformed if len(xformed.shape) == 1
                           else xformed.as_gpu_matrix())

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick way of reconciling (nrows,) with (nrows, 1)
        xformed_labels = cp.argmin(
            xformed.reshape((int(nrows), int(nclusters))), axis=1)

        assert adjusted_rand_score(labels, cp.squeeze(xformed_labels.get()))

    finally:
        client.close()
Example #11
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    cluster):

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf, delayed=delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        cumlPred = cp.array(cumlLabels.compute())

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = np.squeeze(y.compute().to_pandas().values)

        score = adjusted_rand_score(labels, cp.squeeze(cumlPred.get()))

        print(str(score))

        assert 1.0 == score

    finally:
        client.close()
Example #12
def test_score(nrows, ncols, nclusters, n_parts, cluster):

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               shuffle=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)

        actual_score = cumlModel.score(X_cudf)

        X = cp.array(X_cudf.compute().as_gpu_matrix())

        predictions = cumlModel.predict(X_cudf).compute()
        predictions = cp.array(predictions)

        centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())

        expected_score = 0
        for idx, label in enumerate(predictions):

            x = X[idx]
            y = centers[label]

            dist = np.sqrt(np.sum((x - y)**2))
            expected_score += dist**2

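        # cuml's KMeans.score returns negative inertia (as in scikit-learn),
        # hence expected_score is negated in the comparison below.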
        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        client.close()
Example #13
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):

    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
    elif input_type == "array":
        X_train, y_train = X, y

    cumlModel = cumlKMeans(init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    cumlModel.fit(X_train)
    cumlLabels = cumlModel.predict(X_train, delayed=delayed_predict)

    n_workers = len(list(client.has_what().keys()))

    # Verifying we are grouping partitions. This should be changed soon.
    if n_parts is not None:
        parts_len = n_parts
    else:
        parts_len = n_workers

    if input_type == "dataframe":
        assert cumlLabels.npartitions == parts_len
        cumlPred = cumlLabels.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        assert len(cumlLabels.chunks[0]) == parts_len
        cumlPred = cp.array(cumlLabels.compute())
        labels = cp.squeeze(y_train.compute())

    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)

    print(str(score))

    assert 1.0 == score
Example #14
def test_pca_fit(nrows, ncols, n_parts, input_type, cluster):

    client = Client(cluster)

    try:

        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10,
                          dtype=np.float32)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        try:

            cupca = daskPCA(n_components=5, whiten=True)
            cupca.fit(X_train)
        except Exception as e:
            print(str(e))

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X_cpu)

        from cuml.test.utils import array_equal

        all_attr = [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
        ]

        for attr in all_attr:
            with_sign = False if attr in ['components_'] else True
            cuml_res = (getattr(cupca, attr))
            if type(cuml_res) == np.ndarray:
                cuml_res = cuml_res.as_matrix()
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
    finally:
        client.close()
Example #15
def test_pca_fit(data_info, input_type, cluster):

    client = Client(cluster)
    nrows, ncols, n_parts = data_info

    try:

        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from sklearn.decomposition import TruncatedSVD

        from cuml.dask.datasets import make_blobs

        X, _ = make_blobs(n_samples=nrows,
                          n_features=ncols,
                          centers=1,
                          n_parts=n_parts,
                          cluster_std=0.5,
                          random_state=10,
                          dtype=np.float32)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            X_cpu = X_train.compute().to_pandas().values
        elif input_type == "array":
            X_train = X
            X_cpu = cp.asnumpy(X_train.compute())

        cutsvd = daskTPCA(n_components=5)
        cutsvd.fit(X_train)

        sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
        sktsvd.fit(X_cpu)

        all_attr = [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
        ]

    finally:
        client.close()

    for attr in all_attr:
        with_sign = False if attr in ['components_'] else True
        cuml_res = (getattr(cutsvd, attr))
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
Example #16
def test_pca_fit(data_info, input_type, client):

    nrows, ncols, n_parts = data_info
    if nrows == int(9e6) and pytest.max_gpu_memory < 48:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 256
            ncols = ncols * pytest.max_gpu_memory // 256
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD

    from cuml.dask.datasets import make_blobs

    X, _ = make_blobs(n_samples=nrows,
                      n_features=ncols,
                      centers=1,
                      n_parts=n_parts,
                      cluster_std=0.5,
                      random_state=10,
                      dtype=np.float32)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        X_cpu = X_train.compute().to_pandas().values
    elif input_type == "array":
        X_train = X
        X_cpu = cp.asnumpy(X_train.compute())

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_train)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X_cpu)

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    for attr in all_attr:
        with_sign = False if attr in ['components_'] else True
        cuml_res = (getattr(cutsvd, attr))
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.to_numpy()
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
Example #17
def test_pca_fit(nrows, ncols, n_parts, client=None):

    owns_cluster = False
    if client is None:
        owns_cluster = True
        cluster = LocalCUDACluster(threads_per_worker=1)
        client = Client(cluster)

    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from sklearn.decomposition import TruncatedSVD

    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(nrows,
                           ncols,
                           1,
                           n_parts,
                           cluster_std=0.5,
                           verbose=False,
                           random_state=10,
                           dtype=np.float32)

    wait(X_cudf)

    X = X_cudf.compute().to_pandas().values

    cutsvd = daskTPCA(n_components=5)
    cutsvd.fit(X_cudf)

    sktsvd = TruncatedSVD(n_components=5, algorithm="arpack")
    sktsvd.fit(X)

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    if owns_cluster:
        client.close()
        cluster.close()

    for attr in all_attr:
        with_sign = False if attr in ['components_'] else True
        cuml_res = (getattr(cutsvd, attr))
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(sktsvd, attr)
        if attr == 'singular_values_':
            assert array_equal(cuml_res, skl_res, 1, with_sign=with_sign)
        else:
            assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
Example #18
def test_pca_fit(nrows, ncols, n_parts, client=None):

    owns_cluster = False
    if client is None:
        owns_cluster = True
        cluster = LocalCUDACluster(threads_per_worker=1)
        client = Client(cluster)

    from cuml.dask.decomposition import PCA as daskPCA
    from sklearn.decomposition import PCA

    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(nrows,
                           ncols,
                           1,
                           n_parts,
                           cluster_std=0.5,
                           verbose=False,
                           random_state=10,
                           dtype=np.float32)

    wait(X_cudf)

    X = X_cudf.compute().to_pandas().values

    cupca = daskPCA(n_components=5, whiten=True)
    cupca.fit(X_cudf)

    skpca = PCA(n_components=5, whiten=True, svd_solver="full")
    skpca.fit(X)

    from cuml.test.utils import array_equal

    all_attr = [
        'singular_values_', 'components_', 'explained_variance_',
        'explained_variance_ratio_'
    ]

    if owns_cluster:
        client.close()
        cluster.close()

    for attr in all_attr:
        with_sign = False if attr in ['components_'] else True
        cuml_res = (getattr(cupca, attr))
        if type(cuml_res) == np.ndarray:
            cuml_res = cuml_res.as_matrix()
        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
Example #19
def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):

    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=True,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=1,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf)
        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        from sklearn.metrics import adjusted_rand_score

        cumlPred = cumlLabels.compute().to_pandas().values

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = y.compute().to_pandas().values

        score = adjusted_rand_score(labels.reshape(labels.shape[0]), cumlPred)

        assert 1.0 == score

    finally:
        client.close()
Example #20
def test_pca_fit_transform_fp32_noncomponents(nrows, ncols, n_parts, client):
    # Tests the case when n_components is not passed for MG scenarios
    from cuml.dask.decomposition import PCA as daskPCA
    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=1,
                           n_parts=n_parts,
                           cluster_std=1.5,
                           random_state=10, dtype=np.float32)

    cupca = daskPCA(whiten=False)
    res = cupca.fit_transform(X_cudf)
    res = res.compute()
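    # With n_components unset, all input columns are kept, so the width
    # equals ncols (presumably 20 in this test's parametrization).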
    assert res.shape[0] == nrows and res.shape[1] == 20
Example #21
def test_pca_fit_transform_fp32(data_info, client):

    nrows, ncols, n_parts = data_info
    from cuml.dask.decomposition import TruncatedSVD as daskTPCA
    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=1,
                           n_parts=n_parts,
                           cluster_std=1.5,
                           random_state=10,
                           dtype=np.float32)

    cutsvd = daskTPCA(n_components=20)
    cutsvd.fit_transform(X_cudf)
Example #22
def test_pca_fit_transform_fp64(nrows, ncols, n_parts, client):

    from cuml.dask.decomposition import PCA as daskPCA
    from cuml.dask.datasets import make_blobs

    X_cudf, _ = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=1,
                           n_parts=n_parts,
                           cluster_std=1.5,
                           random_state=10, dtype=np.float64)

    cupca = daskPCA(n_components=30, whiten=False)
    res = cupca.fit_transform(X_cudf)
    res = res.compute()
    assert res.shape[0] == nrows and res.shape[1] == 30
Example #23
def test_pca_fit(nrows, ncols, n_parts, cluster):

    client = Client(cluster)

    try:

        from cuml.dask.decomposition import PCA as daskPCA
        from sklearn.decomposition import PCA

        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=0.5, verbose=False,
                               random_state=10, dtype=np.float32)

        wait(X_cudf)

        print(str(X_cudf.head(3)))

        try:

            cupca = daskPCA(n_components=5, whiten=True)
            cupca.fit(X_cudf)
        except Exception as e:
            print(str(e))

        X = X_cudf.compute().to_pandas().values

        skpca = PCA(n_components=5, whiten=True, svd_solver="full")
        skpca.fit(X)

        from cuml.test.utils import array_equal

        all_attr = ['singular_values_', 'components_',
                    'explained_variance_', 'explained_variance_ratio_']

        for attr in all_attr:
            with_sign = False if attr in ['components_'] else True
            cuml_res = (getattr(cupca, attr))
            if type(cuml_res) == np.ndarray:
                cuml_res = cuml_res.as_matrix()
            skl_res = getattr(skpca, attr)
            assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
    finally:
        client.close()
Example #24
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, cluster):

    client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(nrows, ncols, 1, n_parts,
                               cluster_std=1.5, verbose=False,
                               random_state=10, dtype=np.float32)

        wait(X_cudf)

        cupca = daskPCA(n_components=20, whiten=True)
        cupca.fit_transform(X_cudf)

    finally:
        client.close()
Example #25
def test_transform(nrows, ncols, nclusters, n_parts, cluster):

    client = Client(cluster)

    try:

        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=True,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)

        labels = y.compute().to_pandas().values
        labels = labels.reshape(labels.shape[0])

        xformed = cumlModel.transform(X_cudf).compute()

        assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        xformed_labels = np.argmin(xformed.to_pandas().to_numpy(), axis=1)

        from sklearn.metrics import adjusted_rand_score
        assert adjusted_rand_score(labels, xformed_labels)

    finally:
        client.close()
Example #26
def test_getattr(cluster):

    client = Client(cluster)

    try:
        # Test getattr on local param
        kmeans_model = KMeans(client=client)

        assert kmeans_model.client is not None

        # Test getattr on local_model param with a non-distributed model

        X, y = make_blobs(n_samples=5,
                          n_features=5,
                          centers=2,
                          n_parts=2,
                          cluster_std=0.01,
                          random_state=10)

        wait(X)

        kmeans_model.fit(X)

        assert kmeans_model.cluster_centers_ is not None
        assert isinstance(kmeans_model.cluster_centers_, cupy.ndarray)

        # Test getattr on trained distributed model

        X, y = load_text_corpus(client)

        print(str(X.compute()))

        nb_model = MultinomialNB(client=client)
        nb_model.fit(X, y)

        assert nb_model.feature_count_ is not None
        assert isinstance(nb_model.feature_count_, cupy.ndarray)

    finally:
        client.close()
Example #27
def test_pca_fit_transform_fp32(nrows, ncols, n_parts, cluster):

    client = Client(cluster)

    try:
        from cuml.dask.decomposition import TruncatedSVD as daskTPCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(n_samples=nrows,
                               n_features=ncols,
                               centers=1,
                               n_parts=n_parts,
                               cluster_std=1.5, verbose=False,
                               random_state=10, dtype=np.float32)

        wait(X_cudf)

        cutsvd = daskTPCA(n_components=20)
        cutsvd.fit_transform(X_cudf)

    finally:
        client.close()
Example #28
def test_pca_fit_transform_fp64(nrows, ncols, n_parts, cluster):

    client = Client(cluster)

    try:
        from cuml.dask.decomposition import PCA as daskPCA
        from cuml.dask.datasets import make_blobs

        X_cudf, _ = make_blobs(n_samples=nrows,
                               n_features=ncols,
                               centers=1,
                               n_parts=n_parts,
                               cluster_std=1.5,
                               random_state=10,
                               dtype=np.float64)

        wait(X_cudf)

        cupca = daskPCA(n_components=30, whiten=False)
        cupca.fit_transform(X_cudf)

    finally:
        client.close()
Example #29
def test_getattr(cluster):

    client = Client(cluster)

    # Test getattr on local param
    kmeans_model = KMeans(client=client)

    assert kmeans_model.client is not None

    # Test getattr on local_model param with a non-distributed model

    X_cudf, y = make_blobs(5,
                           5,
                           2,
                           2,
                           cluster_std=0.01,
                           verbose=False,
                           random_state=10)

    wait(X_cudf)

    kmeans_model.fit(X_cudf)

    assert kmeans_model.cluster_centers_ is not None
    assert isinstance(kmeans_model.cluster_centers_, cudf.DataFrame)

    # Test getattr on trained distributed model

    X, y = load_text_corpus(client)

    print(str(X.compute()))

    nb_model = MultinomialNB(client=client)
    nb_model.fit(X, y)

    assert nb_model.feature_count_ is not None
    assert isinstance(nb_model.feature_count_, cupy.ndarray)
Example #30
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster):

    client = None

    try:

        client = Client(cluster)
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            y = y_train
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        actual_score = cumlModel.score(X_train)

        predictions = cumlModel.predict(X_train).compute()

        if input_type == "dataframe":
            X = cp.array(X_train.compute().as_gpu_matrix())
            predictions = cp.array(predictions)

            centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())
        elif input_type == "array":
            X = X_train.compute()
            centers = cumlModel.cluster_centers_

        expected_score = 0
        for idx, label in enumerate(predictions):

            x = X[idx]
            y = centers[label]

            dist = cp.sqrt(cp.sum((x - y)**2))
            expected_score += dist**2

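        # As in Example #12, score is negative inertia, so expected_score
        # is negated in the comparison below.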
        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        client.close()
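
To run one of these tests outside pytest, a minimal driver along the following lines should work, assuming the preamble sketched after Example #1 (the parameter values are illustrative):

if __name__ == "__main__":
    local_cluster = LocalCUDACluster(threads_per_worker=1)
    try:
        # e.g. the KMeans transform test from Example #25
        test_transform(nrows=1000, ncols=20, nclusters=5, n_parts=2,
                       cluster=local_cluster)
    finally:
        local_cluster.close()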