def test_dbscan_sklearn_comparison(name, nrows):
    """Compare cuML DBSCAN against scikit-learn DBSCAN on one dataset.

    Parameters
    ----------
    name
        Dataset identifier, forwarded unchanged to ``get_pattern``.
    nrows : int
        Number of samples requested from ``get_pattern``. When this is
        500000 or more, only the cuML model is run and nothing is asserted.
    """
    default_base = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    pat = get_pattern(name, nrows)

    # pat[1] holds per-dataset overrides layered on top of the defaults;
    # only 'eps' is read below.
    params = default_base.copy()
    params.update(pat[1])

    # pat[0] unpacks into (features, second item); the second item is
    # not needed for this comparison.
    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    # The scikit-learn reference is only computed below this size
    # (presumably because the CPU run is too slow for larger inputs).
    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)
        assert sk_n_clusters == cu_n_clusters
        # The comparison result must be asserted, otherwise a label mismatch
        # is silently ignored.
        # NOTE(review): assumes clusters_equal returns a bool, as the kmeans
        # comparison tests rely on -- confirm against its definition.
        assert clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
Example #2
0
def test_kmeans_sklearn_comparison(name):
    """Compare cuML KMeans against scikit-learn KMeans on a 5000-row dataset.

    Parameters
    ----------
    name : str
        Dataset identifier, forwarded to ``get_pattern``. The value
        ``'noisy_circles'`` selects a looser, label-order-independent check.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, 5000)

    # pat[1] holds per-dataset overrides layered on top of the defaults;
    # only 'n_clusters' is read below.
    params = default_base.copy()
    params.update(pat[1])

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    # pat[0] unpacks into (features, second item); the second item is
    # not needed for this comparison.
    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)
    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    if name == 'noisy_circles':
        # Noisy circles clusters are rotated in the results. Since only two
        # clusters are compared, it is enough to check that both libraries
        # put approximately the same number of points in each cluster.
        # abs() is required: a signed difference would let any negative gap
        # pass regardless of its size.
        label_sum_gap = abs(np.sum(sk_y_pred) - np.sum(cu_y_pred))
        assert label_sum_gap / len(sk_y_pred) < 1e-10
    else:
        # The comparison result must be asserted, otherwise a label mismatch
        # is silently ignored.
        # NOTE(review): assumes clusters_equal returns a bool -- confirm
        # against its definition.
        assert clusters_equal(sk_y_pred, cu_y_pred, params['n_clusters'])
Example #3
0
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans labels and score against scikit-learn KMeans.

    Parameters
    ----------
    name : str
        Dataset identifier, forwarded to ``get_pattern``. ``'noisy_circles'``
        and ``'aniso'`` get dataset-specific tolerances.
    nrows : int
        Number of samples requested from ``get_pattern``. When this is
        500000 or more, only the cuML model is run and nothing is asserted.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    # pat[1] holds per-dataset overrides layered on top of the defaults;
    # only 'n_clusters' is read below.
    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    # pat[0] unpacks into (features, second item); the second item is
    # not needed for this comparison.
    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X).to_array()

    # The scikit-learn reference is only computed below this size
    # (presumably because the CPU run is too slow for larger inputs).
    if nrows >= 500000:
        return

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)

    # Normalised gap between the label sums of the two predictions.
    # abs() is required: a signed difference would let any negative gap
    # pass regardless of its size.
    calculation = (
        abs(np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)
    )

    # NOTE(review): this check is one-sided as written -- it only bounds how
    # far the cuML score may exceed the scikit-learn score. Confirm whether
    # a symmetric (abs) comparison is intended.
    score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3

    if name == 'noisy_circles':
        # Noisy circles clusters are rotated in the results. Since only two
        # clusters are compared, it is enough to check that both libraries
        # put approximately the same number of points in each cluster.
        assert (calculation < 4e-3) and score_test
        return

    if name == 'aniso':
        # aniso dataset border points tend to differ in the frontier
        # between clusters when compared to sklearn
        tol = 2e-2
    else:
        # A small tolerance keeps the other datasets robust to minor
        # behavior changes between library versions. Visually it is very
        # clear that the algorithm works.
        tol = 1e-2

    assert (clusters_equal(
        sk_y_pred, cu_y_pred, params['n_clusters'],
        tol=tol)) and score_test
Example #4
0
def test_dbscan_sklearn_comparison(name, use_handle):
    """Compare cuML DBSCAN against scikit-learn DBSCAN on a 1500-row dataset.

    Parameters
    ----------
    name
        Dataset identifier, forwarded unchanged to ``get_pattern``.
    use_handle
        Forwarded to ``get_handle``; the handle it returns is passed to the
        cuML estimator.
    """
    # Skipping datasets of known discrepancies in PR83 while they are corrected
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, 1500)

    # pat[1] holds per-dataset overrides layered on top of the defaults;
    # only 'eps' is read below.
    params = default_base.copy()
    params.update(pat[1])

    dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    # get_handle returns a pair; only the handle itself is used here.
    handle, _ = get_handle(use_handle)
    cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

    # pat[0] unpacks into (features, second item); the second item is
    # not needed for this comparison.
    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    # Synchronise on the estimator's handle before inspecting the results.
    cuml_dbscan.handle.sync()

    assert sk_n_clusters == cu_n_clusters

    # The comparison result must be asserted, otherwise a label mismatch is
    # silently ignored.
    # NOTE(review): assumes clusters_equal returns a bool, as the kmeans
    # comparison tests rely on -- confirm against its definition.
    assert clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
Example #5
0
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans labels and score against scikit-learn KMeans.

    Parameters
    ----------
    name : str
        Dataset identifier, forwarded to ``get_pattern``. The value
        ``'noisy_circles'`` selects a looser, label-order-independent check.
    nrows : int
        Number of samples requested from ``get_pattern``. When this is
        500000 or more, only the cuML model is run and nothing is asserted.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    # pat[1] holds per-dataset overrides layered on top of the defaults;
    # only 'n_clusters' is read below.
    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    # pat[0] unpacks into (features, second item); the second item is
    # not needed for this comparison.
    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    # The scikit-learn reference is only computed below this size
    # (presumably because the CPU run is too slow for larger inputs).
    if nrows >= 500000:
        return

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)

    # Normalised gap between the label sums of the two predictions.
    # abs() is required: a signed difference would let any negative gap
    # pass regardless of its size.
    calculation = (
        abs(np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)
    )

    # Score each model once and reuse the values for both the diagnostic
    # print and the comparison.
    cu_score = cuml_kmeans.score(X)
    sk_score = kmeans.score(X)
    print(cu_score, sk_score)

    # NOTE(review): this check is one-sided as written -- it only bounds how
    # far the cuML score may exceed the scikit-learn score. Confirm whether
    # a symmetric (abs) comparison is intended.
    score_test = (cu_score - sk_score) < 2e-3

    if name == 'noisy_circles':
        # Noisy circles clusters are rotated in the results. Since only two
        # clusters are compared, it is enough to check that both libraries
        # put approximately the same number of points in each cluster.
        assert (calculation < 2e-3) and score_test
    else:
        assert (clusters_equal(sk_y_pred, cu_y_pred,
                               params['n_clusters'])) and score_test