コード例 #1
0
ファイル: test_dbscan.py プロジェクト: trxcllnt/cuml
def test_dbscan_sklearn_comparison(name, nrows):
    """Check that cuML DBSCAN labels agree with scikit-learn's on a pattern.

    The dataset comes from ``get_pattern(name, nrows)`` and is standardized
    before clustering. The scikit-learn reference is only fitted when
    ``nrows`` is below 500000; for larger inputs only the cuML fit is run.
    """
    defaults = dict(quantile=.3, eps=.5, damping=.9, preference=-200,
                    n_neighbors=10, n_clusters=2)
    pattern = get_pattern(name, nrows)
    params = dict(defaults)
    # Pattern-specific settings take precedence over the defaults.
    params.update(pattern[1])
    X, _ = pattern[0]

    X = StandardScaler().fit_transform(X)

    cu_model = cuDBSCAN(eps=params['eps'], min_samples=5)
    cu_y_pred, _ = fit_predict(cu_model, 'cuml_DBSCAN', X)

    if nrows >= 500000:
        # No scikit-learn reference is computed for the largest inputs.
        return

    sk_model = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_y_pred, _ = fit_predict(sk_model, 'sk_DBSCAN', X)

    # The two labelings must be identical up to a renaming of cluster ids.
    assert adjusted_rand_score(sk_y_pred, cu_y_pred) == 1.0
コード例 #2
0
def test_rand_index_score(name, nrows):
    """Compare ``cu_ars`` with ``sk_ars`` on cuML KMeans predictions.

    KMeans is fitted on the standardized pattern, then both scoring
    functions are applied to the same (true labels, predicted labels) pair
    and their results must match according to ``array_equal``.
    """
    defaults = dict(quantile=.3, eps=.3, damping=.9, preference=-200,
                    n_neighbors=10, n_clusters=3)

    pattern = get_pattern(name, nrows)

    params = dict(defaults)
    # Pattern-specific settings take precedence over the defaults.
    params.update(pattern[1])

    model = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pattern[0]
    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(model, 'cuml_Kmeans', X)

    score_from_cuml = cu_ars(y, cu_y_pred)
    score_from_sklearn = sk_ars(y, cu_y_pred)

    assert array_equal(score_from_cuml, score_from_sklearn)
コード例 #3
0
def test_kmeans_sklearn_comparison_default(name, nrows):
    """Check cuML KMeans scores within 1e-2 of scikit-learn's KMeans.

    Both estimators are fitted on the standardized pattern with
    ``random_state=12``; each prediction is scored against the pattern's
    labels with ``adjusted_rand_score`` and the two scores are compared.
    """
    defaults = dict(quantile=.3, eps=.3, damping=.9, preference=-200,
                    n_neighbors=10, n_clusters=3)

    pattern = get_pattern(name, nrows)

    params = dict(defaults)
    # Pattern-specific settings take precedence over the defaults.
    params.update(pattern[1])
    n_clusters = params['n_clusters']

    cu_model = cuml.KMeans(n_clusters=n_clusters,
                           random_state=12,
                           n_init=10,
                           output_type='numpy')

    X, y = pattern[0]
    X = StandardScaler().fit_transform(X)

    cu_score = adjusted_rand_score(cu_model.fit_predict(X), y)

    sk_model = cluster.KMeans(random_state=12, n_clusters=n_clusters)
    sk_score = adjusted_rand_score(sk_model.fit_predict(X), y)

    # cuML's score must fall inside a +/-1e-2 band around scikit-learn's.
    lower, upper = sk_score - 1e-2, sk_score + 1e-2
    assert lower <= cu_score <= upper
コード例 #4
0
ファイル: test_dbscan.py プロジェクト: daxiongshu/cuml
def test_dbscan_default(name):
    """Compare a default-constructed cuML DBSCAN with scikit-learn's DBSCAN.

    Uses 500 samples of the requested pattern. The cuML estimator only sets
    ``output_type``; scikit-learn is given ``eps=params['eps']`` and
    ``min_samples=5``.
    NOTE(review): presumably those values equal cuML's defaults - confirm,
    especially for patterns whose settings override ``eps``.
    """
    defaults = dict(quantile=.3, eps=.5, damping=.9, preference=-200,
                    n_neighbors=10, n_clusters=2)
    pattern = get_pattern(name, 500)
    params = dict(defaults)
    # Pattern-specific settings take precedence over the defaults.
    params.update(pattern[1])
    X, _ = pattern[0]

    X = StandardScaler().fit_transform(X)

    cu_model = cuDBSCAN(output_type='numpy')
    cu_labels = cu_model.fit_predict(X)

    sk_model = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_labels = sk_model.fit_predict(X)

    # Both implementations must identify the same core samples.
    cu_core = cu_model.core_sample_indices_
    assert array_equal(cu_core, sk_model.core_sample_indices_)

    # Then the labelings themselves must be equivalent.
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cu_model.core_sample_indices_, params['eps'])
コード例 #5
0
def test_hdbscan_cluster_patterns_extract_clusters(
        dataset, nrows, connectivity, cluster_selection_epsilon,
        cluster_selection_method, min_cluster_size, allow_single_cluster,
        max_cluster_size, min_samples):
    """Run cuML's cluster extraction on the reference condensed tree.

    The reference ``hdbscan.HDBSCAN`` is fitted on the pattern; its
    condensed tree is handed to the cuML estimator's ``_extract_clusters``
    and the resulting labels and probabilities are compared with the
    reference ones. The cuML estimator itself is never fitted here.
    """
    # This also tests duplicate data points
    X, _ = get_pattern(dataset, nrows)[0]

    # Settings passed identically to both implementations.
    shared = dict(allow_single_cluster=allow_single_cluster,
                  min_samples=min_samples,
                  min_cluster_size=min_cluster_size,
                  cluster_selection_epsilon=cluster_selection_epsilon,
                  cluster_selection_method=cluster_selection_method)

    cu_model = HDBSCAN(verbose=logger.level_info,
                       max_cluster_size=max_cluster_size,
                       **shared)

    sk_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                               gen_min_span_tree=True,
                               algorithm="generic",
                               **shared)
    sk_model.fit(cp.asnumpy(X))

    cu_model._extract_clusters(sk_model.condensed_tree_)

    ari = adjusted_rand_score(cu_model.labels_test, sk_model.labels_)
    assert ari == 1.0

    cu_probabilities = cp.asnumpy(cu_model.probabilities_test)
    assert np.allclose(cu_probabilities, sk_model.probabilities_)
コード例 #6
0
ファイル: test_dbscan.py プロジェクト: zhangjianting/cuml
def test_dbscan_default(name):
    """Compare a default-constructed cuML DBSCAN with scikit-learn's DBSCAN.

    Uses 500 samples of the requested pattern. The cuML estimator only sets
    ``output_type``; scikit-learn is given ``eps=params['eps']`` and
    ``min_samples=5``.
    NOTE(review): presumably those values equal cuML's defaults - confirm,
    especially for patterns whose settings override ``eps``.
    """
    defaults = dict(quantile=.3, eps=.5, damping=.9, preference=-200,
                    n_neighbors=10, n_clusters=2)
    pattern = get_pattern(name, 500)
    params = dict(defaults)
    # Pattern-specific settings take precedence over the defaults.
    params.update(pattern[1])
    X, _ = pattern[0]

    X = StandardScaler().fit_transform(X)

    cu_model = cuDBSCAN(output_type='numpy')
    cu_y_pred = cu_model.fit_predict(X)

    sk_model = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_y_pred = sk_model.fit_predict(X)

    # The two labelings must be identical up to a renaming of cluster ids.
    assert adjusted_rand_score(sk_y_pred, cu_y_pred) == 1.0
コード例 #7
0
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Check cuML DBSCAN against scikit-learn for a given pattern and eps.

    The dataset comes from ``get_pattern(name, nrows)`` and is standardized
    before clustering. ``eps`` seeds the default settings, but the pattern's
    own settings are applied on top and may replace it. The scikit-learn
    reference is only fitted when ``nrows`` is below 500000.
    """
    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    # Pattern-specific settings take precedence over the defaults.
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=5,
                           output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred = dbscan.fit_predict(X)
        score = adjusted_rand_score(sk_y_pred, cu_y_pred)
        assert score == 1.0

        # Check the core points are equal. The comparison result must be
        # asserted; calling array_equal on its own verifies nothing.
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           dbscan.core_sample_indices_)
コード例 #8
0
def test_dbscan_no_calc_core_point_indices():
    """Check DBSCAN with ``calc_core_sample_indices=False``.

    Fits cuML and scikit-learn DBSCAN on 1000 standardized "noisy_moons"
    samples, compares the labels (excluding the last sample) and verifies
    that cuML leaves ``core_sample_indices_`` as ``None``.
    NOTE(review): cuML is run with ``min_samples=5`` while scikit-learn
    receives ``min_samples=4`` - confirm this mismatch is intentional.
    """
    eps, sk_min_samples = 1.1, 4
    pattern = get_pattern("noisy_moons", 1000)

    X, _ = pattern[0]
    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False
    cu_model = cuDBSCAN(eps=eps,
                        min_samples=5,
                        output_type='numpy',
                        calc_core_sample_indices=False)
    cu_y_pred = cu_model.fit_predict(X)

    sk_model = skDBSCAN(eps=eps, min_samples=sk_min_samples)
    sk_y_pred = sk_model.fit_predict(X)

    # The final sample is left out of the comparison on both sides.
    assert adjusted_rand_score(sk_y_pred[:-1], cu_y_pred[:-1]) == 1.0

    # Make sure we are None
    assert cu_model.core_sample_indices_ is None
コード例 #9
0
ファイル: test_kmeans.py プロジェクト: rnyak/cuml
def test_kmeans_sklearn_comparison_default(name, nrows):
    """Check cuML KMeans scores within 0.03 of scikit-learn's KMeans.

    Both estimators are fitted on the standardized pattern; each prediction
    is scored against the pattern's labels with ``adjusted_rand_score`` and
    the two scores are compared. Only scikit-learn gets a fixed
    ``random_state``.
    """
    defaults = dict(quantile=.3, eps=.3, damping=.9, preference=-200,
                    n_neighbors=10, n_clusters=3)

    pattern = get_pattern(name, nrows)

    params = dict(defaults)
    # Pattern-specific settings take precedence over the defaults.
    params.update(pattern[1])
    n_clusters = params['n_clusters']

    cu_model = cuml.KMeans(n_clusters=n_clusters)

    X, y = pattern[0]
    X = StandardScaler().fit_transform(X)

    cu_score = adjusted_rand_score(cu_model.fit_predict(X), y)

    sk_model = cluster.KMeans(random_state=12, n_clusters=n_clusters)
    sk_score = adjusted_rand_score(sk_model.fit_predict(X), y)

    # cuML score should be in a close neighborhood around scikit-learn's
    lower, upper = sk_score - 0.03, sk_score + 0.03
    assert lower <= cu_score <= upper
コード例 #10
0
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans predictions with scikit-learn's on a pattern.

    The dataset comes from ``get_pattern(name, nrows)`` and is standardized
    before clustering. The scikit-learn reference is only fitted when
    ``nrows`` is below 500000. For ``'noisy_circles'`` only the label sums
    are compared; every other pattern goes through ``clusters_equal`` with
    a pattern-dependent tolerance.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    # Pattern-specific settings take precedence over the defaults.
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X).to_array()

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred = kmeans.fit_predict(X)

        # Noisy circles clusters are rotated in the results,
        # since we are comparing 2 we just need to compare that both clusters
        # have approximately the same number of points.
        # The difference is taken in absolute value: a signed difference
        # would let any large negative imbalance pass the tolerance check.
        calculation = (abs(np.sum(sk_y_pred) - np.sum(cu_y_pred))
                       / len(sk_y_pred))
        # NOTE(review): this score comparison is one-sided (signed
        # difference) - confirm whether an absolute tolerance is intended.
        score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3
        if name == 'noisy_circles':
            assert (calculation < 4e-3) and score_test

        else:
            if name == 'aniso':
                # aniso dataset border points tend to differ in the frontier
                # between clusters when compared to sklearn
                tol = 2e-2
            else:
                # We allow up to 5 points to be different for the other
                # datasets to be robust to small behavior changes
                # between library versions/ small changes. Visually it is
                # very clear that the algorithm work. Will add option
                # to plot if desired in a future version.
                tol = 1e-2
            assert (clusters_equal(
                sk_y_pred, cu_y_pred, params['n_clusters'],
                tol=tol)) and score_test
コード例 #11
0
def test_dbscan_no_calc_core_point_indices(client):
    """Check the Dask DBSCAN leaves core indices unset when asked to.

    Fits ``cuml.dask.cluster.dbscan.DBSCAN`` with
    ``calc_core_sample_indices=False`` on 1000 standardized "noisy_moons"
    samples and verifies ``core_sample_indices_`` is ``None`` afterwards.
    ``client`` is not referenced in the body.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 1.1
    pattern = get_pattern("noisy_moons", 1000)

    X, _ = pattern[0]
    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False
    model = cuDBSCAN(eps=eps,
                     min_samples=5,
                     output_type='numpy',
                     calc_core_sample_indices=False)
    model.fit_predict(X)

    # Make sure we are None
    assert model.core_sample_indices_ is None
コード例 #12
0
ファイル: test_kmeans.py プロジェクト: ziiin/cuml
def test_kmeans_sklearn_comparison(name):
    """Compare cuML KMeans predictions with scikit-learn's on a pattern.

    Uses 10000 samples of the requested pattern, standardized before
    clustering. For ``'noisy_circles'`` only the label sums are compared;
    every other pattern goes through ``clusters_equal``.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, 10000)

    params = default_base.copy()
    # Pattern-specific settings take precedence over the defaults.
    params.update(pat[1])

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # scikit-learn is fitted first, then cuML, as in the original ordering.
    sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)
    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    # Noisy circles clusters are rotated in the results,
    # since we are comparing 2 we just need to compare that both clusters
    # have approximately the same number of points.
    if name == 'noisy_circles':
        # The difference is taken in absolute value: a signed difference
        # would let any large negative imbalance pass the tolerance check.
        imbalance = (abs(np.sum(sk_y_pred) - np.sum(cu_y_pred))
                     / len(sk_y_pred))
        assert imbalance < 2e-3

    else:
        assert clusters_equal(sk_y_pred, cu_y_pred, params['n_clusters'])
コード例 #13
0
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method, min_cluster_size,
                                  allow_single_cluster, max_cluster_size,
                                  min_samples):
    """Fit cuML HDBSCAN and the reference HDBSCAN on a pattern and compare.

    Checks the condensed tree and cluster counts through the shared
    assertion helpers, then the number of distinct labels, the adjusted
    Rand score (must exceed 0.95) and the sorted cluster persistences.
    """
    # This also tests duplicate data points
    X, _ = get_pattern(dataset, nrows)[0]

    # Settings passed identically to both implementations.
    shared = dict(allow_single_cluster=allow_single_cluster,
                  min_samples=min_samples,
                  min_cluster_size=min_cluster_size,
                  cluster_selection_epsilon=cluster_selection_epsilon,
                  cluster_selection_method=cluster_selection_method)

    cu_model = HDBSCAN(verbose=logger.level_info,
                       max_cluster_size=max_cluster_size,
                       **shared)
    cu_model.fit(X)

    sk_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                               gen_min_span_tree=True,
                               algorithm="generic",
                               **shared)
    sk_model.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_model, min_cluster_size)
    assert_cluster_counts(sk_model, cu_model)

    n_sk_labels = len(np.unique(sk_model.labels_))
    n_cu_labels = len(cp.unique(cu_model.labels_))
    assert n_sk_labels == n_cu_labels

    assert adjusted_rand_score(cu_model.labels_, sk_model.labels_) > 0.95

    # Persistences are compared after sorting, with loose tolerances.
    sk_persistence = np.sort(sk_model.cluster_persistence_)
    cu_persistence = np.sort(cu_model.cluster_persistence_)
    assert np.allclose(sk_persistence, cu_persistence, rtol=0.1, atol=0.1)
コード例 #14
0
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans predictions with scikit-learn's on a pattern.

    The dataset comes from ``get_pattern(name, nrows)`` and is standardized
    before clustering. The scikit-learn reference is only fitted when
    ``nrows`` is below 500000. For ``'noisy_circles'`` only the label sums
    are compared; every other pattern goes through ``clusters_equal``.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, nrows)

    params = default_base.copy()
    # Pattern-specific settings take precedence over the defaults.
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)

        # Noisy circles clusters are rotated in the results,
        # since we are comparing 2 we just need to compare that both clusters
        # have approximately the same number of points.
        # The difference is taken in absolute value: a signed difference
        # would let any large negative imbalance pass the tolerance check.
        calculation = (abs(np.sum(sk_y_pred) - np.sum(cu_y_pred))
                       / len(sk_y_pred))

        # Score each model once and reuse the values for output and check.
        cu_score = cuml_kmeans.score(X)
        sk_score = kmeans.score(X)
        print(cu_score, sk_score)
        # NOTE(review): this score comparison is one-sided (signed
        # difference) - confirm whether an absolute tolerance is intended.
        score_test = (cu_score - sk_score) < 2e-3
        if name == 'noisy_circles':
            assert (calculation < 2e-3) and score_test

        else:
            assert (clusters_equal(sk_y_pred, cu_y_pred,
                                   params['n_clusters'])) and score_test
コード例 #15
0
def test_dbscan_sklearn_comparison(name, nrows, eps, client):
    """Check the Dask cuML DBSCAN against scikit-learn for a pattern and eps.

    The dataset comes from ``get_pattern(name, nrows)`` and is standardized
    before clustering. ``eps`` seeds the default settings, but the pattern's
    own settings are applied on top and may replace it, so every later step
    uses ``params['eps']``. The scikit-learn reference is only fitted when
    ``nrows`` is below 500000. ``client`` is not referenced in the body.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }

    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    # Pattern-specific settings take precedence over the defaults.
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct. The radius passed here is the one
        # both estimators were actually built with, and the check runs once
        # (it was previously duplicated before the core-point comparison).
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, params['eps'])
コード例 #16
0
def test_dbscan_sklearn_comparison(name, use_handle):
    """Compare cuML DBSCAN with scikit-learn's on 1500 samples of a pattern.

    The cuML estimator is built with the handle returned by
    ``get_handle(use_handle)``. Both estimators are run through
    ``fit_predict``; the cluster counts and the labelings must agree.
    """
    # Skipping datasets of known discrepancies in PR83 while they are corrected
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, 1500)

    params = default_base.copy()
    # Pattern-specific settings take precedence over the defaults.
    params.update(pat[1])

    dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    handle, stream = get_handle(use_handle)
    cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # scikit-learn is fitted first, then cuML, as in the original ordering.
    sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    cuml_dbscan.handle.sync()

    assert sk_n_clusters == cu_n_clusters

    # The comparison result must be asserted; calling clusters_equal on its
    # own would verify nothing about the labelings.
    assert clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
コード例 #17
0
ファイル: test_dbscan.py プロジェクト: daxiongshu/cuml
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Check cuML DBSCAN against scikit-learn for a given pattern and eps.

    For the 500000-row ``'blobs'`` case with ``pytest.max_gpu_memory`` below
    32, the row count is scaled down when ``pytest.adapt_stress_test`` is
    set and the test is skipped otherwise. Both estimators use the ``eps``
    argument directly. The scikit-learn reference is only fitted when the
    (possibly scaled) ``nrows`` is below 500000.
    """
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            # The two literals are concatenated, so the first one needs a
            # trailing space to keep the sentences apart in the message.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    pat = get_pattern(name, nrows)
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)