def test_dbscan_sklearn_comparison(name, nrows):
    """cuML DBSCAN labels must match scikit-learn's (adjusted Rand == 1)."""
    base_params = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }
    pat = get_pattern(name, nrows)
    params = {**base_params, **pat[1]}
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    # The CPU reference is only run below the stress-test size threshold.
    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)
        score = adjusted_rand_score(sk_y_pred, cu_y_pred)
        assert score == 1.0
def test_rand_index_score(name, nrows):
    """cuML's adjusted Rand score must agree with scikit-learn's on the
    same KMeans labelling."""
    base_params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pat = get_pattern(name, nrows)
    params = {**base_params, **pat[1]}

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])
    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    # Score the same prediction with both metric implementations.
    cu_score = cu_ars(y, cu_y_pred)
    cu_score_using_sk = sk_ars(y, cu_y_pred)

    assert array_equal(cu_score, cu_score_using_sk)
def test_kmeans_sklearn_comparison_default(name, nrows):
    """cuML KMeans ARI must land within ±1e-2 of scikit-learn's."""
    base_params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pat = get_pattern(name, nrows)
    params = {**base_params, **pat[1]}

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'],
                              random_state=12,
                              n_init=10,
                              output_type='numpy')

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)
    cu_score = adjusted_rand_score(cuml_kmeans.fit_predict(X), y)

    sk_kmeans = cluster.KMeans(random_state=12,
                               n_clusters=params['n_clusters'])
    sk_score = adjusted_rand_score(sk_kmeans.fit_predict(X), y)

    assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2
def test_dbscan_default(name):
    """cuML DBSCAN with default settings must reproduce sklearn's core
    points and labelling on a 500-sample pattern."""
    base_params = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }
    pat = get_pattern(name, 500)
    params = {**base_params, **pat[1]}
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
def test_hdbscan_cluster_patterns_extract_clusters(
        dataset, nrows, connectivity, cluster_selection_epsilon,
        cluster_selection_method, min_cluster_size, allow_single_cluster,
        max_cluster_size, min_samples):
    """Feed the reference library's condensed tree into cuML's cluster
    extraction and require identical labels/probabilities."""
    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    # Parameters passed identically to both implementations.
    shared_kwargs = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       max_cluster_size=max_cluster_size,
                       **shared_kwargs)

    sk_agg = hdbscan.HDBSCAN(approx_min_span_tree=False,
                             gen_min_span_tree=True,
                             algorithm="generic",
                             **shared_kwargs)
    sk_agg.fit(cp.asnumpy(X))

    # Extract clusters from the reference condensed tree.
    cuml_agg._extract_clusters(sk_agg.condensed_tree_)

    assert adjusted_rand_score(cuml_agg.labels_test, sk_agg.labels_) == 1.0
    assert np.allclose(cp.asnumpy(cuml_agg.probabilities_test),
                       sk_agg.probabilities_)
def test_dbscan_default(name):
    """cuML DBSCAN with defaults must match sklearn's labelling exactly
    (adjusted Rand score of 1.0) on a 500-sample pattern."""
    base_params = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }
    pat = get_pattern(name, 500)
    params = {**base_params, **pat[1]}
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X)

    sk_y_pred = skDBSCAN(eps=params['eps'], min_samples=5).fit_predict(X)

    assert adjusted_rand_score(sk_y_pred, cu_y_pred) == 1.0
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Compare cuML DBSCAN against scikit-learn's implementation.

    Checks both the predicted labels (via adjusted Rand score) and the
    core-sample indices. The scikit-learn reference is skipped above the
    stress-test size threshold to keep runtime reasonable.
    """
    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5,
                           output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred = dbscan.fit_predict(X)
        score = adjusted_rand_score(sk_y_pred, cu_y_pred)
        assert score == 1.0

        # Check the core points are equal
        # Bug fix: the original called array_equal() without asserting on
        # its result, so a core-point mismatch could never fail the test.
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           dbscan.core_sample_indices_)
def test_dbscan_no_calc_core_point_indices():
    """core_sample_indices_ must be None when its computation is disabled,
    while the labelling still matches scikit-learn's."""
    params = {'eps': 1.1, 'min_samples': 4}
    X, y = get_pattern("noisy_moons", 1000)[0]
    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False
    # NOTE(review): cuML is run with min_samples=5 while sklearn receives
    # min_samples=4 via **params — confirm this mismatch is intentional.
    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5,
                           output_type='numpy',
                           calc_core_sample_indices=False)
    cu_y_pred = cuml_dbscan.fit_predict(X)

    sk_y_pred = skDBSCAN(**params).fit_predict(X)

    assert adjusted_rand_score(sk_y_pred[:-1], cu_y_pred[:-1]) == 1.0

    # Make sure we are None
    assert cuml_dbscan.core_sample_indices_ is None
def test_kmeans_sklearn_comparison_default(name, nrows):
    """cuML KMeans ARI must land within ±0.03 of scikit-learn's."""
    base_params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pat = get_pattern(name, nrows)
    params = {**base_params, **pat[1]}

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)
    cu_score = adjusted_rand_score(cuml_kmeans.fit_predict(X), y)

    sk_kmeans = cluster.KMeans(random_state=12,
                               n_clusters=params['n_clusters'])
    sk_score = adjusted_rand_score(sk_kmeans.fit_predict(X), y)

    # cuML score should be in a close neighborhood around scikit-learn's
    assert sk_score - 0.03 <= cu_score <= sk_score + 0.03
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans assignments and inertia score with sklearn's,
    with per-dataset tolerances."""
    base_params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pat = get_pattern(name, nrows)
    params = {**base_params, **pat[1]}

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)
    cu_y_pred = cuml_kmeans.fit_predict(X).to_array()

    if nrows >= 500000:
        # Skip the (slow) CPU reference for stress-sized inputs.
        return

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)

    # Noisy circles clusters are rotated in the results,
    # since we are comparing 2 we just need to compare that both clusters
    # have approximately the same number of points.
    calculation = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)
    score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3

    if name == 'noisy_circles':
        assert (calculation < 4e-3) and score_test
        return

    if name == 'aniso':
        # aniso dataset border points tend to differ in the frontier
        # between clusters when compared to sklearn
        tol = 2e-2
    else:
        # We allow up to 5 points to be different for the other
        # datasets to be robust to small behavior changes
        # between library versions/ small changes. Visually it is
        # very clear that the algorithm work. Will add option
        # to plot if desired in a future version.
        tol = 1e-2
    assert clusters_equal(sk_y_pred, cu_y_pred,
                          params['n_clusters'], tol=tol) and score_test
def test_dbscan_no_calc_core_point_indices(client):
    """Dask DBSCAN: core_sample_indices_ must be None when its
    computation is explicitly disabled."""
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    params = {'eps': 1.1, 'min_samples': 4}
    X, y = get_pattern("noisy_moons", 1000)[0]
    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False
    model = cuDBSCAN(eps=params['eps'], min_samples=5,
                     output_type='numpy',
                     calc_core_sample_indices=False)
    model.fit_predict(X)

    # Make sure we are None
    assert model.core_sample_indices_ is None
def test_kmeans_sklearn_comparison(name):
    """Cluster a 10000-sample pattern with both KMeans implementations and
    require equivalent partitions."""
    base_params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pat = get_pattern(name, 10000)
    params = {**base_params, **pat[1]}

    sk_model = cluster.KMeans(n_clusters=params['n_clusters'])
    cu_model = cuml.KMeans(n_clusters=params['n_clusters'])

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, _ = fit_predict(sk_model, 'sk_Kmeans', X)
    cu_y_pred, _ = fit_predict(cu_model, 'cuml_Kmeans', X)

    # Noisy circles clusters are rotated in the results,
    # since we are comparing 2 we just need to compare that both clusters
    # have approximately the same number of points.
    if name == 'noisy_circles':
        assert (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred) < 2e-3
    else:
        assert clusters_equal(sk_y_pred, cu_y_pred, params['n_clusters'])
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_cluster_size, allow_single_cluster,
                                  max_cluster_size, min_samples):
    """Fit cuML HDBSCAN and the reference hdbscan library on the same
    pattern; compare trees, cluster counts, labels and persistence."""
    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    # Parameters passed identically to both implementations.
    shared_kwargs = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       max_cluster_size=max_cluster_size,
                       **shared_kwargs)
    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(approx_min_span_tree=False,
                             gen_min_span_tree=True,
                             algorithm="generic",
                             **shared_kwargs)
    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    # Same number of distinct labels, and near-identical partitions.
    assert len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_))
    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) > 0.95

    # Persistence values compared order-independently.
    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.1, atol=0.1)
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans with scikit-learn on a synthetic pattern.

    Below the stress-test threshold, checks that both implementations
    produce equivalent partitions and near-identical inertia scores.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pat = get_pattern(name, nrows)
    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)
    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)

        # Noisy circles clusters are rotated in the results,
        # since we are comparing 2 we just need to compare that both clusters
        # have approximately the same number of points.
        calculation = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)
        # Fix: removed a leftover debugging print of the two scores.
        score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3
        if name == 'noisy_circles':
            assert (calculation < 2e-3) and score_test
        else:
            assert (clusters_equal(sk_y_pred, cu_y_pred,
                                   params['n_clusters'])) and score_test
def test_dbscan_sklearn_comparison(name, nrows, eps, client):
    """Dask cuML DBSCAN vs scikit-learn: core points and labels must match.

    The scikit-learn reference is skipped above the stress-test size
    threshold to keep runtime reasonable.
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        # Fix: the original asserted label equivalence twice with identical
        # arguments; the redundant duplicate call was removed.
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)
def test_dbscan_sklearn_comparison(name, use_handle):
    """Compare cuML DBSCAN (run on an explicit handle) with scikit-learn.

    Requires the same number of clusters and equivalent label partitions.
    """
    # Skipping datasets of known discrepancies in PR83 while they are corrected
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }
    pat = get_pattern(name, 1500)
    params = default_base.copy()
    params.update(pat[1])

    dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    handle, stream = get_handle(use_handle)
    cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    # Wait for the asynchronous GPU work before comparing results.
    cuml_dbscan.handle.sync()

    assert sk_n_clusters == cu_n_clusters
    # Bug fix: the original discarded clusters_equal's return value, so a
    # label disagreement could never fail the test; assert on it (matching
    # how clusters_equal is used elsewhere in this suite).
    assert clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Compare cuML and sklearn DBSCAN; shrink or skip the stress-sized
    'blobs' case when GPU memory is insufficient."""
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    base_params = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }
    pat = get_pattern(name, nrows)
    params = {**base_params, **pat[1]}
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    # The CPU reference is only run below the stress-test size threshold.
    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)