def test_hdbscan_cluster_patterns_extract_clusters(dataset, nrows,
                                                   connectivity,
                                                   cluster_selection_epsilon,
                                                   cluster_selection_method,
                                                   min_cluster_size,
                                                   allow_single_cluster,
                                                   max_cluster_size,
                                                   min_samples):

    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    cuml_agg._extract_clusters(sk_agg.condensed_tree_)

    assert adjusted_rand_score(cuml_agg.labels_test, sk_agg.labels_) == 1.0
    assert np.allclose(cp.asnumpy(cuml_agg.probabilities_test),
                       sk_agg.probabilities_)
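
# The tests in this file assume a shared `get_pattern(name, n_samples)`
# helper that returns `[(X, y), params]`: a synthetic dataset plus a dict of
# per-dataset parameter overrides that the tests merge into `default_base`.
# The sketch below (hypothetical name `_get_pattern_sketch`, assumed dataset
# names and override values) only illustrates that contract; it is not the
# canonical helper used by the test suite.
def _get_pattern_sketch(name, n_samples):
    from sklearn import datasets

    if name == 'noisy_circles':
        data = datasets.make_circles(n_samples=n_samples, factor=.5,
                                     noise=.05)
        params = {'n_clusters': 2}
    elif name == 'noisy_moons':
        data = datasets.make_moons(n_samples=n_samples, noise=.05)
        params = {'n_clusters': 2}
    elif name == 'blobs':
        data = datasets.make_blobs(n_samples=n_samples, random_state=8)
        params = {'n_clusters': 3}
    else:  # 'no_structure': uniform noise with no labels
        data = np.random.rand(n_samples, 2), None
        params = {}

    # First element: the (X, y) tuple; second element: per-dataset overrides
    return [data, params]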
def test_kmeans_sklearn_comparison_default(name, nrows, random_state):

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3}

    pat = get_pattern(name, nrows)
    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'],
                              random_state=random_state,
                              n_init=10,
                              output_type='numpy')

    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X)
    cu_score = adjusted_rand_score(cu_y_pred, y)

    kmeans = cluster.KMeans(random_state=random_state,
                            n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)
    sk_score = adjusted_rand_score(sk_y_pred, y)

    assert sk_score - 1e-2 <= cu_score <= sk_score + 1e-2
def test_dbscan_default(name, client):
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 0.5
    default_base = {'quantile': .3,
                    'eps': eps,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 2}

    n_samples = 500
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
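
# `assert_dbscan_equal` (used above and in the sklearn comparison test below)
# is assumed to compare two DBSCAN labelings point by point: core points must
# carry identical labels, noise must stay noise, and a border point only needs
# some core point with the same label within `eps`. The hypothetical
# `_assert_dbscan_equal_sketch` below illustrates that assumed contract and is
# not the helper actually imported by these tests.
def _assert_dbscan_equal_sketch(ref, actual, X, core_indices, eps):
    core_set = set(int(i) for i in core_indices)
    eps2 = eps ** 2

    for i in range(X.shape[0]):
        la, lb = ref[i], actual[i]
        if i in core_set:
            # Core points must match exactly between the two clusterings
            assert la == lb, f"Core point mismatch at #{i}: {lb} != {la}"
        elif la == -1:
            # Noise points must remain noise
            assert lb == -1, f"Noise point mislabelled at #{i}: {lb}"
        else:
            # Border points: require a same-labelled core point within eps
            connected = any(
                lb == actual[j] and np.sum((X[i] - X[j]) ** 2) <= eps2
                for j in core_set
            )
            assert connected, f"Border point #{i} not connected to a cluster"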
def test_dbscan_no_calc_core_point_indices():

    params = {'eps': 1.1, 'min_samples': 4}
    n_samples = 1000
    pat = get_pattern("noisy_moons", n_samples)
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False
    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=5,
                           output_type='numpy',
                           calc_core_sample_indices=False)

    cuml_dbscan.fit_predict(X)

    # Make sure the core sample indices were not computed
    assert cuml_dbscan.core_sample_indices_ is None
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_cluster_size,
                                  allow_single_cluster,
                                  max_cluster_size,
                                  min_samples):

    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    assert (len(np.unique(sk_agg.labels_)) ==
            len(cp.unique(cuml_agg.labels_)))
    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) > 0.95

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.1, atol=0.1)
def test_dbscan_sklearn_comparison(name, nrows, eps):
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    default_base = {'quantile': .2,
                    'eps': eps,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 2}

    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)