def test_dbscan_sklearn_comparison(name, nrows):
    """Check that cuML DBSCAN agrees with scikit-learn DBSCAN on a dataset.

    cuML DBSCAN is always fitted on ``nrows`` samples of the ``name``
    pattern. Below 500000 rows, scikit-learn DBSCAN is fitted on the same
    data and both the number of clusters and the label assignment must match.

    Parameters
    ----------
    name
        Dataset pattern identifier forwarded to ``get_pattern``.
    nrows
        Number of samples requested from ``get_pattern``.
    """
    default_base = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2,
    }

    pat = get_pattern(name, nrows)

    # pat[1] carries per-dataset overrides that take precedence over defaults.
    params = default_base.copy()
    params.update(pat[1])

    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    # The scikit-learn reference is only run below 500000 rows (presumably
    # because it is too slow beyond that); larger runs only exercise cuML.
    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)

        assert sk_n_clusters == cu_n_clusters
        # The comparison result must be asserted; calling clusters_equal
        # without checking its return value verifies nothing.
        assert clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
def test_kmeans_sklearn_comparison(name):
    """Check that cuML KMeans agrees with scikit-learn KMeans on a dataset.

    Both estimators are fitted on 5000 samples of the ``name`` pattern using
    the same ``n_clusters``; the predicted labels are then compared.

    Parameters
    ----------
    name
        Dataset pattern identifier forwarded to ``get_pattern``.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }

    pat = get_pattern(name, 5000)

    # pat[1] carries per-dataset overrides that take precedence over defaults.
    params = default_base.copy()
    params.update(pat[1])

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)
    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    # Noisy circles clusters are rotated in the results,
    # since we are comparing 2 we just need to compare that both clusters
    # have approximately the same number of points.
    if name == 'noisy_circles':
        # NOTE(review): the difference is signed, so this also passes
        # whenever the cuML label sum exceeds scikit-learn's by any amount.
        # Confirm whether an abs() around the difference was intended.
        assert (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred) < 1e-10
    else:
        # The comparison result must be asserted; calling clusters_equal
        # without checking its return value verifies nothing.
        assert clusters_equal(sk_y_pred, cu_y_pred, params['n_clusters'])
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans labels and score with scikit-learn KMeans.

    cuML KMeans is always fitted on ``nrows`` samples of the ``name``
    pattern. Below 500000 rows, scikit-learn KMeans is fitted on the same
    data and the labels and ``score(X)`` values of both are compared.

    Parameters
    ----------
    name
        Dataset pattern identifier forwarded to ``get_pattern``.
    nrows
        Number of samples requested from ``get_pattern``.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }

    pat = get_pattern(name, nrows)

    # Per-dataset overrides in pat[1] win over the defaults above.
    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, _unused_labels = pat[0]
    X = StandardScaler().fit_transform(X)

    cu_y_pred = cuml_kmeans.fit_predict(X).to_array()

    # From 500000 rows upwards only the cuML fit above is exercised.
    if nrows >= 500000:
        return

    kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
    sk_y_pred = kmeans.fit_predict(X)

    # Noisy circles clusters are rotated in the results; with only two
    # clusters it is enough to check that both libraries put approximately
    # the same number of points in each cluster.
    # NOTE(review): both differences below are signed, so a large negative
    # difference also passes. Confirm whether abs() was intended.
    label_sum_gap = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)
    score_test = (cuml_kmeans.score(X) - kmeans.score(X)) < 2e-3

    if name == 'noisy_circles':
        assert (label_sum_gap < 4e-3) and score_test
        return

    # aniso: border points on the frontier between clusters tend to differ
    # from sklearn, hence the looser tolerance. The other datasets get a
    # small allowance so the test is robust to minor behavior changes
    # between library versions.
    tol = 2e-2 if name == 'aniso' else 1e-2

    labels_agree = clusters_equal(
        sk_y_pred, cu_y_pred, params['n_clusters'], tol=tol)
    assert labels_agree and score_test
def test_dbscan_sklearn_comparison(name, use_handle):
    """Check that cuML DBSCAN agrees with scikit-learn DBSCAN on a dataset.

    Both estimators are fitted on 1500 samples of the ``name`` pattern with
    the same ``eps`` and ``min_samples``; the number of clusters and the
    label assignment must match.

    Parameters
    ----------
    name
        Dataset pattern identifier forwarded to ``get_pattern``.
    use_handle
        Forwarded to ``get_handle``; the handle it returns is passed to the
        cuML estimator.
    """
    # Skipping datasets of known discrepancies in PR83 while they are corrected
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }

    pat = get_pattern(name, 1500)

    # pat[1] carries per-dataset overrides that take precedence over defaults.
    params = default_base.copy()
    params.update(pat[1])

    dbscan = skDBSCAN(eps=params['eps'], min_samples=5)

    # `stream` is not used directly but is kept bound for the duration of
    # the test (presumably the handle refers to it -- TODO confirm).
    handle, stream = get_handle(use_handle)
    cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, sk_n_clusters = fit_predict(dbscan, 'sk_DBSCAN', X)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan, 'cuml_DBSCAN', X)

    # Synchronize on the estimator's handle before inspecting its results.
    cuml_dbscan.handle.sync()

    assert sk_n_clusters == cu_n_clusters
    # The comparison result must be asserted; calling clusters_equal
    # without checking its return value verifies nothing.
    assert clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
def test_kmeans_sklearn_comparison(name, nrows):
    """Compare cuML KMeans labels and score with scikit-learn KMeans.

    cuML KMeans is always fitted on ``nrows`` samples of the ``name``
    pattern. Below 500000 rows, scikit-learn KMeans is fitted on the same
    data and the labels and ``score(X)`` values of both are compared.

    Parameters
    ----------
    name
        Dataset pattern identifier forwarded to ``get_pattern``.
    nrows
        Number of samples requested from ``get_pattern``.
    """
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3,
    }

    pat = get_pattern(name, nrows)

    # pat[1] carries per-dataset overrides that take precedence over defaults.
    params = default_base.copy()
    params.update(pat[1])

    cuml_kmeans = cuml.KMeans(n_clusters=params['n_clusters'])

    X, _ = pat[0]
    X = StandardScaler().fit_transform(X)

    cu_y_pred, _ = fit_predict(cuml_kmeans, 'cuml_Kmeans', X)

    # The scikit-learn reference is only run below 500000 rows; larger runs
    # only exercise the cuML fit above.
    if nrows < 500000:
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])
        sk_y_pred, _ = fit_predict(kmeans, 'sk_Kmeans', X)

        # Noisy circles clusters are rotated in the results,
        # since we are comparing 2 we just need to compare that both clusters
        # have approximately the same number of points.
        # NOTE(review): this difference is signed, so a large negative value
        # also passes. Confirm whether abs() was intended.
        calculation = (np.sum(sk_y_pred) - np.sum(cu_y_pred)) / len(sk_y_pred)

        # Score each model once and reuse the values for both the printed
        # diagnostic and the tolerance check.
        cu_score = cuml_kmeans.score(X)
        sk_score = kmeans.score(X)
        print(cu_score, sk_score)
        # NOTE(review): signed difference as well -- see the note above.
        score_test = (cu_score - sk_score) < 2e-3

        if name == 'noisy_circles':
            assert (calculation < 2e-3) and score_test
        else:
            assert (clusters_equal(sk_y_pred, cu_y_pred,
                                   params['n_clusters'])) and score_test