def test_dbscan():
    """Check that cuML DBSCAN assigns three distant points to three clusters.

    The three rows of the frame are pairwise farther apart than ``eps=1.0``
    and ``min_samples=1`` makes every point a core point, so each row is
    expected to receive its own label: 0, 1 and 2.
    """
    import cudf
    from cuml import DBSCAN

    # Create and populate a GPU DataFrame (one point per row, three features).
    gdf_float = cudf.DataFrame()
    gdf_float['0'] = [1.0, 2.0, 5.0]
    gdf_float['1'] = [4.0, 2.0, 1.0]
    gdf_float['2'] = [4.0, 2.0, 1.0]

    # Setup and fit clusters.
    dbscan_float = DBSCAN(eps=1.0, min_samples=1)
    dbscan_float.fit(gdf_float)

    # Compare the label values and dtype directly instead of the printed
    # Series: the string form depends on cudf's display formatting, which is
    # unrelated to what this test is verifying.
    labels = dbscan_float.labels_
    actual_labels = labels.to_pandas().tolist()
    expected_labels = [0, 1, 2]

    assert actual_labels == expected_labels
    assert str(labels.dtype) == 'int32'
def cluster(gdf, eps, minSamples):
    """Run cuML DBSCAN on ``gdf`` and record summary metrics.

    Fits a DBSCAN model on the GPU, then logs two metrics to MLflow:
    ``labels`` (the maximum cluster label) and ``core_samples`` (the number
    of core sample indices). Nothing is returned.

    Args:
        gdf: Input data passed straight to ``DBSCAN.fit``.
        eps: Neighborhood radius forwarded to ``DBSCAN``.
        minSamples: Forwarded to ``DBSCAN`` as ``min_samples``.
    """
    logging.info("cuml.DBSCAN Clustering - eps=%0.3f, samples=%d", eps, minSamples)

    # GPU clustering; core sample indices are requested explicitly because
    # they are counted below.
    result = DBSCAN(
        eps=eps,
        min_samples=minSamples,
        verbose=False,
        calc_core_sample_indices=True,
        output_type='cudf',
    ).fit(gdf)

    max_label = result.labels_.max()
    core_sample_count = len(result.core_sample_indices_)

    mlflow.log_metric('labels', max_label)
    mlflow.log_metric('core_samples', core_sample_count)

    gpu_mem(0)

    # NOTE(review): `ndims` is not defined in this function; presumably a
    # module-level name set elsewhere in the file - confirm.
    logging.debug(
        "cuml.DBSCAN - dims: %d, max distance: %0.3f, min samples: %d, labels: %d, core_samples: %d",
        ndims, eps, minSamples, max_label, core_sample_count,
    )
type=float, default=10., help='Radius of neighborhood of a point') parser.add_argument('-m', '--min-samples', default=5, type=int, help='The minimum number of samples required in a ' 'neighborhood to consider a point a core point') params = bench.parse_args(parser) # Load generated data X, _, _, _ = bench.load_data(params) # Create our clustering object dbscan = DBSCAN(eps=params.eps, min_samples=params.min_samples) # Time fit time, _ = bench.measure_function_time(dbscan.fit, X, params=params) labels = dbscan.labels_ X_host = bench.convert_to_numpy(X) labels_host = bench.convert_to_numpy(labels) acc = davies_bouldin_score(X_host, labels_host) params.n_clusters = len(set(labels_host)) - (1 if -1 in labels_host else 0) bench.print_output(library='cuml', algorithm='dbscan', stages=['training'], params=params,