Example #1
0
def test_core_point_prop1():
    """Compare cuML and scikit-learn DBSCAN on a latin-cross layout.

    The points form a plus sign with a chain trailing to the right:
        .
      . . . . .
        .
    Only the intersection of the bars is a core point, and the two
    right-most points are not reachable from it. The expected outcome is
    therefore one cluster (the plus on the left) and two noise points.
    """
    dbscan_kwargs = {'eps': 1.1, 'min_samples': 4}

    points = [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0]]
    X = np.array(points, dtype=np.float32)

    gpu_model = cuDBSCAN(**dbscan_kwargs)
    gpu_labels = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(**dbscan_kwargs)
    cpu_labels = cpu_model.fit_predict(X)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_,
                        dbscan_kwargs['eps'])
Example #2
0
def test_dbscan_sklearn_comparison(name, nrows):
    """Check cuML DBSCAN against scikit-learn on a named test pattern.

    Parameters
    ----------
    name
        Pattern identifier forwarded to ``get_pattern``.
    nrows
        Number of samples requested from ``get_pattern``. The scikit-learn
        reference run is only performed when this is below 500000.
    """
    pattern = get_pattern(name, nrows)

    # Start from the default settings and let the pattern override them.
    params = {'quantile': .3,
              'eps': .5,
              'damping': .9,
              'preference': -200,
              'n_neighbors': 10,
              'n_clusters': 2}
    params.update(pattern[1])

    X, y = pattern[0]
    X = StandardScaler().fit_transform(X)

    gpu_model = cuDBSCAN(eps=params['eps'], min_samples=5)
    cu_y_pred, _ = fit_predict(gpu_model, 'cuml_DBSCAN', X)

    if nrows >= 500000:
        # No reference comparison for the largest inputs.
        return

    cpu_model = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_y_pred, _ = fit_predict(cpu_model, 'sk_DBSCAN', X)

    ari = adjusted_rand_score(sk_y_pred, cu_y_pred)
    assert ari == 1.0
Example #3
0
def test_dbscan_predict(datatype, input_type, use_handle, max_bytes_per_batch):
    """Check cuML labels against scikit-learn on a six-point dataset.

    Parameters
    ----------
    datatype
        dtype used for the input arrays.
    input_type
        ``'dataframe'`` feeds a cuDF DataFrame to cuML; any other value
        feeds the NumPy array.
    use_handle
        Forwarded to ``get_handle``.
    max_bytes_per_batch
        Forwarded to the cuML estimator.
    """
    # max_bytes_per_batch sizes: 10=6 batches, 200=2 batches, 2e6=1 batch

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    reference = skDBSCAN(eps=3, min_samples=2)
    sk_labels = reference.fit_predict(X)

    handle, stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle,
                         eps=3,
                         min_samples=2,
                         max_bytes_per_batch=max_bytes_per_batch)

    fit_input = X
    if input_type == 'dataframe':
        # Same six points, expressed column-wise as a cuDF DataFrame.
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
        frame['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
        fit_input = frame

    cu_labels = gpu_model.fit_predict(fit_input)
    gpu_model.handle.sync()

    n_points = X.shape[0]
    for idx in range(n_points):
        assert cu_labels[idx] == sk_labels[idx]
Example #4
0
def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype,
                            client):
    """Compare the Dask cuML DBSCAN with scikit-learn on precomputed distances.

    Parameters
    ----------
    datatype
        dtype of the precomputed distance matrix.
    nrows
        Number of blob samples to generate.
    max_mbytes_per_batch
        Forwarded to the cuML estimator.
    out_dtype
        Forwarded to ``fit_predict`` of the cuML estimator.
    client
        Not referenced in the body (presumably a Dask client fixture).
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 1

    # 2-dimensional dataset for easy distance matrix computation
    X, y = make_blobs(n_samples=nrows, cluster_std=0.01,
                      n_features=2, random_state=0)

    # Encode every 2-D point as a complex number; the modulus of the
    # pairwise differences then gives the distance matrix.
    as_complex = np.array([[complex(p[0], p[1]) for p in X]])
    X_dist = np.abs(as_complex - as_complex.T, dtype=datatype)

    gpu_model = cuDBSCAN(eps=eps, min_samples=2, metric='precomputed',
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(X_dist, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=2, metric='precomputed',
                         algorithm="brute")
    cpu_labels = cpu_model.fit_predict(X_dist)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_, eps)
Example #5
0
def test_dbscan(datatype, input_type, use_handle,
                nrows, ncols, max_mbytes_per_batch, out_dtype):
    """Check cuML DBSCAN on generated blobs, for array and DataFrame input.

    Parameters
    ----------
    datatype
        Not referenced in the body.
    input_type
        ``'dataframe'`` converts the data to a cuDF DataFrame before fitting.
    use_handle
        Forwarded to ``get_handle``.
    nrows, ncols
        Shape of the generated dataset.
    max_mbytes_per_batch
        Forwarded to the cuML estimator.
    out_dtype
        Requested dtype for the returned labels; verified at the end.
    """
    X, y = make_blobs(n_samples=nrows, cluster_std=0.01,
                      n_features=ncols, random_state=0)

    handle, stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=1, min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch)

    if input_type == 'dataframe':
        # X is rebound to the pandas frame, which is also what the
        # scikit-learn reference below receives in this mode.
        columns = {'fea%d' % i: X[:, i] for i in range(X.shape[1])}
        X = pd.DataFrame(columns)
        fit_input = cudf.DataFrame.from_pandas(X)
    else:
        fit_input = X
    cu_labels = gpu_model.fit_predict(fit_input, out_dtype=out_dtype)

    if nrows < 500000:
        reference = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = reference.fit_predict(X)
        assert adjusted_rand_score(cu_labels, sk_labels) == 1

    # The labels must come back in the requested integer width.
    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
Example #6
0
def test_core_point_prop2():
    """Compare cuML and scikit-learn DBSCAN on two separate stars.

    The points form a long two-barred (orthodox) cross, i.e. two stars
    next to each other:
        .     .
      . . . . . .
        .     .
    There are two core points that are not reachable from each other, so
    two clusters are expected, each shaped like a plus/star.
    """
    dbscan_kwargs = {'eps': 1.1, 'min_samples': 4}

    points = [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0],
              [3, 0], [4, 0], [4, 1], [4, -1], [5, 0]]
    X = np.array(points, dtype=np.float32)

    gpu_model = cuDBSCAN(**dbscan_kwargs)
    gpu_labels = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(**dbscan_kwargs)
    cpu_labels = cpu_model.fit_predict(X)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_,
                        dbscan_kwargs['eps'])
Example #7
0
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    """Compare cuML and scikit-learn DBSCAN on a single wide blob.

    Parameters
    ----------
    datatype
        dtype the generated samples are cast to.
    use_handle
        Forwarded to ``get_handle``.
    out_dtype
        Forwarded to ``fit_predict`` of the cuML estimator.
    n_samples
        Number of samples generated by ``make_blobs``.
    """
    eps = 0.5

    blob, _ = make_blobs(n_samples,
                         centers=1,
                         cluster_std=8.0,
                         center_box=(-100.0, 100.0),
                         random_state=8)
    X = blob.astype(datatype)

    handle, stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle,
                         eps=eps,
                         min_samples=5,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=5)
    cpu_labels = cpu_model.fit_predict(X)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_, eps)
Example #8
0
def test_core_point_prop3():
    """Compare cuML and scikit-learn DBSCAN on two stars sharing a link.

    The points form a two-barred (orthodox) cross:
        .   .
      . . . . .
        .   .
    There are two core points that are not reachable from each other, so
    two clusters are expected. The link shared by both stars has an
    ambiguous label (to the best of the original author's knowledge),
    because it depends on the order in which core points are processed.
    That point is stored last in the array and left out of the comparison.
    """
    dbscan_kwargs = {'eps': 1.1, 'min_samples': 4}

    points = [[0, 0], [1, 0], [1, 1], [1, -1], [3, 0],
              [4, 0], [4, 1], [4, -1], [5, 0], [2, 0]]
    X = np.array(points, dtype=np.float32)

    gpu_model = cuDBSCAN(**dbscan_kwargs)
    gpu_pred = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(**dbscan_kwargs)
    cpu_pred = cpu_model.fit_predict(X)

    # Drop the final (shared) point before scoring.
    ari = adjusted_rand_score(cpu_pred[:-1], gpu_pred[:-1])
    assert ari == 1.0
Example #9
0
def test_dbscan_default(name):
    """Check a default-constructed cuML DBSCAN against scikit-learn.

    The cuML estimator only receives ``output_type='numpy'``, while the
    scikit-learn reference is given ``eps=params['eps']`` and
    ``min_samples=5`` explicitly.

    Parameters
    ----------
    name
        Pattern identifier forwarded to ``get_pattern``.
    """
    pattern = get_pattern(name, 500)

    # Start from the default settings and let the pattern override them.
    params = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    params.update(pattern[1])

    X, y = pattern[0]
    X = StandardScaler().fit_transform(X)

    gpu_model = cuDBSCAN(output_type='numpy')
    gpu_labels = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(eps=params['eps'], min_samples=5)
    cpu_labels = cpu_model.fit_predict(X)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_, params['eps'])
Example #10
0
def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype):
    """Compare cuML and scikit-learn DBSCAN on a precomputed distance matrix.

    Parameters
    ----------
    datatype
        dtype the distance matrix is cast to.
    nrows
        Number of blob samples to generate.
    max_mbytes_per_batch
        Forwarded to the cuML estimator.
    out_dtype
        Forwarded to ``fit_predict`` of the cuML estimator.
    """
    eps = 1

    # 2-dimensional dataset for easy distance matrix computation
    X, y = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=2,
                      random_state=0)

    # Both estimators are fed the same pairwise distance matrix.
    distances = pairwise_distances(X)
    X_dist = distances.astype(datatype)

    gpu_model = cuDBSCAN(eps=eps,
                         min_samples=2,
                         metric='precomputed',
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(X_dist, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps,
                         min_samples=2,
                         metric='precomputed',
                         algorithm="brute")
    cpu_labels = cpu_model.fit_predict(X_dist)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_, eps)
Example #11
0
def test_dbscan(datatype, nrows, ncols, max_mbytes_per_batch, out_dtype,
                client):
    """Check the Dask cuML DBSCAN on generated blobs.

    Parameters
    ----------
    datatype
        Not referenced in the body.
    nrows, ncols
        Shape of the generated dataset. The scikit-learn comparison only
        runs when ``nrows`` is below 500000.
    max_mbytes_per_batch
        Forwarded to the cuML estimator.
    out_dtype
        Requested dtype for the returned labels; verified at the end.
    client
        Not referenced in the body (presumably a Dask client fixture).
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 1

    X, y = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=ncols,
                      random_state=0)

    gpu_model = cuDBSCAN(eps=eps,
                         min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    cu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        cpu_labels = cpu_model.fit_predict(X)

        # Both implementations must agree on which samples are core points.
        assert array_equal(gpu_model.core_sample_indices_,
                           cpu_model.core_sample_indices_)

        # Label agreement is checked by the shared comparison helper.
        assert_dbscan_equal(cpu_labels, cu_labels, X,
                            gpu_model.core_sample_indices_, eps)

    # The labels must come back in the requested integer width.
    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
Example #12
0
def test_dbscan_default(name):
    """Check a default-constructed cuML DBSCAN against scikit-learn via ARI.

    The cuML estimator only receives ``output_type='numpy'``, while the
    scikit-learn reference is given ``eps=params['eps']`` and
    ``min_samples=5`` explicitly.

    Parameters
    ----------
    name
        Pattern identifier forwarded to ``get_pattern``.
    """
    pattern = get_pattern(name, 500)

    # Start from the default settings and let the pattern override them.
    params = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    params.update(pattern[1])

    X, y = pattern[0]
    X = StandardScaler().fit_transform(X)

    gpu_model = cuDBSCAN(output_type='numpy')
    gpu_pred = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(eps=params['eps'], min_samples=5)
    cpu_pred = cpu_model.fit_predict(X)

    # An adjusted Rand index of exactly 1.0 means identical partitions.
    ari = adjusted_rand_score(cpu_pred, gpu_pred)
    assert ari == 1.0
Example #13
0
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Check cuML DBSCAN against scikit-learn on a named test pattern.

    Parameters
    ----------
    name
        Pattern identifier forwarded to ``get_pattern``.
    nrows
        Number of samples requested from ``get_pattern``. The scikit-learn
        reference run is only performed when this is below 500000.
    eps
        Default neighbourhood radius; the pattern's own settings may
        override it through ``params.update``.
    """
    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=5,
                           output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred = dbscan.fit_predict(X)
        score = adjusted_rand_score(sk_y_pred, cu_y_pred)
        assert (score == 1.0)

        # Check the core points are equal. The comparison result has to be
        # asserted; a bare call would silently discard a mismatch.
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           dbscan.core_sample_indices_)
Example #14
0
def test_dbscan_no_calc_core_point_indices():
    """Check cuML DBSCAN with ``calc_core_sample_indices=False``.

    The labels must still match scikit-learn, and the estimator's
    ``core_sample_indices_`` attribute must be ``None``.
    """
    params = {'eps': 1.1, 'min_samples': 4}
    n_samples = 1000
    pat = get_pattern("noisy_moons", n_samples)

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False.
    # Both estimators take eps and min_samples from the same dict, so the
    # comparison is between identically configured models.
    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=params['min_samples'],
                           output_type='numpy',
                           calc_core_sample_indices=False)
    cu_y_pred = cuml_dbscan.fit_predict(X)

    dbscan = skDBSCAN(**params)
    sk_y_pred = dbscan.fit_predict(X)

    # NOTE(review): the last sample is excluded from the comparison, as in
    # test_core_point_prop3; presumably not required here -- confirm.
    score = adjusted_rand_score(sk_y_pred[:-1], cu_y_pred[:-1])
    assert (score == 1.0)

    # Make sure we are None
    assert (cuml_dbscan.core_sample_indices_ is None)
Example #15
0
def test_core_point_prop3():
    """Compare cuML and scikit-learn DBSCAN on two stars sharing a link.

    The points form a two-barred (orthodox) cross:
        .   .
      . . . . .
        .   .
    There are two core points that are not reachable from each other, so
    two clusters are expected. The link shared by both stars has an
    ambiguous label (to the best of the original author's knowledge),
    because it depends on the order in which core points are processed.
    The original notes say that point is excluded from the comparison.
    """
    # TODO: the above text does not correspond to the actual test!

    dbscan_kwargs = {'eps': 1.1, 'min_samples': 4}

    points = [[0, 0], [1, 0], [1, 1], [1, -1], [3, 0],
              [4, 0], [4, 1], [4, -1], [5, 0], [2, 0]]
    X = np.array(points, dtype=np.float32)

    gpu_model = cuDBSCAN(**dbscan_kwargs)
    gpu_labels = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(**dbscan_kwargs)
    cpu_labels = cpu_model.fit_predict(X)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_,
                        dbscan_kwargs['eps'])
Example #16
0
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    """Check cuML DBSCAN on generated blobs using the adjusted Rand index.

    Parameters
    ----------
    datatype
        Not referenced in the body.
    use_handle
        Forwarded to ``get_handle``.
    nrows, ncols
        Shape of the generated dataset. The scikit-learn comparison only
        runs when ``nrows`` is below 500000.
    max_mbytes_per_batch
        Forwarded to the cuML estimator.
    out_dtype
        Requested dtype for the returned labels; verified at the end.
    """
    X, y = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=ncols,
                      random_state=0)

    handle, stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle,
                         eps=1,
                         min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    cu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        reference = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = reference.fit_predict(X)
        assert adjusted_rand_score(cu_labels, sk_labels) == 1

    # The labels must come back in the requested integer width.
    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
Example #17
0
 def __init__(self,
              eps=0.5,
              min_samples=5,
              measure='precomputed',
              n_jobs=1):
     """Store the settings and build the wrapped scikit-learn DBSCAN.

     ``measure`` is only recorded on the instance; the scikit-learn
     estimator is always created with ``metric='precomputed'``.
     """
     self.eps = eps
     self.min_samples = min_samples
     self.measure = measure
     self.n_jobs = n_jobs
     self.dbscan = skDBSCAN(
         eps=eps,
         min_samples=min_samples,
         metric='precomputed',
         n_jobs=n_jobs,
     )
Example #18
0
def test_dbscan_predict(datatype):
    """Check cuML labels (cuDF input) against scikit-learn (NumPy input).

    Parameters
    ----------
    datatype
        dtype used for both representations of the six sample points.
    """
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    # The same six points, expressed column-wise as a cuDF DataFrame.
    frame = cudf.DataFrame()
    frame['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    frame['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    print("Calling fit_predict")
    gpu_model = cuDBSCAN(eps=3, min_samples=2)
    cu_labels = gpu_model.fit_predict(frame)

    reference = skDBSCAN(eps=3, min_samples=2)
    sk_labels = reference.fit_predict(X)

    n_points = X.shape[0]
    print(n_points)
    for idx in range(n_points):
        assert cu_labels[idx] == sk_labels[idx]
Example #19
0
def test_dbscan_predict(datatype, input_type):
    """Check cuML labels against scikit-learn on a six-point dataset.

    Parameters
    ----------
    datatype
        dtype used for the input arrays.
    input_type
        ``'dataframe'`` feeds a cuDF DataFrame to cuML; any other value
        feeds the NumPy array.
    """
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    reference = skDBSCAN(eps=3, min_samples=2)
    sk_labels = reference.fit_predict(X)

    gpu_model = cuDBSCAN(eps=3, min_samples=2)

    fit_input = X
    if input_type == 'dataframe':
        # Same six points, expressed column-wise as a cuDF DataFrame.
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
        frame['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
        fit_input = frame

    cu_labels = gpu_model.fit_predict(fit_input)

    n_points = X.shape[0]
    for idx in range(n_points):
        assert cu_labels[idx] == sk_labels[idx]
Example #20
0
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    """Check cuML DBSCAN on generated blobs, with stress-test size adaption.

    When ``nrows`` is 500000 and ``pytest.max_gpu_memory`` is below 32 the
    test either scales ``nrows`` down (if ``pytest.adapt_stress_test`` is
    set) or is skipped.

    Parameters
    ----------
    datatype
        Not referenced in the body.
    use_handle
        Forwarded to ``get_handle``.
    nrows, ncols
        Shape of the generated dataset. The scikit-learn comparison only
        runs when the (possibly scaled) ``nrows`` is below 500000.
    max_mbytes_per_batch
        Forwarded to the cuML estimator.
    out_dtype
        Requested dtype for the returned labels; verified at the end.
    """
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if not pytest.adapt_stress_test:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
        # Shrink the row count proportionally to the available memory.
        nrows = nrows * pytest.max_gpu_memory // 32

    eps = 1

    X, y = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=ncols,
                      random_state=0)

    handle, stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle,
                         eps=eps,
                         min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    cu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        cpu_labels = cpu_model.fit_predict(X)

        # Both implementations must agree on which samples are core points.
        assert array_equal(gpu_model.core_sample_indices_,
                           cpu_model.core_sample_indices_)

        # Label agreement is checked by the shared comparison helper.
        assert_dbscan_equal(cpu_labels, cu_labels, X,
                            gpu_model.core_sample_indices_, eps)

    # The labels must come back in the requested integer width.
    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
Example #21
0
def test_dbscan_propagation(datatype, use_handle, out_dtype):
    """Compare cuML and scikit-learn DBSCAN on a single wide 5000-point blob.

    Parameters
    ----------
    datatype
        dtype the generated samples are cast to.
    use_handle
        Forwarded to ``get_handle``.
    out_dtype
        Forwarded to ``fit_predict`` of the cuML estimator.
    """
    blob, _ = make_blobs(5000,
                         centers=1,
                         cluster_std=8.0,
                         center_box=(-100.0, 100.0),
                         random_state=8)
    X = blob.astype(datatype)

    handle, stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle,
                         eps=0.5,
                         min_samples=5,
                         output_type='numpy')
    gpu_pred = gpu_model.fit_predict(X, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=0.5, min_samples=5)
    cpu_pred = cpu_model.fit_predict(X)

    # An adjusted Rand index of exactly 1.0 means identical partitions.
    ari = adjusted_rand_score(cpu_pred, gpu_pred)
    assert ari == 1.0
Example #22
0
    def fit(self, eps=0.9, min_samples=3):
        """Cluster ``self.X`` with scikit-learn DBSCAN using a scaled radius.

        ``eps`` is multiplied by the width of the first bin returned by
        ``np.histogram_bin_edges(self.x, bins='auto')`` before it is handed
        to DBSCAN.

        Parameters
        ----------
        eps : float
            Radius multiplier applied to the histogram bin width.
        min_samples : int
            Forwarded to DBSCAN as ``min_samples``.

        Returns
        -------
        dict
            ``"n_clusters"`` (number of distinct labels, not counting the
            noise label ``-1``), ``"n_noise"`` (number of samples labelled
            ``-1``) and ``"labels"`` (the fitted ``labels_`` array).
        """
        # Compute the bin width once and reuse it for scaling and printing.
        bin_width = np.diff(np.histogram_bin_edges(self.x, bins='auto'))[0]
        eps *= bin_width
        # eps *= median_absolute_deviation(self.x)
        # NOTE(review): the two prints look like diagnostic output;
        # presumably safe to drop -- confirm.
        print(bin_width)
        print(median_absolute_deviation(self.x))

        # Every argument is passed by keyword: scikit-learn declares the
        # DBSCAN parameters after ``eps`` as keyword-only, so a positional
        # ``min_samples`` raises TypeError on current releases.
        db = skDBSCAN(eps=eps,
                      min_samples=min_samples,
                      metric='euclidean',
                      metric_params=None,
                      algorithm='auto',
                      leaf_size=30,
                      p=None,
                      n_jobs=None).fit(self.X)

        labels = db.labels_
        # @Note: Number of clusters in labels, ignoring noise if present.
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise = list(labels).count(-1)
        return {"n_clusters": n_clusters, "n_noise": n_noise, "labels": labels}
Example #23
0
def test_core_point_prop1():
    """Compare cuML and scikit-learn DBSCAN on a latin-cross layout via ARI.

    The points form a plus sign with a chain trailing to the right:
        .
      . . . . .
        .
    Only the intersection of the bars is a core point, and the two
    right-most points are not reachable from it. The expected outcome is
    therefore one cluster (the plus on the left) and two noise points.
    """
    dbscan_kwargs = {'eps': 1.1, 'min_samples': 4}

    points = [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0]]
    X = np.array(points, dtype=np.float32)

    gpu_model = cuDBSCAN(**dbscan_kwargs)
    gpu_pred = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(**dbscan_kwargs)
    cpu_pred = cpu_model.fit_predict(X)

    # An adjusted Rand index of exactly 1.0 means identical partitions.
    ari = adjusted_rand_score(cpu_pred, gpu_pred)
    assert ari == 1.0
Example #24
0
def test_core_point_prop2():
    """Compare cuML and scikit-learn DBSCAN on two separate stars via ARI.

    The points form a long two-barred (orthodox) cross, i.e. two stars
    next to each other:
        .     .
      . . . . . .
        .     .
    There are two core points that are not reachable from each other, so
    two clusters are expected, each shaped like a plus/star.
    """
    dbscan_kwargs = {'eps': 1.1, 'min_samples': 4}

    points = [[0, 0], [1, 0], [1, 1], [1, -1], [2, 0],
              [3, 0], [4, 0], [4, 1], [4, -1], [5, 0]]
    X = np.array(points, dtype=np.float32)

    gpu_model = cuDBSCAN(**dbscan_kwargs)
    gpu_pred = gpu_model.fit_predict(X)

    cpu_model = skDBSCAN(**dbscan_kwargs)
    cpu_pred = cpu_model.fit_predict(X)

    # An adjusted Rand index of exactly 1.0 means identical partitions.
    ari = adjusted_rand_score(cpu_pred, gpu_pred)
    assert ari == 1.0
Example #25
0
def test_dbscan_propagation(datatype, out_dtype, client):
    """Compare the Dask cuML DBSCAN with scikit-learn on a single wide blob.

    Parameters
    ----------
    datatype
        dtype the generated samples are cast to.
    out_dtype
        Forwarded to ``fit_predict`` of the cuML estimator.
    client
        Not referenced in the body (presumably a Dask client fixture).
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    eps = 0.5

    blob, _ = make_blobs(5000, centers=1, cluster_std=8.0,
                         center_box=(-100.0, 100.0), random_state=8)
    X = blob.astype(datatype)

    gpu_model = cuDBSCAN(eps=eps, min_samples=5,
                         output_type='numpy')
    gpu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=5)
    cpu_labels = cpu_model.fit_predict(X)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is checked by the shared DBSCAN comparison helper.
    assert_dbscan_equal(cpu_labels, gpu_labels, X,
                        gpu_model.core_sample_indices_, eps)
Example #26
0
def test_dbscan_sklearn_comparison(name, nrows, eps, client):
    """Check the Dask cuML DBSCAN against scikit-learn on a named pattern.

    Parameters
    ----------
    name
        Pattern identifier forwarded to ``get_pattern``.
    nrows
        Number of samples requested from ``get_pattern``. The scikit-learn
        reference run is only performed when this is below 500000.
    eps
        Default neighbourhood radius; the pattern's own settings may
        override it through ``params.update``.
    client
        Not referenced in the body (presumably a Dask client fixture).
    """
    from cuml.dask.cluster.dbscan import DBSCAN as cuDBSCAN

    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }

    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # The radius both models are fit with. It can differ from the ``eps``
    # argument when the pattern overrides it, so the label check below
    # must use this value as well.
    fitted_eps = params['eps']

    cuml_dbscan = cuDBSCAN(eps=fitted_eps,
                           min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=fitted_eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct (performed once; the original ran
        # the identical check twice).
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, fitted_eps)
Example #27
0
def test_dbscan_sklearn_comparison(name, use_handle):
    """Check cuML DBSCAN against scikit-learn on a 1500-sample pattern.

    Parameters
    ----------
    name
        Pattern identifier forwarded to ``get_pattern``.
    use_handle
        Forwarded to ``get_handle``.
    """
    # Skipping datasets of known discrepancies in PR83 while they are corrected
    pattern = get_pattern(name, 1500)

    # Start from the default settings and let the pattern override them.
    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }
    params.update(pattern[1])

    cpu_model = skDBSCAN(eps=params['eps'], min_samples=5)
    handle, stream = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

    X, y = pattern[0]
    X = StandardScaler().fit_transform(X)

    sk_y_pred, sk_n_clusters = fit_predict(cpu_model, 'sk_DBSCAN', X)
    cu_y_pred, cu_n_clusters = fit_predict(gpu_model, 'cuml_DBSCAN', X)

    gpu_model.handle.sync()

    assert sk_n_clusters == cu_n_clusters

    # NOTE(review): the return value of clusters_equal is discarded; if the
    # helper returns a bool instead of asserting internally, this check is
    # a no-op -- confirm.
    clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
0
def test_dbscan_sklearn_comparison(name, nrows, eps):
    """Check cuML DBSCAN against scikit-learn on a named test pattern.

    When ``nrows`` is 500000, ``name`` is ``'blobs'`` and
    ``pytest.max_gpu_memory`` is below 32, the test either scales ``nrows``
    down (if ``pytest.adapt_stress_test`` is set) or is skipped.

    Parameters
    ----------
    name
        Pattern identifier forwarded to ``get_pattern``.
    nrows
        Number of samples requested from ``get_pattern``. The scikit-learn
        reference run is only performed when the (possibly scaled) value is
        below 500000.
    eps
        Neighbourhood radius given to both estimators.
    """
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            # The two literals are concatenated, so the first one needs a
            # trailing space to keep the sentences apart.
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    # Only the data part of the pattern is needed: both estimators below
    # are configured from ``eps`` directly.
    pat = get_pattern(name, nrows)
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)
Example #29
0
def test_dbscan_predict_multiple_streams():
    """Fit two cuML DBSCAN models on separate handles and check both.

    Each estimator gets its own handle from ``get_handle(True)``; both are
    fit on the same cuDF DataFrame and compared label-by-label with a
    scikit-learn reference fit on the equivalent NumPy array.
    """
    dtype = np.float32

    frame = cudf.DataFrame()
    frame['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=dtype)
    frame['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=dtype)

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=dtype)

    reference = skDBSCAN(eps=3, min_samples=2)
    sk_labels = reference.fit_predict(X)

    first_handle, first_stream = get_handle(True)
    second_handle, second_stream = get_handle(True)
    first_model = cuDBSCAN(handle=first_handle, eps=3, min_samples=2)
    second_model = cuDBSCAN(handle=second_handle, eps=3, min_samples=2)

    first_labels = first_model.fit_predict(frame)
    second_labels = second_model.fit_predict(frame)

    # Synchronise both handles before the labels are read.
    first_model.handle.sync()
    second_model.handle.sync()

    n_points = X.shape[0]
    for idx in range(n_points):
        assert first_labels[idx] == sk_labels[idx]
        assert second_labels[idx] == sk_labels[idx]
Example #30
0
def run_dbscan(X, eps, min_samples, model):
    """Fit a DBSCAN estimator from the selected backend and return it.

    Parameters
    ----------
    X
        Data handed to the estimator's ``fit``.
    eps, min_samples
        Forwarded to the estimator constructor.
    model
        ``'sklearn'`` or ``'cuml'``; also passed on to the timed fit helper.

    Returns
    -------
    The fitted estimator. Its ``labels_`` are printed before returning.

    Raises
    ------
    NotImplementedError
        If ``model`` is neither ``'sklearn'`` nor ``'cuml'``.
    """
    if model == 'sklearn':
        estimator = skDBSCAN(eps=eps, min_samples=min_samples)
    elif model == 'cuml':
        # Imported lazily, only when the cuML backend is requested.
        from cuML import DBSCAN as cumlDBSCAN
        estimator = cumlDBSCAN(eps=eps, min_samples=min_samples)
    else:
        raise NotImplementedError

    @timer
    def fit_(clustering, X, model):
        clustering.fit(X)
        return clustering

    fitted = fit_(estimator, X, model=model)
    print(fitted.labels_)
    return fitted