Example #1
def test_core_point_prop3():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a two-barred (orthodox) cross or
    # two stars sharing a link:
    #   .   .
    # . . . . .
    #   .   .
    # There are 2 core points, but they are not reachable from each other,
    # so there should be two clusters.
    # However, the link shared between the stars
    # actually has an ambiguous label (to the best of my knowledge),
    # as it depends on the order in which the core points are processed.
    # So we exclude that point from the comparison with sklearn.

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [3, 0], [4, 0], [4, 1],
                  [4, -1], [5, 0], [2, 0]],
                 dtype=np.float32)
    cudbscan = cuDBSCAN(**params)
    cu_y_pred = cudbscan.fit_predict(X)

    dbscan = skDBSCAN(**params)
    sk_y_pred = dbscan.fit_predict(X)

    score = adjusted_rand_score(sk_y_pred[:-1], cu_y_pred[:-1])
    assert (score == 1.0)
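The geometry described in the comment can be checked directly with NumPy. The following standalone snippet (not part of the test suite) counts, for each sample of the layout above, how many points fall within eps, mirroring how DBSCAN identifies core points (a point counts itself as a neighbor):

import numpy as np

X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [3, 0], [4, 0], [4, 1],
              [4, -1], [5, 0], [2, 0]], dtype=np.float32)
eps, min_samples = 1.1, 4

# Pairwise Euclidean distances and per-point neighbor counts within eps.
dists = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
neighbor_counts = (dists <= eps).sum(axis=1)
core_indices = np.flatnonzero(neighbor_counts >= min_samples)
print(core_indices)  # -> [1 5]: the two star centers, [1, 0] and [4, 0]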
Example #2
def test_core_point_prop2():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a long two-barred (orthodox) cross or
    # two stars next to each other:
    #   .     .
    # . . . . . .
    #   .     .
    # There are 2 core points, but they are not reachable from each other,
    # so there should be two clusters, both in the form of a plus/star.

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0],
                  [4, 1], [4, -1], [5, 0]],
                 dtype=np.float32)
    cuml_dbscan = cuDBSCAN(**params)
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(**params)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
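assert_dbscan_equal and array_equal are cuML test utilities and are not shown in this listing. As a rough illustration only: a comparison with that signature has to let border points differ between implementations, while core points and noise must agree. The sketch below is an assumption about what such a check could look like (it assumes NumPy inputs), not the actual helper:

import numpy as np

def assert_dbscan_equal_sketch(sk_labels, cu_labels, X, core_indices, eps):
    sk_labels = np.asarray(sk_labels)
    cu_labels = np.asarray(cu_labels)
    core_mask = np.zeros(len(X), dtype=bool)
    core_mask[np.asarray(core_indices, dtype=int)] = True

    mapping = {}  # sklearn cluster id -> cuML cluster id, built from core points
    for i in range(len(X)):
        sk, cu = sk_labels[i], cu_labels[i]
        # Noise must be noise in both labelings.
        assert (sk == -1) == (cu == -1)
        if sk == -1:
            continue
        if core_mask[i]:
            # Core points must map cluster ids consistently.
            assert mapping.setdefault(sk, cu) == cu
        else:
            # Border points may legitimately differ; they only need to lie
            # within eps of a core point carrying the label they were given.
            dists = np.linalg.norm(X[core_mask] - X[i], axis=1)
            assert np.any((dists <= eps) & (cu_labels[core_mask] == cu))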
Example #3
def test_dbscan_precomputed(datatype, nrows, max_mbytes_per_batch, out_dtype):
    # 2-dimensional dataset for easy distance matrix computation
    X, y = make_blobs(n_samples=nrows,
                      cluster_std=0.01,
                      n_features=2,
                      random_state=0)

    # Precompute distances
    X_dist = pairwise_distances(X).astype(datatype)

    eps = 1
    cuml_dbscan = cuDBSCAN(eps=eps,
                           min_samples=2,
                           metric='precomputed',
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X_dist, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps,
                         min_samples=2,
                         metric='precomputed',
                         algorithm="brute")
    sk_labels = sk_dbscan.fit_predict(X_dist)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
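With metric='precomputed', fit_predict consumes the full n x n distance matrix rather than the raw features, so eps is applied directly to the precomputed distances. A small self-contained sanity check of that equivalence on the scikit-learn side (using well-separated blobs, so both runs must yield the same partition):

from sklearn.cluster import DBSCAN as skDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score, pairwise_distances

X, _ = make_blobs(n_samples=500, n_features=2, cluster_std=0.01,
                  random_state=0)
X_dist = pairwise_distances(X)

labels_features = skDBSCAN(eps=1, min_samples=2).fit_predict(X)
labels_precomp = skDBSCAN(eps=1, min_samples=2,
                          metric='precomputed').fit_predict(X_dist)
assert adjusted_rand_score(labels_features, labels_precomp) == 1.0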
Example #4
def test_dbscan(datatype, input_type, use_handle,
                nrows, ncols, max_mbytes_per_batch, out_dtype):
    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples, cluster_std=0.01,
                      n_features=n_feats, random_state=0)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle, eps=1, min_samples=2,
                        max_mbytes_per_batch=max_mbytes_per_batch)

    if input_type == 'dataframe':
        X = pd.DataFrame(
            {'fea%d' % i: X[:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cu_labels = cudbscan.fit_predict(X_cudf, out_dtype=out_dtype)
    else:
        cu_labels = cudbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = skdbscan.fit_predict(X)
        score = adjusted_rand_score(cu_labels, sk_labels)
        assert score == 1

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
Example #5
def test_dbscan_sklearn_comparison(name, nrows):
    default_base = {'quantile': .3,
                    'eps': .5,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 2}
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'], min_samples=5)
    cu_y_pred, cu_n_clusters = fit_predict(cuml_dbscan,
                                           'cuml_DBSCAN', X)

    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred, sk_n_clusters = fit_predict(dbscan,
                                               'sk_DBSCAN', X)

        score = adjusted_rand_score(sk_y_pred, cu_y_pred)
        assert score == 1.0
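get_pattern and fit_predict are helpers from the cuML test utilities and are not shown here. The hypothetical sketches below indicate roughly what they are assumed to return, based on how the tests use them; they are not the real implementations:

import numpy as np
from sklearn import datasets

def get_pattern_sketch(name, n_samples):
    # Hypothetical stand-in for get_pattern: returns ((X, y), per-dataset
    # parameter overrides) for a named synthetic dataset.
    if name == 'noisy_circles':
        return (datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05),
                {'eps': .3})
    if name == 'noisy_moons':
        return (datasets.make_moons(n_samples=n_samples, noise=.05),
                {'eps': .3})
    if name == 'blobs':
        return (datasets.make_blobs(n_samples=n_samples, random_state=8), {})
    raise ValueError("unknown pattern: %s" % name)

def fit_predict_sketch(model, model_name, X):
    # Hypothetical stand-in for the fit_predict wrapper: returns the labels
    # and the number of clusters found, excluding noise (-1). Assumes the
    # labels come back as a host array (e.g. output_type='numpy');
    # model_name would only be used for reporting.
    labels = np.asarray(model.fit_predict(X))
    n_clusters = len(set(labels.tolist()) - {-1})
    return labels, n_clusters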
Example #6
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples,
                      cluster_std=0.01,
                      n_features=n_feats,
                      random_state=0)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle,
                        eps=1,
                        min_samples=2,
                        max_mbytes_per_batch=max_mbytes_per_batch,
                        output_type='numpy')

    cu_labels = cudbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        skdbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = skdbscan.fit_predict(X)
        score = adjusted_rand_score(cu_labels, sk_labels)
        assert score == 1

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
Example #7
def test_dbscan_predict(datatype, input_type, use_handle, max_bytes_per_batch):

    # max_bytes_per_batch sizes: 10=6 batches, 200=2 batches, 2e6=1 batch

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)
    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle,
                        eps=3,
                        min_samples=2,
                        max_bytes_per_batch=max_bytes_per_batch)

    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
        gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
        cu_labels = cudbscan.fit_predict(gdf)
    else:
        cu_labels = cudbscan.fit_predict(X)
    cudbscan.handle.sync()

    for i in range(X.shape[0]):
        assert cu_labels[i] == sk_labels[i]
Example #8
def test_core_point_prop3():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a two-barred (orthodox) cross or
    # two stars sharing a link:
    #   .   .
    # . . . . .
    #   .   .
    # There are 2 core points, but they are not reachable from each other,
    # so there should be two clusters.
    # However, the link shared between the stars
    # actually has an ambiguous label (to the best of my knowledge),
    # as it depends on the order in which the core points are processed.
    # So we exclude that point from the comparison with sklearn.

    # TODO: the above text does not correspond to the actual test!

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [3, 0], [4, 0], [4, 1],
                  [4, -1], [5, 0], [2, 0]],
                 dtype=np.float32)
    cuml_dbscan = cuDBSCAN(**params)
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(**params)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
Example #9
def test_core_point_prop1():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a Latin cross or a star with a chain:
    #   .
    # . . . . .
    #   .
    # There is 1 core point (the intersection of the bars),
    # and the two points at the very right are not reachable from it.
    # So there should be one cluster (the plus/star on the left)
    # and two noise points.

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0]],
                 dtype=np.float32)
    cuml_dbscan = cuDBSCAN(**params)
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(**params)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
Example #10
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    X, y = make_blobs(n_samples,
                      centers=1,
                      cluster_std=8.0,
                      center_box=(-100.0, 100.0),
                      random_state=8)
    X = X.astype(datatype)

    handle, stream = get_handle(use_handle)
    eps = 0.5
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
Example #11
def test_dbscan_sklearn_comparison(name, nrows, eps):
    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=5,
                           output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
        sk_y_pred = dbscan.fit_predict(X)
        score = adjusted_rand_score(sk_y_pred, cu_y_pred)
        assert (score == 1.0)

        # Check the core points are equal
        array_equal(cuml_dbscan.core_sample_indices_,
                    dbscan.core_sample_indices_)
Example #12
def test_dbscan_default(name):
    default_base = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = 500
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X)

    dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_y_pred = dbscan.fit_predict(X)

    score = adjusted_rand_score(sk_y_pred, cu_y_pred)
    assert (score == 1.0)
Example #13
def test_dbscan_no_calc_core_point_indices():

    params = {'eps': 1.1, 'min_samples': 4}
    n_samples = 1000
    pat = get_pattern("noisy_moons", n_samples)

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    # Set calc_core_sample_indices=False
    cuml_dbscan = cuDBSCAN(eps=params['eps'],
                           min_samples=5,
                           output_type='numpy',
                           calc_core_sample_indices=False)
    cu_y_pred = cuml_dbscan.fit_predict(X)

    dbscan = skDBSCAN(**params)
    sk_y_pred = dbscan.fit_predict(X)

    score = adjusted_rand_score(sk_y_pred[:-1], cu_y_pred[:-1])
    assert (score == 1.0)

    # Make sure core_sample_indices_ is None
    assert (cuml_dbscan.core_sample_indices_ is None)
Example #14
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples, cluster_std=0.01,
                      n_features=n_feats, random_state=0)

    handle, stream = get_handle(use_handle)

    eps = 1
    cuml_dbscan = cuDBSCAN(handle=handle, eps=eps, min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
Example #15
def test_dbscan_default(name):
    default_base = {
        'quantile': .3,
        'eps': .5,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = 500
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    sk_dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, params['eps'])
Example #16
def test_dbscan_predict_multiple_streams():
    datatype = np.float32
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)

    handle1, stream1 = get_handle(True)
    handle2, stream2 = get_handle(True)
    cudbscan1 = cuDBSCAN(handle=handle1, eps=3, min_samples=2)
    cudbscan2 = cuDBSCAN(handle=handle2, eps=3, min_samples=2)
    cu_labels1 = cudbscan1.fit_predict(gdf)
    cu_labels2 = cudbscan2.fit_predict(gdf)
    cudbscan1.handle.sync()
    cudbscan2.handle.sync()
    for i in range(X.shape[0]):
        assert cu_labels1[i] == sk_labels[i]
        assert cu_labels2[i] == sk_labels[i]
Example #17
def test_dbscan_predict(datatype):
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)

    print("Calling fit_predict")
    cudbscan = cuDBSCAN(eps=3, min_samples=2)
    cu_labels = cudbscan.fit_predict(gdf)
    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)
    print(X.shape[0])
    for i in range(X.shape[0]):
        assert cu_labels[i] == sk_labels[i]
Example #18
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples,
                      cluster_std=0.01,
                      n_features=n_feats,
                      random_state=0)

    handle, stream = get_handle(use_handle)

    eps = 1
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
Example #19
def test_dbscan_predict(datatype, input_type):

    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)
    skdbscan = skDBSCAN(eps=3, min_samples=2)
    sk_labels = skdbscan.fit_predict(X)

    cudbscan = cuDBSCAN(eps=3, min_samples=2)

    if input_type == 'dataframe':
        gdf = cudf.DataFrame()
        gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
        gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
        cu_labels = cudbscan.fit_predict(gdf)
    else:
        cu_labels = cudbscan.fit_predict(X)

    for i in range(X.shape[0]):
        assert cu_labels[i] == sk_labels[i]
Example #20
def test_dbscan_propagation(datatype, use_handle, out_dtype):
    X, y = make_blobs(5000,
                      centers=1,
                      cluster_std=8.0,
                      center_box=(-100.0, 100.0),
                      random_state=8)
    X = X.astype(datatype)

    handle, stream = get_handle(use_handle)
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=0.5,
                           min_samples=5,
                           output_type='numpy')
    cu_y_pred = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    dbscan = skDBSCAN(eps=0.5, min_samples=5)
    sk_y_pred = dbscan.fit_predict(X)

    score = adjusted_rand_score(sk_y_pred, cu_y_pred)
    assert (score == 1.0)
Example #21
def test_core_point_prop2():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a long two-barred (orthodox) cross or
    # two stars next to each other:
    #   .     .
    # . . . . . .
    #   .     .
    # There are 2 core points, but they are not reachable from each other,
    # so there should be two clusters, both in the form of a plus/star.

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0],
                  [4, 1], [4, -1], [5, 0]],
                 dtype=np.float32)
    cudbscan = cuDBSCAN(**params)
    cu_y_pred = cudbscan.fit_predict(X)

    dbscan = skDBSCAN(**params)
    sk_y_pred = dbscan.fit_predict(X)

    score = adjusted_rand_score(sk_y_pred, cu_y_pred)
    assert (score == 1.0)
Example #22
def test_core_point_prop1():
    params = {'eps': 1.1, 'min_samples': 4}

    # The input looks like a Latin cross or a star with a chain:
    #   .
    # . . . . .
    #   .
    # There is 1 core point (the intersection of the bars),
    # and the two points at the very right are not reachable from it.
    # So there should be one cluster (the plus/star on the left)
    # and two noise points.

    X = np.array([[0, 0], [1, 0], [1, 1], [1, -1], [2, 0], [3, 0], [4, 0]],
                 dtype=np.float32)
    cudbscan = cuDBSCAN(**params)
    cu_y_pred = cudbscan.fit_predict(X)

    dbscan = skDBSCAN(**params)
    sk_y_pred = dbscan.fit_predict(X)

    score = adjusted_rand_score(sk_y_pred, cu_y_pred)
    assert (score == 1.0)
Example #23
def test_dbscan_sklearn_comparison(name, nrows, eps):
    if nrows == 500000 and name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    default_base = {
        'quantile': .2,
        'eps': eps,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 2
    }
    n_samples = nrows
    pat = get_pattern(name, n_samples)
    params = default_base.copy()
    params.update(pat[1])
    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    cuml_dbscan = cuDBSCAN(eps=eps, min_samples=5, output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)
Example #24
def test_dbscan_sklearn_comparison(name, use_handle):
    # Skipping datasets with known discrepancies in PR83 while they are corrected
    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    pat = get_pattern(name, 1500)

    params = default_base.copy()
    params.update(pat[1])

    dbscan = skDBSCAN(eps=params['eps'], min_samples=5)
    handle, stream = get_handle(use_handle)
    cuml_dbscan = cuDBSCAN(handle=handle, eps=params['eps'], min_samples=5)

    X, y = pat[0]

    X = StandardScaler().fit_transform(X)

    clustering_algorithms = (('sk_DBSCAN', dbscan), ('cuml_DBSCAN',
                                                     cuml_dbscan))

    sk_y_pred, sk_n_clusters = fit_predict(clustering_algorithms[0][1],
                                           clustering_algorithms[0][0], X)

    cu_y_pred, cu_n_clusters = fit_predict(clustering_algorithms[1][1],
                                           clustering_algorithms[1][0], X)

    cuml_dbscan.handle.sync()

    assert (sk_n_clusters == cu_n_clusters)

    clusters_equal(sk_y_pred, cu_y_pred, sk_n_clusters)
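clusters_equal is another cuML test utility not shown in this listing. A hypothetical, stricter stand-in would assert that the two labelings induce the same partition of the samples up to a permutation of cluster ids (this is an assumption about its intent, not the real helper):

import numpy as np

def clusters_equal_sketch(sk_labels, cu_labels, n_clusters):
    sk_labels = np.asarray(sk_labels)
    cu_labels = np.asarray(cu_labels)
    mapping = {}  # sklearn cluster id (incl. -1 noise) -> cuML cluster id
    for sk, cu in zip(sk_labels.tolist(), cu_labels.tolist()):
        # Every point with a given sklearn id must carry one cuML id.
        assert mapping.setdefault(sk, cu) == cu
    # The mapping must be one-to-one: same partition, possibly relabeled.
    assert len(set(mapping.values())) == len(mapping)
    # n_clusters counts the non-noise clusters.
    assert len([k for k in mapping if k != -1]) == n_clusters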
Example #25
def test_dbscan(datatype, input_type, use_handle, nrows, ncols,
                max_bytes_per_batch):
    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples, n_features=n_feats, random_state=0)

    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle,
                        eps=0.5,
                        min_samples=2,
                        max_bytes_per_batch=max_bytes_per_batch)

    if input_type == 'dataframe':
        X = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cu_labels = cudbscan.fit_predict(X_cudf)
    else:
        cu_labels = cudbscan.fit_predict(X)

    if nrows < 500000:
        skdbscan = skDBSCAN(eps=0.5, min_samples=2)
        sk_labels = skdbscan.fit_predict(X)
        for i in range(X.shape[0]):
            assert cu_labels[i] == sk_labels[i]
Example #26
def test_dbscan_out_dtype_fails_invalid_input():
    X, _ = make_blobs(n_samples=100)

    cudbscan = cuDBSCAN()
    cudbscan.fit_predict(X, out_dtype="bad_input")
Example #27
def test_dbscan_on_empty_array():

    X = np.array([])
    cuml_dbscan = cuDBSCAN()

    assert_raises(ValueError, cuml_dbscan.fit, X)
Example #28
def test_dbscan_out_dtype_fails_invalid_input():
    X, _ = make_blobs(n_samples=500)

    cuml_dbscan = cuDBSCAN(output_type='numpy')
    cuml_dbscan.fit_predict(X, out_dtype="bad_input")
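Examples #26 and #28 rely on an expected-failure marker that is not visible in this listing. Assuming an invalid out_dtype raises ValueError, the same expectation can be written explicitly with pytest.raises (a sketch, not the original test):

import pytest
from cuml import DBSCAN as cuDBSCAN
from sklearn.datasets import make_blobs

def test_dbscan_out_dtype_fails_invalid_input_sketch():
    X, _ = make_blobs(n_samples=100, random_state=0)

    cudbscan = cuDBSCAN(output_type='numpy')
    # An unsupported label dtype should be rejected outright.
    with pytest.raises(ValueError):
        cudbscan.fit_predict(X, out_dtype="bad_input")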