def test_default_n_neighbors(client):
    """Check the neighbor count returned with and without an explicit k.

    A model built without ``n_neighbors`` must return as many columns as
    the ``n_neighbors`` attribute of a default-constructed
    ``NearestNeighborsMG``; a model built and queried with ``k`` must
    return exactly ``k`` columns.

    Parameters
    ----------
    client
        Object handed to ``_scale_rows`` and ``_prep_training_data``
        (presumably a Dask client fixture -- confirm in conftest).
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from cuml.neighbors.nearest_neighbors_mg import \
        NearestNeighborsMG as cumlNN

    from sklearn.datasets import make_blobs

    explicit_k = 15
    n_features = 50
    n_rows = _scale_rows(client, 50)

    # Only the feature matrix is needed; the blob labels are discarded.
    blobs, _ = make_blobs(n_samples=n_rows,
                          n_features=n_features,
                          random_state=0)
    dist_data = _prep_training_data(client, blobs.astype(np.float32), 1)

    # Case 1: rely on the estimator's default number of neighbors.
    default_model = daskNN(streams_per_handle=5)
    default_model.fit(dist_data)
    default_result = default_model.kneighbors(dist_data,
                                              return_distance=False)
    assert default_result.shape[1] == cumlNN().n_neighbors

    # Case 2: request an explicit number of neighbors.
    explicit_model = daskNN(n_neighbors=explicit_k)
    explicit_model.fit(dist_data)
    explicit_result = explicit_model.kneighbors(dist_data, explicit_k,
                                                return_distance=False)
    assert explicit_result.shape[1] == explicit_k
def test_batch_size(nrows, ncols, n_parts, batch_size, client):
    """Check that a custom ``batch_size`` still recovers the blob labels.

    Seeded blob data is fitted and queried against itself; the labels
    produced by ``predict`` from the neighbor indices must equal the
    labels ``make_blobs`` generated.

    Parameters
    ----------
    nrows
        Requested row count, passed through ``_scale_rows`` before use.
    ncols
        Number of features per sample.
    n_parts
        Forwarded to ``_prep_training_data`` as its third argument.
    batch_size
        Forwarded to the estimator's ``batch_size`` argument.
    client
        Object handed to ``_scale_rows`` and ``_prep_training_data``
        (presumably a Dask client fixture -- confirm in conftest).
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN

    from sklearn.datasets import make_blobs

    k = 10
    nrows = _scale_rows(client, nrows)

    samples, labels = make_blobs(n_samples=int(nrows),
                                 n_features=ncols,
                                 centers=5,
                                 random_state=0)
    dist_data = _prep_training_data(client,
                                    samples.astype(np.float32),
                                    n_parts)

    model = daskNN(n_neighbors=k,
                   batch_size=batch_size,
                   streams_per_handle=5)
    model.fit(dist_data)

    # Only the indices are checked; the distances are ignored.
    _, dist_indices = model.kneighbors(dist_data)
    host_indices = dist_indices.compute().to_numpy()

    predicted, _ = predict(host_indices, labels, k)

    assert array_equal(predicted, labels)
def test_return_distance(client):
    """Check the return type of ``kneighbors`` for both ``return_distance`` values.

    With ``return_distance=False`` the result must not be a tuple and,
    once computed, must have shape ``(n_samples, k)``. With
    ``return_distance=True`` the result must be a 2-tuple.

    Parameters
    ----------
    client
        Object handed to ``_scale_rows`` and ``_prep_training_data``
        (presumably a Dask client fixture -- confirm in conftest).
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN

    from sklearn.datasets import make_blobs

    k = 5
    n_features = 50
    n_rows = _scale_rows(client, 50)

    blobs, _ = make_blobs(n_samples=n_rows,
                          n_features=n_features,
                          random_state=0)
    dist_data = _prep_training_data(client, blobs.astype(np.float32), 1)

    model = daskNN(streams_per_handle=5)
    model.fit(dist_data)

    # Indices only: a single (lazy) object of shape (n_rows, k).
    indices_only = model.kneighbors(dist_data, k, return_distance=False)
    assert not isinstance(indices_only, tuple)
    assert indices_only.compute().shape == (n_rows, k)

    # With distances: a pair.
    with_distances = model.kneighbors(dist_data, k, return_distance=True)
    assert isinstance(with_distances, tuple)
    assert len(with_distances) == 2
def test_one_query_partition(client):
    """Smoke-test querying with one partition against an 8-partition index.

    The test makes no assertions on the result; it only requires that
    ``fit`` and ``kneighbors`` complete without raising.

    Parameters
    ----------
    client
        Not referenced in the body; presumably requested so the Dask
        client fixture is active -- confirm in conftest.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from cuml.dask.datasets import make_blobs

    n_features = 16

    # Index data spread over eight partitions, queries in a single one.
    index_data, _ = make_blobs(n_samples=4000,
                               n_features=n_features,
                               n_parts=8)
    query_data, _ = make_blobs(n_samples=200,
                               n_features=n_features,
                               n_parts=1)

    model = daskNN(n_neighbors=4)
    model.fit(index_data)
    model.kneighbors(query_data)
Exemple #5
0
def test_default_n_neighbors(cluster):
    """Check the neighbor count returned with and without an explicit k.

    A model built without ``n_neighbors`` must return as many columns as
    the ``n_neighbors`` attribute of a default-constructed
    ``NearestNeighborsMG``; a model built and queried with ``k`` must
    return exactly ``k`` columns.

    Parameters
    ----------
    cluster
        Passed to ``Client(...)``; the client is closed when the test
        finishes, whether it passes or fails.
    """
    client = Client(cluster)

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN
        from cuml.neighbors.nearest_neighbors_mg import \
            NearestNeighborsMG as cumlNN

        from sklearn.datasets import make_blobs

        explicit_k = 15

        # Only the feature matrix is needed; the labels are discarded.
        blobs, _ = make_blobs(n_samples=50, n_features=50, random_state=0)
        dist_data = _prep_training_data(client,
                                        blobs.astype(np.float32),
                                        1)
        wait(dist_data)

        # Case 1: rely on the estimator's default number of neighbors.
        default_model = daskNN(verbose=False, streams_per_handle=5)
        default_model.fit(dist_data)
        default_result = default_model.kneighbors(dist_data,
                                                  return_distance=False)
        assert default_result.shape[1] == cumlNN().n_neighbors

        # Case 2: request an explicit number of neighbors.
        explicit_model = daskNN(verbose=False, n_neighbors=explicit_k)
        explicit_model.fit(dist_data)
        explicit_result = explicit_model.kneighbors(dist_data, explicit_k,
                                                    return_distance=False)
        assert explicit_result.shape[1] == explicit_k

    finally:
        client.close()
def test_compare_skl(nrows, ncols, nclusters, n_parts, n_neighbors,
                     streams_per_handle, reverse_worker_order, client):
    """Compare distributed kneighbors output with scikit-learn.

    The test asserts, in order, that:

    1. every entry of ``client.has_what()`` holds the same number of keys;
    2. the first neighbor of row ``i`` is ``i`` itself;
    3. the fraction of index entries where scikit-learn's index exceeds
       the distributed one is at most 3e-3;
    4. labels predicted from the distributed indices equal the
       scikit-learn classifier's predictions.

    Parameters
    ----------
    nrows
        Requested row count, passed through ``_scale_rows`` before use.
    ncols
        Number of features per sample.
    nclusters
        Number of blob centers.
    n_parts, reverse_worker_order
        Forwarded to ``_prep_training_data``.
    n_neighbors
        Neighbor count used by both estimators.
    streams_per_handle
        Forwarded to the distributed estimator.
    client
        Object used for ``_scale_rows``, ``_prep_training_data`` and
        ``has_what()`` (presumably a Dask client fixture -- confirm).
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN

    from sklearn.datasets import make_blobs

    nrows = _scale_rows(client, nrows)

    samples, labels = make_blobs(n_samples=int(nrows),
                                 n_features=ncols,
                                 centers=nclusters,
                                 random_state=0)
    samples = samples.astype(np.float32)

    dist_data = _prep_training_data(client, samples, n_parts,
                                    reverse_worker_order)

    from dask.distributed import wait

    wait(dist_data)

    # Every entry reported by has_what() must hold the same number of keys.
    keys_per_worker = np.array(
        [len(keys) for keys in client.has_what().values()])
    assert np.all(keys_per_worker == keys_per_worker[0])

    model = daskNN(n_neighbors=n_neighbors,
                   streams_per_handle=streams_per_handle)
    model.fit(dist_data)

    _, dist_indices = model.kneighbors(dist_data)
    local_i = np.array(dist_indices.compute().to_numpy(), dtype="int64")

    # scikit-learn reference: predictions and neighbor indices.
    skl_model = KNeighborsClassifier(n_neighbors=n_neighbors)
    skl_model = skl_model.fit(samples, labels)
    skl_y_hat = skl_model.predict(samples)
    y_hat, _ = predict(local_i, labels, n_neighbors)

    _, sk_i = skl_model.kneighbors(samples)
    sk_i = sk_i.astype("int64")

    assert array_equal(local_i[:, 0], np.arange(nrows))

    # NOTE(review): only positive differences are counted, so entries
    # where the scikit-learn index is smaller than the distributed one
    # are not treated as mismatches -- confirm this is intentional.
    n_diff = np.count_nonzero((sk_i - local_i) > 0)
    perc_diff = n_diff / (nrows * n_neighbors)

    assert perc_diff <= 3e-3

    assert array_equal(y_hat, skl_y_hat)
Exemple #7
0
def test_batch_size(nrows, ncols, n_parts,
                    batch_size, cluster):
    """Check that a custom ``batch_size`` still recovers the blob labels.

    Blob data is fitted and queried against itself; the labels produced
    by ``predict`` from the neighbor indices must equal the labels
    ``make_blobs`` generated.

    Parameters
    ----------
    nrows
        Requested row count, passed through ``_scale_rows`` before use.
    ncols
        Number of features per sample.
    n_parts
        Forwarded to ``_prep_training_data`` as its third argument.
    batch_size
        Forwarded to the estimator's ``batch_size`` argument.
    cluster
        Passed to ``Client(...)``; the client is closed when the test
        finishes, whether it passes or fails.
    """
    client = Client(cluster)

    n_neighbors = 10
    n_clusters = 5

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN

        from sklearn.datasets import make_blobs

        nrows = _scale_rows(client, nrows)

        # Seed the generator: the assertion below demands an exact label
        # match, so the data must be identical on every run.
        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=n_clusters,
                          random_state=0)

        X = X.astype(np.float32)

        X_cudf = _prep_training_data(client, X, n_parts)

        wait(X_cudf)

        cumlModel = daskNN(n_neighbors=n_neighbors,
                           batch_size=batch_size,
                           streams_per_handle=5)

        cumlModel.fit(X_cudf)

        # Only the indices are checked; the distances are ignored.
        _, out_i = cumlModel.kneighbors(X_cudf)

        local_i = np.array(out_i.compute().as_gpu_matrix())

        y_hat, _ = predict(local_i, y, n_neighbors)

        assert array_equal(y_hat, y)

    finally:
        client.close()
Exemple #8
0
def test_compare_skl(nrows, ncols, nclusters, n_parts, n_neighbors,
                     streams_per_handle, cluster):
    """Compare labels derived from distributed kneighbors with scikit-learn.

    Labels predicted from the distributed neighbor indices must equal
    the predictions of a scikit-learn ``KNeighborsClassifier`` fitted on
    the same data.

    Parameters
    ----------
    nrows
        Number of samples to generate.
    ncols
        Number of features per sample.
    nclusters
        Number of blob centers.
    n_parts
        Forwarded to ``_prep_training_data`` as its third argument.
    n_neighbors
        Neighbor count used by both estimators.
    streams_per_handle
        Forwarded to the distributed estimator.
    cluster
        Passed to ``Client(...)``; the client is closed when the test
        finishes, whether it passes or fails.
    """
    client = Client(cluster)

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN

        from sklearn.datasets import make_blobs

        # Seed the generator so the comparison runs on the same data
        # every time and any failure is reproducible.
        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          random_state=0)
        X = X.astype(np.float32)

        X_cudf = _prep_training_data(client, X, n_parts)

        wait(X_cudf)

        cumlModel = daskNN(verbose=False,
                           n_neighbors=n_neighbors,
                           streams_per_handle=streams_per_handle)
        cumlModel.fit(X_cudf)

        # Only the indices are checked; the distances are ignored.
        _, out_i = cumlModel.kneighbors(X_cudf)

        local_i = np.array(out_i.compute().as_gpu_matrix())

        sklModel = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
        skl_y_hat = sklModel.predict(X)

        y_hat, _ = predict(local_i, y, n_neighbors)

        assert array_equal(y_hat, skl_y_hat)

    finally:
        client.close()
Exemple #9
0
def test_return_distance(cluster):
    """Check the return type of ``kneighbors`` for both ``return_distance`` values.

    With ``return_distance=False`` the result must not be a tuple and,
    once computed, must have shape ``(n_samples, k)``. With
    ``return_distance=True`` the result must be a 2-tuple.

    Parameters
    ----------
    cluster
        Passed to ``Client(...)``; the client is closed when the test
        finishes, whether it passes or fails.
    """
    client = Client(cluster)

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN

        from sklearn.datasets import make_blobs

        k = 5
        n_rows = 50

        blobs, _ = make_blobs(n_samples=n_rows,
                              n_features=50,
                              random_state=0)
        dist_data = _prep_training_data(client,
                                        blobs.astype(np.float32),
                                        1)
        wait(dist_data)

        model = daskNN(verbose=False, streams_per_handle=5)
        model.fit(dist_data)

        # Indices only: a single (lazy) object of shape (n_rows, k).
        indices_only = model.kneighbors(dist_data, k,
                                        return_distance=False)
        assert not isinstance(indices_only, tuple)
        assert indices_only.compute().shape == (n_rows, k)

        # With distances: a pair.
        with_distances = model.kneighbors(dist_data, k,
                                          return_distance=True)
        assert isinstance(with_distances, tuple)
        assert len(with_distances) == 2

    finally:
        client.close()
Exemple #10
0
def test_011_exception():
    """Constructing the Dask ``NearestNeighbors`` must raise.

    The test passes only if the constructor raises
    ``NotImplementedError``.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN

    with pytest.raises(NotImplementedError):
        # The instance is never used; only the constructor call matters.
        daskNN()