def test_default_n_neighbors(client):
    """Check how many neighbor columns ``kneighbors`` returns.

    Two cases are exercised on the dask estimator:

    * built without ``n_neighbors``: the column count must equal the
      ``n_neighbors`` attribute of a default ``NearestNeighborsMG``;
    * built with ``n_neighbors=k`` and queried with ``k``: the column
      count must equal ``k``.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from cuml.neighbors.nearest_neighbors_mg import (
        NearestNeighborsMG as cumlNN,
    )
    from sklearn.datasets import make_blobs

    n_feats = 50
    k = 15
    # _scale_rows lives elsewhere in this file; presumably it adapts the row
    # count to the cluster behind `client` -- TODO confirm.
    n_samples = _scale_rows(client, 50)

    features, _ = make_blobs(
        n_samples=n_samples, n_features=n_feats, random_state=0
    )
    distributed = _prep_training_data(client, features.astype(np.float32), 1)

    # Case 1: rely on the estimator's default neighbor count.
    default_model = daskNN(streams_per_handle=5)
    default_model.fit(distributed)
    indices = default_model.kneighbors(distributed, return_distance=False)
    assert indices.shape[1] == cumlNN().n_neighbors

    # Case 2: explicit neighbor count at construction and at query time.
    explicit_model = daskNN(n_neighbors=k)
    explicit_model.fit(distributed)
    indices = explicit_model.kneighbors(distributed, k, return_distance=False)
    assert indices.shape[1] == k
def test_batch_size(nrows, ncols, n_parts, batch_size, client):
    """Fit/query the dask estimator with a given ``batch_size``.

    Blobs with five centers are generated, the model is fitted and queried
    on the same data, and the labels derived from the returned neighbor
    indices (via ``predict``) must equal the generated labels.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from sklearn.datasets import make_blobs

    n_neighbors = 10
    n_clusters = 5

    scaled_rows = _scale_rows(client, nrows)
    features, labels = make_blobs(
        n_samples=int(scaled_rows),
        n_features=ncols,
        centers=n_clusters,
        random_state=0,
    )
    distributed = _prep_training_data(
        client, features.astype(np.float32), n_parts
    )

    model = daskNN(
        n_neighbors=n_neighbors,
        batch_size=batch_size,
        streams_per_handle=5,
    )
    model.fit(distributed)

    # Only the index output is needed; distances are discarded.
    _, neighbor_idx = model.kneighbors(distributed)
    neighbor_idx = neighbor_idx.compute().to_numpy()

    predicted, _ = predict(neighbor_idx, labels, n_neighbors)
    assert array_equal(predicted, labels)
def test_return_distance(client):
    """Check the return type of ``kneighbors`` for both flag values.

    With ``return_distance=False`` the result must not be a tuple and, once
    computed, must have shape ``(n_samples, k)``. With
    ``return_distance=True`` the result must be a 2-tuple.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from sklearn.datasets import make_blobs

    n_feats = 50
    k = 5
    n_samples = _scale_rows(client, 50)

    features, _ = make_blobs(
        n_samples=n_samples, n_features=n_feats, random_state=0
    )
    distributed = _prep_training_data(client, features.astype(np.float32), 1)

    model = daskNN(streams_per_handle=5)
    model.fit(distributed)

    # Indices only: a single (lazy) collection, not a tuple.
    indices_only = model.kneighbors(distributed, k, return_distance=False)
    assert not isinstance(indices_only, tuple)
    computed = indices_only.compute()
    assert computed.shape == (n_samples, k)

    # Distances requested: a pair is returned.
    both = model.kneighbors(distributed, k, return_distance=True)
    assert isinstance(both, tuple)
    assert len(both) == 2
def test_one_query_partition(client):
    """Smoke test: query with a single-partition dataset.

    The index is built from 8 partitions and queried with data generated in
    one partition. There are no assertions; the test passes if neither
    ``fit`` nor ``kneighbors`` raises.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from cuml.dask.datasets import make_blobs

    n_features = 16
    train, _ = make_blobs(n_samples=4000, n_features=n_features, n_parts=8)
    query, _ = make_blobs(n_samples=200, n_features=n_features, n_parts=1)

    model = daskNN(n_neighbors=4)
    model.fit(train)
    model.kneighbors(query)
def test_default_n_neighbors(cluster):
    """Check how many neighbor columns ``kneighbors`` returns.

    A ``Client`` is opened on ``cluster`` and always closed on exit. A model
    built without ``n_neighbors`` must return as many columns as the
    ``n_neighbors`` attribute of a default ``NearestNeighborsMG``; a model
    built with ``n_neighbors=k`` and queried with ``k`` must return ``k``.
    """
    client = Client(cluster)

    n_samples = 50
    n_feats = 50
    k = 15

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN
        from cuml.neighbors.nearest_neighbors_mg import (
            NearestNeighborsMG as cumlNN,
        )
        from sklearn.datasets import make_blobs

        features, _ = make_blobs(
            n_samples=n_samples, n_features=n_feats, random_state=0
        )
        distributed = _prep_training_data(
            client, features.astype(np.float32), 1
        )
        wait(distributed)

        # Case 1: rely on the estimator's default neighbor count.
        default_model = daskNN(verbose=False, streams_per_handle=5)
        default_model.fit(distributed)
        indices = default_model.kneighbors(distributed, return_distance=False)
        assert indices.shape[1] == cumlNN().n_neighbors

        # Case 2: explicit neighbor count at construction and query time.
        explicit_model = daskNN(verbose=False, n_neighbors=k)
        explicit_model.fit(distributed)
        indices = explicit_model.kneighbors(
            distributed, k, return_distance=False
        )
        assert indices.shape[1] == k
    finally:
        client.close()
def test_compare_skl(nrows, ncols, nclusters, n_parts, n_neighbors,
                     streams_per_handle, reverse_worker_order, client):
    """Compare dask ``NearestNeighbors`` output with scikit-learn.

    The same blobs are fed to the dask estimator and to a scikit-learn
    ``KNeighborsClassifier``. The test asserts, in order, that:

    * every worker reported by ``client.has_what()`` holds the same number
      of keys;
    * the first returned neighbor of row ``i`` is ``i``;
    * the fraction of counted index mismatches is at most ``3e-3``;
    * labels derived from the dask indices (via ``predict``) equal the
      scikit-learn predictions.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN
    from sklearn.datasets import make_blobs

    nrows = _scale_rows(client, nrows)

    features, labels = make_blobs(
        n_samples=int(nrows),
        n_features=ncols,
        centers=nclusters,
        random_state=0,
    )
    features = features.astype(np.float32)

    distributed = _prep_training_data(
        client, features, n_parts, reverse_worker_order
    )

    from dask.distributed import wait

    wait(distributed)

    # Number of keys held per worker; all workers must hold the same amount.
    keys_per_worker = np.array(
        [len(held) for held in client.has_what().values()]
    )
    assert np.all(keys_per_worker == keys_per_worker[0])

    dask_model = daskNN(
        n_neighbors=n_neighbors, streams_per_handle=streams_per_handle
    )
    dask_model.fit(distributed)
    _, dask_idx = dask_model.kneighbors(distributed)
    dask_idx = np.array(dask_idx.compute().to_numpy(), dtype="int64")

    skl_model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(
        features, labels
    )
    skl_labels = skl_model.predict(features)
    dask_labels, _ = predict(dask_idx, labels, n_neighbors)

    _, skl_idx = skl_model.kneighbors(features)
    skl_idx = skl_idx.astype("int64")

    assert array_equal(dask_idx[:, 0], np.arange(nrows))

    # NOTE(review): only entries where the scikit-learn index is larger than
    # the dask index are counted; negative differences are ignored. Confirm
    # that this one-sided count is intended.
    mismatches = int(np.count_nonzero((skl_idx - dask_idx) > 0))
    mismatch_ratio = mismatches / (nrows * n_neighbors)
    assert mismatch_ratio <= 3e-3

    assert array_equal(dask_labels, skl_labels)
def test_batch_size(nrows, ncols, n_parts, batch_size, cluster):
    """Fit/query the dask estimator with a given ``batch_size``.

    A ``Client`` is opened on ``cluster`` and always closed on exit. Blobs
    with five centers are generated, the model is fitted and queried on the
    same data, and the labels derived from the returned neighbor indices
    (via ``predict``) must equal the generated labels.
    """
    client = Client(cluster)

    n_neighbors = 10
    n_clusters = 5

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN
        from sklearn.datasets import make_blobs

        nrows = _scale_rows(client, nrows)

        # Seed the generator so the exact-equality assertion below always
        # runs against the same data instead of a fresh random draw.
        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=n_clusters,
                          random_state=0)
        X = X.astype(np.float32)

        X_cudf = _prep_training_data(client, X, n_parts)
        wait(X_cudf)

        cumlModel = daskNN(n_neighbors=n_neighbors,
                           batch_size=batch_size,
                           streams_per_handle=5)
        cumlModel.fit(X_cudf)

        # Only the indices are used; distances are discarded.
        _, out_i = cumlModel.kneighbors(X_cudf)
        local_i = np.array(out_i.compute().as_gpu_matrix())

        y_hat, _ = predict(local_i, y, n_neighbors)
        assert array_equal(y_hat, y)
    finally:
        client.close()
def test_compare_skl(nrows, ncols, nclusters, n_parts,
                     n_neighbors, streams_per_handle, cluster):
    """Compare dask ``NearestNeighbors`` labels with scikit-learn.

    A ``Client`` is opened on ``cluster`` and always closed on exit. The
    same blobs are fed to the dask estimator and to a scikit-learn
    ``KNeighborsClassifier``; labels derived from the dask neighbor indices
    (via ``predict``) must equal the scikit-learn predictions.
    """
    client = Client(cluster)

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN
        from sklearn.datasets import make_blobs

        # Seed the generator so the comparison is reproducible across runs
        # rather than depending on a fresh random draw.
        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          random_state=0)
        X = X.astype(np.float32)

        X_cudf = _prep_training_data(client, X, n_parts)
        wait(X_cudf)

        cumlModel = daskNN(verbose=False,
                           n_neighbors=n_neighbors,
                           streams_per_handle=streams_per_handle)
        cumlModel.fit(X_cudf)

        # Only the indices are used; distances are discarded.
        _, out_i = cumlModel.kneighbors(X_cudf)
        local_i = np.array(out_i.compute().as_gpu_matrix())

        sklModel = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
        skl_y_hat = sklModel.predict(X)

        y_hat, _ = predict(local_i, y, n_neighbors)
        assert array_equal(y_hat, skl_y_hat)
    finally:
        client.close()
def test_return_distance(cluster):
    """Check the return type of ``kneighbors`` for both flag values.

    A ``Client`` is opened on ``cluster`` and always closed on exit. With
    ``return_distance=False`` the result must not be a tuple and, once
    computed, must have shape ``(n_samples, k)``. With
    ``return_distance=True`` the result must be a 2-tuple.
    """
    client = Client(cluster)

    n_samples = 50
    n_feats = 50
    k = 5

    try:
        from cuml.dask.neighbors import NearestNeighbors as daskNN
        from sklearn.datasets import make_blobs

        features, _ = make_blobs(
            n_samples=n_samples, n_features=n_feats, random_state=0
        )
        distributed = _prep_training_data(
            client, features.astype(np.float32), 1
        )
        wait(distributed)

        model = daskNN(verbose=False, streams_per_handle=5)
        model.fit(distributed)

        # Indices only: a single (lazy) collection, not a tuple.
        indices_only = model.kneighbors(
            distributed, k, return_distance=False
        )
        assert not isinstance(indices_only, tuple)
        computed = indices_only.compute()
        assert computed.shape == (n_samples, k)

        # Distances requested: a pair is returned.
        both = model.kneighbors(distributed, k, return_distance=True)
        assert isinstance(both, tuple)
        assert len(both) == 2
    finally:
        client.close()
def test_011_exception():
    """Constructing the dask ``NearestNeighbors`` must raise.

    The test passes only if the constructor call raises
    ``NotImplementedError``.
    """
    from cuml.dask.neighbors import NearestNeighbors as daskNN

    with pytest.raises(NotImplementedError):
        # The instance is never used, so it is not bound to a name.
        daskNN()