def test_predict_non_gaussian(n_samples, n_features, n_neighbors, n_query):
    np.random.seed(123)

    X_host_train = pd.DataFrame(
        np.random.uniform(0, 1, (n_samples, n_features)))
    y_host_train = pd.DataFrame(np.random.randint(0, 5, (n_samples, 1)))
    X_host_test = pd.DataFrame(np.random.uniform(0, 1, (n_query, n_features)))

    X_device_train = cudf.DataFrame.from_pandas(X_host_train)
    y_device_train = cudf.DataFrame.from_pandas(y_host_train)
    X_device_test = cudf.DataFrame.from_pandas(X_host_test)

    knn_sk = skKNN(algorithm="brute", n_neighbors=n_neighbors, n_jobs=1)
    # Ravel y to 1d to avoid sklearn's DataConversionWarning
    knn_sk.fit(X_host_train, y_host_train.values.ravel())
    sk_result = knn_sk.predict(X_host_test)

    knn_cuml = cuKNN(n_neighbors=n_neighbors)
    knn_cuml.fit(X_device_train, y_device_train)
    cuml_result = knn_cuml.predict(X_device_test)

    assert np.array_equal(np.asarray(cuml_result.to_gpu_array()), sk_result)


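# The tests in this file assume a shared import preamble that is not part
# of this excerpt. The sketch below lists the aliases they rely on; the
# exact import paths are assumptions (several of these moved between cuml
# and cudf releases), and skKNN is bound to KNeighborsClassifier rather
# than NearestNeighbors in the classifier tests. make_blobs here is the
# cuml variant returning CuPy arrays (hence the X.get() calls); the older
# tests that pass X straight to pandas/sklearn used
# sklearn.datasets.make_blobs instead.
import gc

import numpy as np
import pandas as pd
import scipy.sparse
from scipy.sparse import isspmatrix_csr

import cupy as cp
import cupyx.scipy.sparse
import cudf
import rmm

import sklearn.neighbors
from sklearn import datasets
from sklearn.manifold import trustworthiness
from sklearn.neighbors import NearestNeighbors as skKNN

import cuml
from cuml.common import logger
from cuml.datasets import make_blobs
from cuml.manifold import TSNE
from cuml.neighbors import NearestNeighbors as cuKNN

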
def test_knn_x_none(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(X.get())
    D_sk, I_sk = knn_sk.kneighbors(X=None, n_neighbors=k)

    X_orig = X

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, p=p, output_type="numpy")
    knn_cu.fit(X)
    D_cuml, I_cuml = knn_cu.kneighbors(X=None, n_neighbors=k)

    # Assert the cuml model was properly reverted
    cp.testing.assert_allclose(knn_cu.X_m, X_orig, atol=1e-5, rtol=1e-4)

    # Allow a max relative diff of 10% and absolute diff of 5e-2
    cp.testing.assert_allclose(D_cuml, D_sk, atol=5e-2, rtol=1e-1)

    assert I_cuml.all() == I_sk.all()


def test_predict_non_gaussian(n_samples, n_features, n_neighbors, n_query):
    np.random.seed(123)

    X_host_train = pd.DataFrame(
        np.random.uniform(0, 1, (n_samples, n_features)))
    y_host_train = pd.DataFrame(np.random.randint(0, 5, (n_samples, 1)))
    X_host_test = pd.DataFrame(np.random.uniform(0, 1, (n_query, n_features)))

    X_device_train = cudf.DataFrame.from_pandas(X_host_train)
    y_device_train = cudf.DataFrame.from_pandas(y_host_train)
    X_device_test = cudf.DataFrame.from_pandas(X_host_test)

    knn_sk = skKNN(algorithm="brute", n_neighbors=n_neighbors, n_jobs=1)
    knn_sk.fit(X_host_train, y_host_train.values.ravel())
    sk_result = knn_sk.predict(X_host_test)

    knn_cuml = cuKNN(n_neighbors=n_neighbors)
    knn_cuml.fit(X_device_train, y_device_train)

    with cuml.using_output_type("numpy"):
        cuml_result = knn_cuml.predict(X_device_test)
        assert np.array_equal(cuml_result, sk_result)


def test_cuml_against_sklearn(input_type, nrows, n_feats, k):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    knn_sk = skKNN(metric="euclidean")
    knn_sk.fit(X)
    D_sk, I_sk = knn_sk.kneighbors(X, k)

    if input_type == "dataframe":
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))

    knn_cu = cuKNN()
    knn_cu.fit(X)
    D_cuml, I_cuml = knn_cu.kneighbors(X, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_arr = D_cuml.as_gpu_matrix().copy_to_host()
        I_cuml_arr = I_cuml.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(D_cuml, np.ndarray)
        assert isinstance(I_cuml, np.ndarray)
        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    assert array_equal(D_cuml_arr, D_sk, 1e-2, with_sign=True)
    assert I_cuml_arr.all() == I_sk.all()


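# Several tests in this file compare results through an `array_equal`
# helper that is not defined in this excerpt. A minimal sketch of the
# assumed utility follows; the signature and default tolerances are
# assumptions, mirroring how the call sites above and below use it:
# arrays match when the fraction of elements differing by more than
# `unit_tol` stays at or below `total_tol`.
def array_equal(a, b, unit_tol=1e-4, total_tol=1e-4, with_sign=True):
    # cp.asnumpy is a no-op for host arrays and copies device arrays back
    a = cp.asnumpy(a).astype(np.float64)
    b = cp.asnumpy(b).astype(np.float64)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    mismatch = np.abs(a - b) > unit_tol
    return mismatch.mean() <= total_tol

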
def test_tsne_knn_graph_used(name, type_knn_graph):
    X = eval("datasets.load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")

    tsne = TSNE()

    # Perform tsne with normal knn_graph
    Y = tsne.fit_transform(X, True, knn_graph)

    trust_normal = trustworthiness(X, Y)
    print("Trust = ", trust_normal)

    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(X_garbage, mode="distance")

    # Perform tsne with garbage knn_graph
    Y = tsne.fit_transform(X, True, knn_graph_garbage)

    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocoo())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocsc())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15


def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    X_index = X[:100]
    X_search = X[101:]

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(X_index.get())
    D_sk, I_sk = knn_sk.kneighbors(X_search.get(), k)

    X_orig = X_index

    if input_type == "dataframe":
        X_index = cudf.DataFrame(X_index)
        X_search = cudf.DataFrame(X_search)

    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(X_index)
    D_cuml, I_cuml = knn_cu.kneighbors(X_search, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_np = D_cuml.to_numpy()
        I_cuml_np = I_cuml.to_numpy()
    else:
        assert isinstance(D_cuml, cp.ndarray)
        assert isinstance(I_cuml, cp.ndarray)
        D_cuml_np = D_cuml.get()
        I_cuml_np = I_cuml.get()

    with cuml.using_output_type("numpy"):
        # Assert the cuml model was properly reverted
        np.testing.assert_allclose(knn_cu.X_m, X_orig.get(),
                                   atol=1e-3, rtol=1e-3)

    if metric == 'braycurtis':
        diff = D_cuml_np - D_sk
        # Braycurtis has a few differences, but this is computed by FAISS.
        # So long as the indices all match below, the small discrepancy
        # should be okay.
        assert len(diff[diff > 1e-2]) / X_search.shape[0] < 0.06
    else:
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=1e-3, rtol=1e-3)

    assert I_cuml_np.all() == I_sk.all()


def test_nearest_neighbors_sparse(shape, metric, n_neighbors,
                                  batch_size_index, batch_size_query):
    nrows, ncols, density = shape
    if nrows == 1 and n_neighbors > 1:
        return

    a = cp.sparse.random(nrows, ncols, format='csr', density=density,
                         random_state=35)
    b = cp.sparse.random(nrows, ncols, format='csr', density=density,
                         random_state=38)

    if metric == 'jaccard':
        a = a.astype('bool').astype('float32')
        b = b.astype('bool').astype('float32')

    logger.set_level(logger.level_debug)
    nn = cuKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
               algorithm="brute", output_type="numpy",
               verbose=logger.level_debug,
               algo_params={"batch_size_index": batch_size_index,
                            "batch_size_query": batch_size_query})
    nn.fit(a)

    cuD, cuI = nn.kneighbors(b)

    if metric not in sklearn.neighbors.VALID_METRICS_SPARSE['brute']:
        a = a.todense()
        b = b.todense()

    sknn = skKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
                 algorithm="brute", n_jobs=-1)
    sk_X = a.get()
    sknn.fit(sk_X)

    skD, skI = sknn.kneighbors(b.get())

    cp.testing.assert_allclose(cuD, skD, atol=1e-3, rtol=1e-3)

    # Jaccard & Chebyshev have a high potential for mismatched indices
    # due to duplicate distances. We can ignore the indices in this case.
    if metric not in ['jaccard', 'chebyshev']:
        cp.testing.assert_allclose(cuI, skI, atol=1e-4, rtol=1e-4)


def test_tsne_knn_parameters_sparse(type_knn_graph, input_type):
    digits = datasets.load_digits()

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

    selected_digits = digits.data[~digits_selection]

    neigh.fit(selected_digits)
    knn_graph = neigh.kneighbors_graph(selected_digits, mode="distance")

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(2, n_neighbors=15, random_state=1,
                learning_rate=500, angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(selected_digits))

    Y = tsne.fit_transform(new_data, True, knn_graph)
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocoo())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocsc())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)
    del Y


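# `check_embedding` is not defined in this excerpt. A minimal sketch of
# the assumed helper follows (hypothetical name and default threshold,
# mirroring how the TSNE tests call it): score the embedding with
# trustworthiness and assert the score clears the threshold.
def check_embedding(X, Y, threshold=0.95):
    trust = trustworthiness(X, Y)
    assert trust > threshold, \
        "trustworthiness {:.3f} did not clear {:.2f}".format(trust,
                                                             threshold)

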
def test_ann_distances_metrics(algo, metric):
    X, y = make_blobs(n_samples=500, centers=2,
                      n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)
    del cu_knn
    gc.collect()

    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X, n_neighbors=10,
                                        return_distance=True)

    # Assert rather than return: pytest ignores return values, so a bare
    # `return array_equal(...)` would silently pass even on a mismatch.
    assert array_equal(sk_dist, cu_dist)


def test_knn_graph(input_type, mode, output_type, as_instance,
                   nrows, n_feats, p, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    if as_instance:
        sparse_sk = sklearn.neighbors.kneighbors_graph(X.get(), k, mode=mode,
                                                       metric=metric, p=p,
                                                       include_self='auto')
    else:
        knn_sk = skKNN(metric=metric, p=p)
        knn_sk.fit(X.get())
        sparse_sk = knn_sk.kneighbors_graph(X.get(), k, mode=mode)

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    with cuml.using_output_type(output_type):
        if as_instance:
            sparse_cu = cuml.neighbors.kneighbors_graph(X, k, mode=mode,
                                                        metric=metric, p=p,
                                                        include_self='auto')
        else:
            knn_cu = cuKNN(metric=metric, p=p)
            knn_cu.fit(X)
            sparse_cu = knn_cu.kneighbors_graph(X, k, mode=mode)

    assert np.array_equal(sparse_sk.data.shape, sparse_cu.data.shape)
    assert np.array_equal(sparse_sk.indices.shape, sparse_cu.indices.shape)
    assert np.array_equal(sparse_sk.indptr.shape, sparse_cu.indptr.shape)
    assert np.array_equal(sparse_sk.toarray().shape, sparse_cu.toarray().shape)

    if output_type == 'cupy' or output_type is None:
        assert cupyx.scipy.sparse.isspmatrix_csr(sparse_cu)
    else:
        assert isspmatrix_csr(sparse_cu)


def test_knn_search(input_type, should_downcast):
    dtype = np.float32 if not should_downcast else np.float64

    X = np.array([[1.0], [50.0], [51.0]], dtype=dtype)

    # For now, FAISS only seems to support single precision
    knn_sk = skKNN(X, metric="l2")
    D_sk, I_sk = knn_sk.query(X, len(X))

    knn_cu = cuKNN(should_downcast=should_downcast)
    if input_type == 'dataframe':
        X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
        knn_cu.fit(X)
        D_cuml, I_cuml = knn_cu.query(X, len(X))

        assert type(D_cuml) == cudf.DataFrame
        assert type(I_cuml) == cudf.DataFrame

        D_cuml_arr = np.asarray(D_cuml.as_gpu_matrix(order="C"))
        I_cuml_arr = np.asarray(I_cuml.as_gpu_matrix(order="C"))
    else:
        knn_cu.fit(X)
        D_cuml, I_cuml = knn_cu.query(X, len(X))

        assert type(D_cuml) == np.ndarray
        assert type(I_cuml) == np.ndarray

        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    print(str(D_cuml_arr))
    print(str(I_cuml_arr))

    # FAISS does not perform sqrt on L2 because it's expensive
    assert np.array_equal(D_cuml_arr, np.square(D_sk))
    assert np.array_equal(I_cuml_arr, I_sk)


def test_knn(input_type, should_downcast, nrows, n_feats, k):
    n_samples = nrows
    X, y = make_blobs(n_samples=n_samples, n_features=n_feats,
                      random_state=0)

    knn_cu = cuKNN(should_downcast=should_downcast)

    if input_type == 'dataframe':
        X_pd = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        knn_cu.fit(X_cudf)
        D_cuml, I_cuml = knn_cu.kneighbors(X_cudf, k)

        assert type(D_cuml) == cudf.DataFrame
        assert type(I_cuml) == cudf.DataFrame

        D_cuml_arr = np.asarray(D_cuml.as_gpu_matrix(order="C"))
        I_cuml_arr = np.asarray(I_cuml.as_gpu_matrix(order="C"))
    elif input_type == 'ndarray':
        knn_cu.fit(X)
        D_cuml, I_cuml = knn_cu.kneighbors(X, k)

        assert type(D_cuml) == np.ndarray
        assert type(I_cuml) == np.ndarray

        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    if nrows < 500000:
        knn_sk = skKNN(metric="l2")
        knn_sk.fit(X)
        D_sk, I_sk = knn_sk.kneighbors(X, k)

        # FAISS does not perform sqrt on L2 because it's expensive
        assert array_equal(D_cuml_arr, np.square(D_sk), 1e-2,
                           with_sign=True)
        assert I_cuml_arr.all() == I_sk.all()


def test_tsne_knn_parameters(name, type_knn_graph):
    X = eval("datasets.load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")

    for i in range(3):
        print("iteration = ", i)
        tsne = TSNE()

        Y = tsne.fit_transform(X, True, knn_graph)
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocoo())
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocsc())
        check_embedding(X, Y)

    del Y


def test_knn(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(X)
    D_sk, I_sk = knn_sk.kneighbors(X, k)

    X_orig = X

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(X)
    D_cuml, I_cuml = knn_cu.kneighbors(X, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_arr = D_cuml.as_gpu_matrix().copy_to_host()
        I_cuml_arr = I_cuml.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(D_cuml, np.ndarray)
        assert isinstance(I_cuml, np.ndarray)
        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    # Assert the cuml model was properly reverted
    np.testing.assert_allclose(knn_cu._X_m.to_output("numpy"), X_orig,
                               atol=1e-5, rtol=1e-4)

    # Allow a max relative diff of 10% and absolute diff of 1%
    np.testing.assert_allclose(D_cuml_arr, D_sk, atol=1e-2, rtol=1e-1)

    assert I_cuml_arr.all() == I_sk.all()


def test_nearest_neighbors_sparse(nrows, ncols, density, metric, n_neighbors,
                                  batch_size_index, batch_size_query):
    if nrows == 1 and n_neighbors > 1:
        return

    a = cp.sparse.random(nrows, ncols, format='csr', density=density,
                         random_state=32)

    logger.set_level(logger.level_info)
    nn = cuKNN(metric=metric, n_neighbors=n_neighbors, algorithm="brute",
               verbose=logger.level_debug,
               algo_params={"batch_size_index": batch_size_index,
                            "batch_size_query": batch_size_query})
    nn.fit(a)

    cuD, cuI = nn.kneighbors(a)

    sknn = skKNN(metric=metric, n_neighbors=n_neighbors, algorithm="brute",
                 n_jobs=-1)
    sk_X = a.get()
    sknn.fit(sk_X)
    skD, skI = sknn.kneighbors(sk_X)

    cp.testing.assert_allclose(cuI, skI, atol=1e-4, rtol=1e-4)
    cp.testing.assert_allclose(cuD, skD, atol=1e-3, rtol=1e-3)


def test_nearest_neighbors_sparse(metric, nrows, ncols, density, n_neighbors,
                                  batch_size_index, batch_size_query):
    if nrows == 1 and n_neighbors > 1:
        return

    a = cupyx.scipy.sparse.random(nrows, ncols, format='csr',
                                  density=density, random_state=35)
    b = cupyx.scipy.sparse.random(nrows, ncols, format='csr',
                                  density=density, random_state=38)

    if metric == 'jaccard':
        a = a.astype('bool').astype('float32')
        b = b.astype('bool').astype('float32')

    logger.set_level(logger.level_debug)
    nn = cuKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
               algorithm="brute", output_type="numpy",
               verbose=logger.level_debug,
               algo_params={"batch_size_index": batch_size_index,
                            "batch_size_query": batch_size_query})
    nn.fit(a)

    cuD, cuI = nn.kneighbors(b)

    if metric not in sklearn.neighbors.VALID_METRICS_SPARSE['brute']:
        a = a.todense()
        b = b.todense()

    sknn = skKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
                 algorithm="brute", n_jobs=-1)
    sk_X = a.get()
    sknn.fit(sk_X)

    skD, skI = sknn.kneighbors(b.get())

    # For some reason, this occasionally fails with a single mismatched
    # element in CI.
    cp.testing.assert_allclose(cuD, skD, atol=1e-5, rtol=1e-5)

    # Jaccard & Chebyshev have a high potential for mismatched indices
    # due to duplicate distances. We can ignore the indices in this case.
    if metric not in ['jaccard', 'chebyshev']:
        # The actual neighbors returned in the presence of duplicate
        # distances are non-deterministic. If we got to this point, the
        # distances all match between cuml and sklearn. We set a reasonable
        # threshold (.5% in this case) to allow differences from
        # non-determinism.
        diffs = abs(cuI - skI)
        assert (len(diffs[diffs > 0]) / len(np.ravel(skI))) <= 0.005
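

# The tests in this file receive their arguments from pytest
# parametrization; the decorator stacks are not part of this excerpt.
# The stub below is a minimal sketch of the mechanism only: the parameter
# names mirror the sparse test above, but the values are illustrative
# assumptions, not the repo's actual grid.
import pytest


@pytest.mark.parametrize("metric", ["euclidean", "cosine", "jaccard"])
@pytest.mark.parametrize("n_neighbors", [2, 8])
@pytest.mark.parametrize("batch_size_index,batch_size_query",
                         [(40, 40), (1000, 1000)])
def test_parametrization_sketch(metric, n_neighbors,
                                batch_size_index, batch_size_query):
    # Each combination of the values above runs as its own test case.
    assert n_neighbors > 0
    assert batch_size_index > 0 and batch_size_query > 0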