Code Example #1
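The snippets below are pytest cases collected from cuML's nearest-neighbors test suite; their module-level imports are not shown. A minimal sketch of what they assume (the aliases vary by file, so treat this as illustrative rather than the exact source):

# A minimal sketch of the module-level imports these snippets assume;
# skKNN in particular is bound to different sklearn classes per file
# (NearestNeighbors, KNeighborsClassifier, or a KDTree-style index),
# and helpers like array_equal / check_embedding live in the test suite.
import numpy as np
import pandas as pd
import cupy as cp
import cupyx
import scipy.sparse
import sklearn.neighbors
import cudf
import cuml
from cuml.common import logger
from cuml.datasets import make_blobs
from cuml.manifold import TSNE
from cuml.neighbors import NearestNeighbors as cuKNN
from sklearn import datasets
from sklearn.manifold import trustworthiness
from sklearn.neighbors import NearestNeighbors as skKNN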
def test_predict_non_gaussian(n_samples, n_features, n_neighbors, n_query):

    np.random.seed(123)

    X_host_train = pd.DataFrame(
        np.random.uniform(0, 1, (n_samples, n_features)))
    y_host_train = pd.DataFrame(np.random.randint(0, 5, (n_samples, 1)))
    X_host_test = pd.DataFrame(np.random.uniform(0, 1, (n_query, n_features)))

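    # Copy the host frames to the GPU as cuDF DataFrames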
    X_device_train = cudf.DataFrame.from_pandas(X_host_train)
    y_device_train = cudf.DataFrame.from_pandas(y_host_train)

    X_device_test = cudf.DataFrame.from_pandas(X_host_test)

    knn_sk = skKNN(algorithm="brute", n_neighbors=n_neighbors, n_jobs=1)
    # Ravel y to 1-D to avoid sklearn's column-vector DataConversionWarning
    knn_sk.fit(X_host_train, y_host_train.values.ravel())

    sk_result = knn_sk.predict(X_host_test)

    knn_cuml = cuKNN(n_neighbors=n_neighbors)
    knn_cuml.fit(X_device_train, y_device_train)

    cuml_result = knn_cuml.predict(X_device_test)

    assert np.array_equal(np.asarray(cuml_result.to_gpu_array()), sk_result)
Code Example #2
def test_knn_x_none(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows,
                      n_features=n_feats, random_state=0)

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(X.get())
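    # kneighbors(X=None) queries the fitted data itself, excluding each
    # point as its own neighbor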
    D_sk, I_sk = knn_sk.kneighbors(X=None, n_neighbors=k)

    X_orig = X

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, p=p, output_type="numpy")
    knn_cu.fit(X)
    D_cuml, I_cuml = knn_cu.kneighbors(X=None, n_neighbors=k)

    # Assert the cuml model was properly reverted
    cp.testing.assert_allclose(knn_cu.X_m, X_orig,
                               atol=1e-5, rtol=1e-4)

    # Allow a max relative diff of 10% and absolute diff of 5%
    cp.testing.assert_allclose(D_cuml, D_sk, atol=5e-2,
                               rtol=1e-1)
    # Compare the index arrays element-wise; `.all() == .all()` would
    # only compare two scalar booleans.
    assert np.array_equal(I_cuml, I_sk)
Code Example #3
def test_predict_non_gaussian(n_samples, n_features, n_neighbors, n_query):

    np.random.seed(123)

    X_host_train = pd.DataFrame(
        np.random.uniform(0, 1, (n_samples, n_features)))
    y_host_train = pd.DataFrame(np.random.randint(0, 5, (n_samples, 1)))
    X_host_test = pd.DataFrame(np.random.uniform(0, 1, (n_query, n_features)))

    X_device_train = cudf.DataFrame.from_pandas(X_host_train)
    y_device_train = cudf.DataFrame.from_pandas(y_host_train)

    X_device_test = cudf.DataFrame.from_pandas(X_host_test)

    knn_sk = skKNN(algorithm="brute", n_neighbors=n_neighbors, n_jobs=1)
    knn_sk.fit(X_host_train, y_host_train.values.ravel())

    sk_result = knn_sk.predict(X_host_test)

    knn_cuml = cuKNN(n_neighbors=n_neighbors)
    knn_cuml.fit(X_device_train, y_device_train)

    with cuml.using_output_type("numpy"):
        cuml_result = knn_cuml.predict(X_device_test)

        assert np.array_equal(cuml_result, sk_result)
Code Example #4
File: test_nearest_neighbors.py Project: teju85/cuml
def test_cuml_against_sklearn(input_type, nrows, n_feats, k):
    X, _ = make_blobs(n_samples=nrows,
                      n_features=n_feats, random_state=0)

    knn_sk = skKNN(metric="euclidean")
    knn_sk.fit(X)
    D_sk, I_sk = knn_sk.kneighbors(X, k)

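    # NOTE: from_gpu_matrix and rmm.to_device are legacy cuDF/RMM APIs
    # from older releases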
    if input_type == "dataframe":
        X = cudf.DataFrame.from_gpu_matrix(rmm.to_device(X))

    knn_cu = cuKNN()
    knn_cu.fit(X)
    D_cuml, I_cuml = knn_cu.kneighbors(X, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_arr = D_cuml.as_gpu_matrix().copy_to_host()
        I_cuml_arr = I_cuml.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(D_cuml, np.ndarray)
        assert isinstance(I_cuml, np.ndarray)
        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    assert array_equal(D_cuml_arr, D_sk, 1e-2, with_sign=True)
    assert np.array_equal(I_cuml_arr, I_sk)
Code Example #5
File: test_tsne.py Project: st071300/cuML
def test_tsne_knn_graph_used(name, type_knn_graph):

    # Look up the named sklearn dataset loader (e.g. load_digits) directly
    X = getattr(datasets, "load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")
    tsne = TSNE()

    # Perform tsne with normal knn_graph
    Y = tsne.fit_transform(X, True, knn_graph)
    trust_normal = trustworthiness(X, Y)
    print("Trust = ", trust_normal)

    X_garbage = np.ones(X.shape)
    knn_graph_garbage = neigh.kneighbors_graph(X_garbage, mode="distance")

    # Perform tsne with garbage knn_graph
    Y = tsne.fit_transform(X, True, knn_graph_garbage)
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocoo())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15

    Y = tsne.fit_transform(X, True, knn_graph_garbage.tocsc())
    trust_garbage = trustworthiness(X, Y)
    print("Trust = ", trust_garbage)
    assert (trust_normal - trust_garbage) > 0.15
Code Example #6
def test_knn_separate_index_search(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

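    # Build the index from the first 100 rows and query with a disjoint
    # slice (row 100 is skipped)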
    X_index = X[:100]
    X_search = X[101:]

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(X_index.get())
    D_sk, I_sk = knn_sk.kneighbors(X_search.get(), k)

    X_orig = X_index

    if input_type == "dataframe":
        X_index = cudf.DataFrame(X_index)
        X_search = cudf.DataFrame(X_search)

    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(X_index)
    D_cuml, I_cuml = knn_cu.kneighbors(X_search, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_np = D_cuml.to_numpy()
        I_cuml_np = I_cuml.to_numpy()
    else:
        assert isinstance(D_cuml, cp.ndarray)
        assert isinstance(I_cuml, cp.ndarray)
        D_cuml_np = D_cuml.get()
        I_cuml_np = I_cuml.get()

    with cuml.using_output_type("numpy"):
        # Assert the cuml model was properly reverted
        np.testing.assert_allclose(knn_cu.X_m,
                                   X_orig.get(),
                                   atol=1e-3,
                                   rtol=1e-3)

    if metric == 'braycurtis':
        diff = D_cuml_np - D_sk
        # Braycurtis has a few differences, but this is computed by FAISS.
        # So long as the indices all match below, the small discrepancy
        # should be okay.
        assert len(diff[diff > 1e-2]) / X_search.shape[0] < 0.06
    else:
        np.testing.assert_allclose(D_cuml_np, D_sk, atol=1e-3, rtol=1e-3)
    assert np.array_equal(I_cuml_np, I_sk)
Code Example #7
def test_nearest_neighbors_sparse(shape,
                                  metric,
                                  n_neighbors,
                                  batch_size_index,
                                  batch_size_query):

    nrows, ncols, density = shape

    if nrows == 1 and n_neighbors > 1:
        return

    a = cp.sparse.random(nrows, ncols, format='csr', density=density,
                         random_state=35)
    b = cp.sparse.random(nrows, ncols, format='csr', density=density,
                         random_state=38)

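    # Jaccard is defined on boolean data: binarize, then cast back to float32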
    if metric == 'jaccard':
        a = a.astype('bool').astype('float32')
        b = b.astype('bool').astype('float32')

    logger.set_level(logger.level_debug)
    nn = cuKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
               algorithm="brute", output_type="numpy",
               verbose=logger.level_debug,
               algo_params={"batch_size_index": batch_size_index,
                            "batch_size_query": batch_size_query})
    nn.fit(a)

    cuD, cuI = nn.kneighbors(b)

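    # sklearn's brute-force search does not support every sparse metric,
    # so densify the inputs for the reference run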
    if metric not in sklearn.neighbors.VALID_METRICS_SPARSE['brute']:
        a = a.todense()
        b = b.todense()

    sknn = skKNN(metric=metric, p=2.0, n_neighbors=n_neighbors,
                 algorithm="brute", n_jobs=-1)
    sk_X = a.get()
    sknn.fit(sk_X)

    skD, skI = sknn.kneighbors(b.get())

    cp.testing.assert_allclose(cuD, skD, atol=1e-3, rtol=1e-3)

    # Jaccard & Chebyshev have a high potential for mismatched indices
    # due to duplicate distances. We can ignore the indices in this case.
    if metric not in ['jaccard', 'chebyshev']:
        cp.testing.assert_allclose(cuI, skI, atol=1e-4, rtol=1e-4)
Code Example #8
File: test_tsne.py Project: st071300/cuML
def test_tsne_knn_parameters_sparse(type_knn_graph, input_type):

    digits = datasets.load_digits()

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    digits_selection = np.random.RandomState(42).choice(
        [True, False], 1797, replace=True, p=[0.60, 0.40])

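    # ~digits_selection keeps the ~40% of digits where the mask is False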
    selected_digits = digits.data[~digits_selection]

    neigh.fit(selected_digits)
    knn_graph = neigh.kneighbors_graph(selected_digits, mode="distance")

    if input_type == 'cupy':
        sp_prefix = cupyx.scipy.sparse
    else:
        sp_prefix = scipy.sparse

    tsne = TSNE(2, n_neighbors=15,
                random_state=1,
                learning_rate=500,
                angle=0.8)

    new_data = sp_prefix.csr_matrix(
        scipy.sparse.csr_matrix(selected_digits))

    Y = tsne.fit_transform(new_data, True, knn_graph)
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocoo())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)

    Y = tsne.fit_transform(new_data, True, knn_graph.tocsc())
    if input_type == 'cupy':
        Y = Y.get()
    check_embedding(selected_digits, Y, 0.85)
    del Y
Code Example #9
def test_ann_distances_metrics(algo, metric):
    X, y = make_blobs(n_samples=500, centers=2, n_features=128, random_state=0)

    cu_knn = cuKNN(algorithm=algo, metric=metric)
    cu_knn.fit(X)
    cu_dist, cu_ind = cu_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)
    del cu_knn
    gc.collect()

    X = X.get()
    sk_knn = skKNN(metric=metric)
    sk_knn.fit(X)
    sk_dist, sk_ind = sk_knn.kneighbors(X,
                                        n_neighbors=10,
                                        return_distance=True)

    # Assert rather than return: pytest ignores a test's return value
    assert array_equal(sk_dist, cu_dist)
Code Example #10
def test_knn_graph(input_type, mode, output_type, as_instance, nrows, n_feats,
                   p, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

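    # as_instance toggles between the module-level kneighbors_graph
    # helper and the fitted-estimator method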
    if as_instance:
        sparse_sk = sklearn.neighbors.kneighbors_graph(X.get(),
                                                       k,
                                                       mode=mode,
                                                       metric=metric,
                                                       p=p,
                                                       include_self='auto')
    else:
        knn_sk = skKNN(metric=metric, p=p)
        knn_sk.fit(X.get())
        sparse_sk = knn_sk.kneighbors_graph(X.get(), k, mode=mode)

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    with cuml.using_output_type(output_type):
        if as_instance:
            sparse_cu = cuml.neighbors.kneighbors_graph(X,
                                                        k,
                                                        mode=mode,
                                                        metric=metric,
                                                        p=p,
                                                        include_self='auto')
        else:
            knn_cu = cuKNN(metric=metric, p=p)
            knn_cu.fit(X)
            sparse_cu = knn_cu.kneighbors_graph(X, k, mode=mode)

    assert np.array_equal(sparse_sk.data.shape, sparse_cu.data.shape)
    assert np.array_equal(sparse_sk.indices.shape, sparse_cu.indices.shape)
    assert np.array_equal(sparse_sk.indptr.shape, sparse_cu.indptr.shape)
    assert np.array_equal(sparse_sk.toarray().shape, sparse_cu.toarray().shape)

    if output_type == 'cupy' or output_type is None:
        assert cupyx.scipy.sparse.isspmatrix_csr(sparse_cu)
    else:
        assert isspmatrix_csr(sparse_cu)
Code Example #11
def test_knn_search(input_type, should_downcast):

    dtype = np.float32 if not should_downcast else np.float64

    X = np.array(
        [[1.0], [50.0], [51.0]],
        dtype=dtype)  # For now, FAISS only seems to support single precision

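    # skKNN here behaves like a KDTree-style sklearn index: the data goes
    # into the constructor and query() returns (distances, indices)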
    knn_sk = skKNN(X, metric="l2")
    D_sk, I_sk = knn_sk.query(X, len(X))

    knn_cu = cuKNN(should_downcast=should_downcast)
    if input_type == 'dataframe':
        X = cudf.DataFrame.from_pandas(pd.DataFrame(X))
        knn_cu.fit(X)
        D_cuml, I_cuml = knn_cu.query(X, len(X))

        assert type(D_cuml) == cudf.DataFrame
        assert type(I_cuml) == cudf.DataFrame

        # FAISS does not perform sqrt on L2 because it's expensive

        D_cuml_arr = np.asarray(D_cuml.as_gpu_matrix(order="C"))
        I_cuml_arr = np.asarray(I_cuml.as_gpu_matrix(order="C"))

    else:
        knn_cu.fit(X)
        D_cuml, I_cuml = knn_cu.query(X, len(X))

        assert type(D_cuml) == np.ndarray
        assert type(I_cuml) == np.ndarray

        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    print(str(D_cuml_arr))
    print(str(I_cuml_arr))

    assert np.array_equal(D_cuml_arr, np.square(D_sk))
    assert np.array_equal(I_cuml_arr, I_sk)
Code Example #12
def test_knn(input_type, should_downcast, nrows, n_feats, k):
    n_samples = nrows
    X, y = make_blobs(n_samples=n_samples,
                      n_features=n_feats, random_state=0)

    knn_cu = cuKNN(should_downcast=should_downcast)

    if input_type == 'dataframe':
        X_pd = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        knn_cu.fit(X_cudf)
        D_cuml, I_cuml = knn_cu.kneighbors(X_cudf, k)

        assert type(D_cuml) == cudf.DataFrame
        assert type(I_cuml) == cudf.DataFrame

        # FAISS does not perform sqrt on L2 because it's expensive

        D_cuml_arr = np.asarray(D_cuml.as_gpu_matrix(order="C"))
        I_cuml_arr = np.asarray(I_cuml.as_gpu_matrix(order="C"))

    elif input_type == 'ndarray':

        knn_cu.fit(X)
        D_cuml, I_cuml = knn_cu.kneighbors(X, k)
        assert type(D_cuml) == np.ndarray
        assert type(I_cuml) == np.ndarray

        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    if nrows < 500000:
        knn_sk = skKNN(metric="l2")
        knn_sk.fit(X)
        D_sk, I_sk = knn_sk.kneighbors(X, k)

        assert array_equal(D_cuml_arr, np.square(D_sk), 1e-2, with_sign=True)
        assert np.array_equal(I_cuml_arr, I_sk)
Code Example #13
File: test_tsne.py Project: st071300/cuML
def test_tsne_knn_parameters(name, type_knn_graph):

    # Look up the named sklearn dataset loader (e.g. load_digits) directly
    X = getattr(datasets, "load_{}".format(name))().data

    neigh = skKNN(n_neighbors=90) if type_knn_graph == 'sklearn' \
        else cuKNN(n_neighbors=90)

    neigh.fit(X)
    knn_graph = neigh.kneighbors_graph(X, mode="distance")

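    # The same precomputed graph should be accepted in CSR, COO, and CSC formats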
    for i in range(3):
        print("iteration = ", i)
        tsne = TSNE()
        Y = tsne.fit_transform(X, True, knn_graph)
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocoo())
        check_embedding(X, Y)

        Y = tsne.fit_transform(X, True, knn_graph.tocsc())
        check_embedding(X, Y)
        del Y
Code Example #14
def test_knn(input_type, nrows, n_feats, k, metric):
    X, _ = make_blobs(n_samples=nrows, n_features=n_feats, random_state=0)

    p = 5  # Testing 5-norm of the minkowski metric only
    knn_sk = skKNN(metric=metric, p=p)
    knn_sk.fit(X)
    D_sk, I_sk = knn_sk.kneighbors(X, k)

    X_orig = X

    if input_type == "dataframe":
        X = cudf.DataFrame(X)

    knn_cu = cuKNN(metric=metric, p=p)
    knn_cu.fit(X)
    D_cuml, I_cuml = knn_cu.kneighbors(X, k)

    if input_type == "dataframe":
        assert isinstance(D_cuml, cudf.DataFrame)
        assert isinstance(I_cuml, cudf.DataFrame)
        D_cuml_arr = D_cuml.as_gpu_matrix().copy_to_host()
        I_cuml_arr = I_cuml.as_gpu_matrix().copy_to_host()
    else:
        assert isinstance(D_cuml, np.ndarray)
        assert isinstance(I_cuml, np.ndarray)
        D_cuml_arr = D_cuml
        I_cuml_arr = I_cuml

    # Assert the cuml model was properly reverted
    np.testing.assert_allclose(knn_cu._X_m.to_output("numpy"),
                               X_orig,
                               atol=1e-5,
                               rtol=1e-4)

    # Allow a max relative diff of 10% and absolute diff of 1%
    np.testing.assert_allclose(D_cuml_arr, D_sk, atol=1e-2, rtol=1e-1)
    assert np.array_equal(I_cuml_arr, I_sk)
Code Example #15
def test_nearest_neighbors_sparse(nrows, ncols, density, metric, n_neighbors,
                                  batch_size_index, batch_size_query):

    if nrows == 1 and n_neighbors > 1:
        return

    a = cp.sparse.random(nrows,
                         ncols,
                         format='csr',
                         density=density,
                         random_state=32)

    logger.set_level(logger.level_info)
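    # batch_size_index/batch_size_query tile the brute-force search to
    # bound GPU memory use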
    nn = cuKNN(metric=metric,
               n_neighbors=n_neighbors,
               algorithm="brute",
               verbose=logger.level_debug,
               algo_params={
                   "batch_size_index": batch_size_index,
                   "batch_size_query": batch_size_query
               })
    nn.fit(a)

    cuD, cuI = nn.kneighbors(a)

    sknn = skKNN(metric=metric,
                 n_neighbors=n_neighbors,
                 algorithm="brute",
                 n_jobs=-1)
    sk_X = a.get()
    sknn.fit(sk_X)

    skD, skI = sknn.kneighbors(sk_X)

    cp.testing.assert_allclose(cuI, skI, atol=1e-4, rtol=1e-4)
    cp.testing.assert_allclose(cuD, skD, atol=1e-3, rtol=1e-3)
Code Example #16
def test_nearest_neighbors_sparse(metric, nrows, ncols, density, n_neighbors,
                                  batch_size_index, batch_size_query):
    if nrows == 1 and n_neighbors > 1:
        return

    a = cupyx.scipy.sparse.random(nrows,
                                  ncols,
                                  format='csr',
                                  density=density,
                                  random_state=35)
    b = cupyx.scipy.sparse.random(nrows,
                                  ncols,
                                  format='csr',
                                  density=density,
                                  random_state=38)

    if metric == 'jaccard':
        a = a.astype('bool').astype('float32')
        b = b.astype('bool').astype('float32')

    logger.set_level(logger.level_debug)
    nn = cuKNN(metric=metric,
               p=2.0,
               n_neighbors=n_neighbors,
               algorithm="brute",
               output_type="numpy",
               verbose=logger.level_debug,
               algo_params={
                   "batch_size_index": batch_size_index,
                   "batch_size_query": batch_size_query
               })
    nn.fit(a)

    cuD, cuI = nn.kneighbors(b)

    if metric not in sklearn.neighbors.VALID_METRICS_SPARSE['brute']:
        a = a.todense()
        b = b.todense()

    sknn = skKNN(metric=metric,
                 p=2.0,
                 n_neighbors=n_neighbors,
                 algorithm="brute",
                 n_jobs=-1)
    sk_X = a.get()
    sknn.fit(sk_X)

    skD, skI = sknn.kneighbors(b.get())

    # For some reason, this will occasionally fail w/ a single
    # mismatched element in CI. Allowing the single mismatch for now.
    cp.testing.assert_allclose(cuD, skD, atol=1e-5, rtol=1e-5)

    # Jaccard & Chebyshev have a high potential for mismatched indices
    # due to duplicate distances. We can ignore the indices in this case.
    if metric not in ['jaccard', 'chebyshev']:

        # The actual neighbors returned in the presence of duplicate distances
        # is non-deterministic. If we got to this point, the distances all
        # match between cuml and sklearn. We set a reasonable threshold
        # (.5% in this case) to allow differences from non-determinism.
        diffs = abs(cuI - skI)
        assert (len(diffs[diffs > 0]) / len(np.ravel(skI))) <= 0.005