Esempio n. 1
0
def test_dbscan_predict(datatype, input_type, use_handle, max_bytes_per_batch):
    """Check cuML DBSCAN labels against scikit-learn on six fixed points.

    The points are handed to cuML either as a cudf DataFrame or as a NumPy
    array, depending on ``input_type``; the scikit-learn reference always
    sees the NumPy array.
    """
    # max_bytes_per_batch sizes: 10=6 batches, 200=2 batches, 2e6=1 batch
    points = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                      dtype=datatype)

    expected = skDBSCAN(eps=3, min_samples=2).fit_predict(points)

    handle, _ = get_handle(use_handle)
    model = cuDBSCAN(handle=handle, eps=3, min_samples=2,
                     max_bytes_per_batch=max_bytes_per_batch)

    if input_type != 'dataframe':
        cu_labels = model.fit_predict(points)
    else:
        # Same six points as ``points``, one column per feature.
        frame = cudf.DataFrame()
        frame['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
        frame['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
        cu_labels = model.fit_predict(frame)
    model.handle.sync()

    # Index-based comparison: the cuML result is only ever indexed here,
    # never iterated directly.
    for row in range(points.shape[0]):
        assert cu_labels[row] == expected[row]
Esempio n. 2
0
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    """Compare cuML and scikit-learn DBSCAN on a single wide blob.

    Both models use the same ``eps`` and ``min_samples``. Their core sample
    indices must match and the labels are validated with
    ``assert_dbscan_equal``.
    """
    eps = 0.5
    min_samples = 5

    data, _ = make_blobs(n_samples,
                         centers=1,
                         cluster_std=8.0,
                         center_box=(-100.0, 100.0),
                         random_state=8)
    data = data.astype(datatype)

    handle, _ = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=eps, min_samples=min_samples,
                         output_type='numpy')
    cu_labels = gpu_model.fit_predict(data, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=min_samples)
    sk_labels = cpu_model.fit_predict(data)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # Label agreement is delegated to the shared DBSCAN comparison helper.
    assert_dbscan_equal(sk_labels, cu_labels, data,
                        gpu_model.core_sample_indices_, eps)
Esempio n. 3
0
def test_tsvd_fit(datatype, name, use_handle):
    """Fit cuML TruncatedSVD and compare fitted attributes with scikit-learn.

    Datasets selected by ``name``:

    * ``'blobs'``  -- large ``make_blobs`` data; cuML is only fitted, no
      scikit-learn comparison is made.
    * ``'random'`` -- skipped.
    * anything else -- a small 500 x 5 Gaussian dataset compared against
      scikit-learn.

    NOTE(review): ``datatype`` is accepted but the data is never cast to
    it in this test -- confirm whether that is intentional.
    """
    if name == 'random':
        # The skipped dataset was integers drawn with
        # check_random_state(42).randint(-100, 20, ...) reshaped to
        # (5000, 100). The code building it sat after the skip and could
        # never run, so it has been removed.
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')

    compare_with_sklearn = name != 'blobs'

    if name == 'blobs':
        X, y = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if compare_with_sklearn:
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    cutsvd.fit(X)
    cutsvd.handle.sync()

    if compare_with_sklearn:
        for attr in ['singular_values_', 'components_',
                     'explained_variance_ratio_']:
            # 'components_' is the only attribute compared with
            # with_sign=False.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr), getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)
Esempio n. 4
0
def test_tsvd_fit_transform(datatype, name, use_handle):
    """Compare cuML and scikit-learn TruncatedSVD ``fit_transform`` output.

    Datasets selected by ``name``:

    * ``'blobs'``  -- large ``make_blobs`` data; cuML is only run, no
      scikit-learn comparison is made.
    * ``'random'`` -- skipped.
    * anything else -- a small 500 x 5 Gaussian dataset compared against
      scikit-learn.

    NOTE(review): ``datatype`` is accepted but the data is never cast to
    it in this test -- confirm whether that is intentional.
    """
    if name == 'random':
        # The skipped dataset was integers drawn with
        # check_random_state(42).randint(-100, 20, ...) reshaped to
        # (5000, 100). The code building it sat after the skip and could
        # never run, so it has been removed.
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')

    compare_with_sklearn = name != 'blobs'

    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if compare_with_sklearn:
        sktsvd = skTSVD(n_components=1)
        Xsktsvd = sktsvd.fit_transform(X)

    handle, _ = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if compare_with_sklearn:
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
Esempio n. 5
0
def test_dbscan(datatype, input_type, use_handle,
                nrows, ncols, max_mbytes_per_batch, out_dtype):
    """Run cuML DBSCAN on tight blobs and compare with scikit-learn.

    For fewer than 500000 rows the cuML labels must reach an adjusted Rand
    score of exactly 1 against scikit-learn's brute-force DBSCAN. The label
    dtype is checked against ``out_dtype`` for the int32/int64 cases.
    """
    X, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                      n_features=ncols, random_state=0)

    handle, _ = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=1, min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch)

    if input_type != 'dataframe':
        cu_labels = gpu_model.fit_predict(X, out_dtype=out_dtype)
    else:
        # X is rebound to the pandas frame, so the scikit-learn reference
        # below is fitted on the DataFrame in this branch.
        X = pd.DataFrame(
            {'fea%d' % col: X[:, col] for col in range(X.shape[1])})
        gpu_frame = cudf.DataFrame.from_pandas(X)
        cu_labels = gpu_model.fit_predict(gpu_frame, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = cpu_model.fit_predict(X)
        assert adjusted_rand_score(cu_labels, sk_labels) == 1

    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
Esempio n. 6
0
def test_accuracy(nrows, ncols, n_info, datatype):
    """Check that cuML's accuracy score agrees with scikit-learn's.

    A cuML random forest classifier is trained on the first 80% of a
    synthetic five-class dataset. Its predictions on the remaining rows
    are scored with both ``cu_acc_score`` and ``sk_acc_score`` and the two
    scores are compared.
    """
    split = np.int32(nrows * 0.8)
    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=123,
                               n_classes=5)

    # Hold-out rows come after the split point, training rows before it.
    X_test = np.asarray(X[split:, :]).astype(datatype)
    y_test = np.asarray(y[split:]).astype(np.int32)
    X_train = np.asarray(X[:split, :]).astype(datatype)
    y_train = np.asarray(y[:split]).astype(np.int32)

    # This test always runs with an explicit handle.
    handle, _ = get_handle(True)

    forest = curfc(max_features=1.0,
                   n_bins=8,
                   split_algo=0,
                   split_criterion=0,
                   min_rows_per_node=2,
                   n_estimators=40,
                   handle=handle,
                   max_leaves=-1,
                   max_depth=-1)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Score the same predictions with both accuracy implementations.
    cu_acc = cu_acc_score(y_test, predictions)
    cu_acc_using_sk = sk_acc_score(y_test, predictions)
    assert array_equal(cu_acc, cu_acc_using_sk)
Esempio n. 7
0
def test_pca_fit_transform(datatype, input_type,
                           name, use_handle):
    """Compare cuML and scikit-learn PCA ``fit_transform`` results.

    ``name`` selects the dataset: large blobs (cuML only, no comparison),
    the iris data, or a generated multilabel classification problem.
    """
    check_against_sklearn = name != 'blobs'

    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000,
                          n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X, _ = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if check_against_sklearn:
        reference = skPCA(n_components=2)
        Xskpca = reference.fit_transform(X)

    handle, _ = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    X_cupca = cupca.fit_transform(X)
    cupca.handle.sync()

    if check_against_sklearn:
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
        # The projected data must also have matching row and column counts.
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
Esempio n. 8
0
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    """Run cuML DBSCAN (NumPy output) on tight blobs and verify the labels.

    For fewer than 500000 rows the labels must reach an adjusted Rand score
    of exactly 1 against scikit-learn's brute-force DBSCAN. The label dtype
    is checked against ``out_dtype`` for the int32/int64 cases.
    """
    data, _ = make_blobs(n_samples=nrows,
                         cluster_std=0.01,
                         n_features=ncols,
                         random_state=0)

    handle, _ = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle,
                         eps=1,
                         min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    cu_labels = gpu_model.fit_predict(data, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = cpu_model.fit_predict(data)
        assert adjusted_rand_score(cu_labels, sk_labels) == 1

    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
Esempio n. 9
0
def test_dbscan(datatype, use_handle, nrows, ncols,
                max_mbytes_per_batch, out_dtype):
    """Run cuML DBSCAN on tight blobs and compare with scikit-learn.

    For fewer than 500000 rows the core sample indices must match
    scikit-learn's brute-force DBSCAN and the labels are validated with
    ``assert_dbscan_equal``. The label dtype is checked against
    ``out_dtype`` for the int32/int64 cases.
    """
    eps = 1
    data, _ = make_blobs(n_samples=nrows, cluster_std=0.01,
                         n_features=ncols, random_state=0)

    handle, _ = get_handle(use_handle)
    gpu_model = cuDBSCAN(handle=handle, eps=eps, min_samples=2,
                         max_mbytes_per_batch=max_mbytes_per_batch,
                         output_type='numpy')
    cu_labels = gpu_model.fit_predict(data, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = cpu_model.fit_predict(data)

        # Both implementations must agree on which samples are core points.
        assert array_equal(gpu_model.core_sample_indices_,
                           cpu_model.core_sample_indices_)

        # Label agreement is delegated to the shared comparison helper.
        assert_dbscan_equal(sk_labels, cu_labels, data,
                            gpu_model.core_sample_indices_, eps)

    if out_dtype in ("int32", np.int32):
        assert cu_labels.dtype == np.int32
    elif out_dtype in ("int64", np.int64):
        assert cu_labels.dtype == np.int64
Esempio n. 10
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Fit cuML PCA and compare its fitted attributes with scikit-learn.

    ``name`` selects the dataset: ``'blobs'`` is skipped, ``'digits'``
    loads the digits data, and anything else uses a generated multilabel
    classification problem.
    """
    if name == 'blobs':
        # The skipped dataset was make_blobs(n_samples=500000,
        # n_features=1000, random_state=0). The call sat after the skip
        # and could never run, so it has been removed.
        pytest.skip('fails when using blobs dataset')

    if name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, _ = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, _ = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    cupca.handle.sync()

    for attr in ['singular_values_', 'components_', 'explained_variance_',
                 'explained_variance_ratio_']:
        # 'components_' is the only attribute compared with with_sign=False.
        with_sign = attr != 'components_'

        # Fetch each attribute once and reuse it for both the diagnostic
        # output and the comparison.
        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)

        print(attr)
        print(cuml_res)
        print(skl_res)

        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
Esempio n. 11
0
def test_pca_inverse_transform(datatype, input_type, name, use_handle):
    """Check that cuML PCA ``inverse_transform`` recovers the input data.

    ``name`` selects the dataset: ``'blobs'`` is skipped, ``'iris'`` loads
    the iris data, and anything else uses a fixed six-point array.
    ``input_type`` selects whether cuML is fed a cudf DataFrame or the
    NumPy array.
    """
    if name == 'blobs':
        # The skipped dataset was make_blobs(n_samples=500000,
        # n_features=1000, random_state=0). The call sat after the skip
        # and could never run, so it has been removed.
        pytest.skip('fails when using blobs dataset')

    if name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    handle, _ = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        # Build the DataFrame copies only on the path that uses them.
        X_pd = pd.DataFrame(
            {'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X_pd)
        X_cupca = cupca.fit_transform(X_cudf)
    else:
        X_cupca = cupca.fit_transform(X)

    X_restored = cupca.inverse_transform(X_cupca)
    cupca.handle.sync()

    assert array_equal(X_restored, X, 1e-0, with_sign=True)
Esempio n. 12
0
def test_pca_fit_transform(datatype, input_type, name, use_handle):
    """Compare cuML and scikit-learn PCA ``fit_transform`` results.

    ``name`` selects the dataset: large blobs (cuML only, no comparison),
    the iris data, or a fixed six-point array. ``input_type`` selects
    whether cuML is fed a cudf DataFrame or the NumPy array.
    """
    check_against_sklearn = name != 'blobs'

    if name == 'blobs':
        X, _ = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'iris':
        X = datasets.load_iris().data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    if check_against_sklearn:
        reference = skPCA(n_components=2)
        Xskpca = reference.fit_transform(X)

    handle, _ = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type != 'dataframe':
        X_cupca = cupca.fit_transform(X)
    else:
        X = pd.DataFrame(
            {'fea%d' % col: X[:, col] for col in range(X.shape[1])})
        gpu_frame = cudf.DataFrame.from_pandas(X)
        X_cupca = cupca.fit_transform(gpu_frame)
    cupca.handle.sync()

    if check_against_sklearn:
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=True)
Esempio n. 13
0
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """Exercise GPU prediction of a cuML random forest regressor.

    Parameter combinations listed in the guard below must raise
    ``ValueError`` from ``predict``. For all other combinations the R2
    score of the forest's GPU prediction must equal that of the converted
    FIL model, the treelite conversion must report the right tree and
    feature counts, and (for fewer than 1000 rows) the score must be
    within 0.07 of scikit-learn's.
    """
    n_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # This test always runs with an explicit single-stream handle.
    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfr(n_bins=16, split_criterion=2,
                       min_rows_per_node=2, random_state=123, n_streams=1,
                       n_estimators=n_trees, handle=handle, max_leaves=-1,
                       max_depth=40, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    expect_value_error = (
        not fil_sparse_format
        or algo in ('tree_reorg', 'batch_tree_reorg')
        or fil_sparse_format == 'not_supported'
    )
    if expect_value_error:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, predict_model="GPU",
                               fil_sparse_format=fil_sparse_format,
                               algo=algo)
        return

    # Predict on the GPU through the forest itself.
    fil_preds = cuml_model.predict(X_test, predict_model="GPU",
                                   fil_sparse_format=fil_sparse_format,
                                   algo=algo)
    fil_preds = np.reshape(fil_preds, np.shape(y_test))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Predict again through the converted FIL model; scores must match.
    fil_model = cuml_model.convert_to_fil_model()
    fil_model_preds = fil_model.predict(X_test, output_type='numpy')
    fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
    fil_model_r2 = r2_score(y_test, fil_model_preds,
                            convert_dtype=datatype)
    assert fil_r2 == fil_model_r2

    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # scikit-learn reference, only for small inputs
    if X.shape[0] < 1000:  # mode != "stress":
        sk_model = skrfr(n_estimators=50, max_depth=40,
                         min_samples_split=2,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_r2 = r2_score(y_test, sk_model.predict(X_test),
                         convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
Esempio n. 14
0
def test_rf_classification(small_clf, datatype, split_algo,
                           max_samples, max_features,
                           use_experimental_backend):
    """Train a cuML random forest classifier and check output and accuracy.

    Stdout captured during ``fit`` must contain the expected
    experimental-backend messages, or be empty when that backend is not
    requested. GPU accuracy must be within 0.02 of the CPU prediction and,
    for fewer than 500000 rows, within 0.07 of scikit-learn.
    """
    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # This test always runs with an explicit single-stream handle.
    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfc(max_features=max_features, max_samples=max_samples,
                       n_bins=16, split_algo=split_algo, split_criterion=0,
                       min_samples_leaf=2, random_state=123, n_streams=1,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=16,
                       use_experimental_backend=use_experimental_backend)

    # Capture whatever fit() writes to Python-level stdout.
    buffer = io.StringIO()
    with redirect_stdout(buffer):
        cuml_model.fit(X_train, y_train)
    captured_stdout = buffer.getvalue()

    if not use_experimental_backend:
        assert captured_stdout == ''
    elif split_algo != 1:
        # Any split_algo other than 1 is expected to trigger the fallback.
        assert ('Experimental backend does not yet support histogram '
                'split algorithm') in captured_stdout
        assert ('Not using the experimental backend due to above '
                'mentioned reason(s)') in captured_stdout
    else:
        assert ('Using experimental backend for growing trees'
                in captured_stdout)

    fil_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   output_class=True,
                                   threshold=0.5,
                                   algo='auto')
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    if X.shape[0] < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.02)
Esempio n. 15
0
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    """Shared body for random forest classification tests.

    ``datatype`` is indexed as a pair: element 0 is the dtype used for the
    whole feature matrix, element 1 the dtype the test split is cast to
    afterwards. ``array_type`` selects cudf or array inputs for cuML.
    GPU predictions must match the argmax of the GPU probabilities, CPU and
    GPU accuracies must agree approximately, and for fewer than 500000
    labels both are compared with scikit-learn.
    """
    train_dtype, test_dtype = datatype[0], datatype[1]

    X, y = fixture
    X = X.astype(train_dtype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)
    X_test = X_test.astype(test_dtype)

    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfc(max_features=max_features, max_samples=max_samples,
                       n_bins=16, split_criterion=0,
                       min_samples_leaf=2, random_state=123,
                       n_estimators=40, handle=handle, max_leaves=-1,
                       max_depth=16)

    if array_type != 'dataframe':
        cuml_model.fit(X_train, y_train)
        cu_proba_gpu = cuml_model.predict_proba(X_test)
        cu_preds_cpu = cuml_model.predict(X_test, predict_model="CPU")
        cu_preds_gpu = cuml_model.predict(X_test, predict_model="GPU")
    else:
        train_frame = cudf.DataFrame(X_train)
        train_labels = cudf.Series(y_train)
        test_frame = cudf.DataFrame(X_test)
        cuml_model.fit(train_frame, train_labels)
        # Convert the cudf results to host arrays for the comparisons below.
        proba_frame = cuml_model.predict_proba(test_frame)
        cu_proba_gpu = np.array(proba_frame.as_gpu_matrix())
        cu_preds_cpu = cuml_model.predict(test_frame,
                                          predict_model="CPU").to_array()
        cu_preds_gpu = cuml_model.predict(test_frame,
                                          predict_model="GPU").to_array()

    # The predicted class must be the most probable one.
    np.testing.assert_array_equal(cu_preds_gpu,
                                  np.argmax(cu_proba_gpu, axis=1))

    cu_acc_cpu = accuracy_score(y_test, cu_preds_cpu)
    cu_acc_gpu = accuracy_score(y_test, cu_preds_gpu)
    assert cu_acc_cpu == pytest.approx(cu_acc_gpu, abs=0.01, rel=0.1)

    # scikit-learn reference, only for inputs below the size limit
    if y.size < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        sk_proba = sk_model.predict_proba(X_test)
        assert cu_acc_cpu >= sk_acc - 0.07
        assert cu_acc_gpu >= sk_acc - 0.07
        # 0.06 is the highest relative error observed on CI, within
        # 0.0061 absolute error boundaries seen previously
        check_predict_proba(cu_proba_gpu, sk_proba, y_test, 0.1)
Esempio n. 16
0
def test_rf_classification_proba(datatype, split_algo, rows_sample, nrows,
                                 column_info, max_features):
    """Compare ``predict_proba`` of cuML's random forest with scikit-learn.

    The mean squared error between the predicted probabilities and the
    one-hot encoded binary labels is computed for both libraries. For
    fewer than 500000 rows cuML's error may exceed scikit-learn's by at
    most 0.0061.
    """
    ncols, n_info = column_info

    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=123,
                               n_classes=2)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # This test always runs with an explicit single-stream handle.
    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfc(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=0,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=40,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=16)
    cuml_model.fit(X_train, y_train)
    fil_preds_proba = cuml_model.predict_proba(X_test,
                                               output_class=True,
                                               threshold=0.5,
                                               algo='auto')

    # One-hot encode the binary labels: column 1 is the label itself,
    # column 0 its complement.
    y_proba = np.zeros(np.shape(fil_preds_proba))
    y_proba[:, 1] = y_test
    y_proba[:, 0] = 1.0 - y_test
    fil_mse = mean_squared_error(y_proba, fil_preds_proba)

    if nrows < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_mse = mean_squared_error(y_proba, sk_model.predict_proba(X_test))
        # Max difference of 0.0061 is seen between the mse values of
        # predict proba function of fil and sklearn
        assert fil_mse <= (sk_mse + 0.0061)
Esempio n. 17
0
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
    """Check GPU prediction accuracy of cuML's random forest classifier.

    GPU accuracy must be within 0.02 of the CPU prediction and, for fewer
    than 500000 rows, within 0.07 of scikit-learn's random forest.
    """
    ncols, n_info = column_info

    X, y = make_classification(n_samples=nrows,
                               n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=123,
                               n_classes=2)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # This test always runs with an explicit single-stream handle.
    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfc(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=0,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=40,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=16)
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   output_class=True,
                                   threshold=0.5,
                                   algo='BATCH_TREE_REORG')
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cuml_acc = accuracy_score(y_test, cpu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    if nrows < 500000:
        sk_model = skrfc(n_estimators=40,
                         max_depth=16,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, sk_model.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.02)
Esempio n. 18
0
def test_rf_regression(
    special_reg, datatype, max_features, max_samples, n_bins
):
    """Check GPU prediction quality of cuML's random forest regressor.

    The GPU R2 score must be within 0.02 of the CPU prediction and, for
    fewer than 1000 rows, within 0.07 of scikit-learn's random forest.
    """
    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # This test always runs with an explicit single-stream handle.
    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfr(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model.fit(X_train, y_train)

    # Predict on both back ends; the GPU result is reshaped to match the
    # CPU one before scoring.
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # scikit-learn reference, only for small inputs
    if X.shape[0] < 1000:  # mode != "stress"
        sk_model = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_r2 = r2_score(
            y_test, sk_model.predict(X_test), convert_dtype=datatype
        )
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)
Esempio n. 19
0
def test_r2_score(datatype, use_handle):
    """Check ``cuml.metrics.r2_score`` on two short device arrays.

    The second vector is the first shifted by 0.02; the expected score is
    0.98 to seven decimals.
    """
    y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5], dtype=datatype)
    y_pred = np.array([0.12, 0.22, 0.32, 0.42, 0.52], dtype=datatype)

    # Copy both vectors to the device before scoring.
    y_true_dev = cuda.to_device(y_true)
    y_pred_dev = cuda.to_device(y_pred)

    handle, _ = get_handle(use_handle)

    score = cuml.metrics.r2_score(y_true_dev, y_pred_dev, handle=handle)

    np.testing.assert_almost_equal(score, 0.98, decimal=7)
Esempio n. 20
0
def test_entropy(use_handle):
    """Check ``entropy`` on a fair and a biased two-outcome distribution."""
    handle, _ = get_handle(use_handle)

    # The outcome of a fair coin is the most uncertain:
    # in base 2 the result is 1 (One bit of entropy).
    fair_coin = np.array([0, 1], dtype=np.int32)
    fair_bits = entropy(fair_coin, base=2., handle=handle)
    assert_almost_equal(fair_bits, 1.)

    # The outcome of a biased coin is less uncertain:
    biased_coin = np.array(([0] * 9) + [1], dtype=np.int32)
    biased_bits = entropy(biased_coin, base=2., handle=handle)
    assert_almost_equal(biased_bits, 0.468995593)

    # base e
    biased_nats = entropy(biased_coin, handle=handle)
    assert_almost_equal(biased_nats, 0.32508297339144826)
Esempio n. 21
0
def test_rf_printing(capfd, n_estimators, detailed_printing):
    """Check that printing a fitted forest emits one entry per tree.

    Depending on ``detailed_printing`` either ``print_detailed`` or
    ``print_summary`` is called; the captured output must be non-empty and
    contain exactly ``n_estimators`` lines starting with ``Tree #``.
    """
    X, y = make_classification(n_samples=500,
                               n_features=10,
                               n_clusters_per_class=1,
                               n_informative=5,
                               random_state=94929,
                               n_classes=2)
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfc(handle=handle,
                       max_features=1.0,
                       rows_sample=1.0,
                       n_bins=16,
                       split_algo=0,
                       split_criterion=0,
                       min_rows_per_node=2,
                       random_state=23707,
                       n_streams=1,
                       n_estimators=n_estimators,
                       max_leaves=-1,
                       max_depth=16)
    cuml_model.fit(X, y)

    if not detailed_printing:
        cuml_model.print_summary()
    else:
        cuml_model.print_detailed()

    # Everything written to stdout so far, as captured by the capfd fixture
    printed_output = capfd.readouterr().out

    # Test 1: Output is non-zero
    assert '' != printed_output

    # Test 2: Correct number of trees are printed
    tree_count = sum(
        1 for line in printed_output.split('\n')
        if line.strip().startswith('Tree #')
    )
    assert n_estimators == tree_count
Esempio n. 22
0
def test_rf_get_text(n_estimators, detailed_text):
    """Check that a fitted forest's text dump lists one entry per tree.

    Depending on ``detailed_text`` either ``get_detailed_text`` or
    ``get_summary_text`` is called; the returned text must be non-empty and
    contain exactly ``n_estimators`` lines starting with ``Tree #``.
    """
    X, y = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )
    X = X.astype(np.float32)
    y = y.astype(np.int32)

    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfc(
        handle=handle,
        max_features=1.0,
        max_samples=1.0,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model.fit(X, y)

    if not detailed_text:
        text_output = cuml_model.get_summary_text()
    else:
        text_output = cuml_model.get_detailed_text()

    # Test 1: Output is non-zero
    assert "" != text_output

    # Test 2: Correct number of trees are printed
    tree_count = sum(
        1 for line in text_output.split("\n")
        if line.strip().startswith("Tree #")
    )
    assert n_estimators == tree_count
Esempio n. 23
0
def test_dbscan_predict_multiple_streams():
    """Run two cuDBSCAN estimators, each on its own handle, on the same data.

    Both estimators cluster the same six 2-D points (passed as a cudf
    DataFrame) and each label must equal the one skDBSCAN assigns to the
    equivalent numpy array.
    """
    datatype = np.float32

    # Reference input (numpy) and the same points as cudf columns.
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)

    expected = skDBSCAN(eps=3, min_samples=2).fit_predict(X)

    # NOTE(review): the streams are unused below but deliberately kept bound
    # to distinct locals - they may need to outlive their handles; confirm.
    handle1, stream1 = get_handle(True)
    handle2, stream2 = get_handle(True)

    estimators = (
        cuDBSCAN(handle=handle1, eps=3, min_samples=2),
        cuDBSCAN(handle=handle2, eps=3, min_samples=2),
    )
    # Launch both fits before synchronising either handle.
    predictions = [est.fit_predict(gdf) for est in estimators]
    for est in estimators:
        est.handle.sync()

    for row in range(X.shape[0]):
        for predicted in predictions:
            assert predicted[row] == expected[row]
Esempio n. 24
0
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    """Compare GPU-path predictions of a cuML forest against two baselines.

    A 40-tree classifier is fitted on 80% of ``small_clf``. On the held-out
    rows, the accuracy of ``predict_model="GPU"`` predictions may be at most
    0.07 below the ``predict_model="CPU"`` accuracy and, when the dataset has
    fewer than 500000 rows, at most 0.07 below a scikit-learn forest.

    Parameters
    ----------
    small_clf :
        ``(X, y)`` pair holding the classification dataset.
    datatype :
        dtype the feature matrix is cast to (labels are cast to int32).
    max_samples, max_features :
        Forwarded to the cuML classifier; ``max_features`` is also given to
        the scikit-learn model.
    """
    tolerance = 0.07

    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )

    # Handle for the cuML model; the stream stays bound for the whole test.
    handle, stream = get_handle(True, n_streams=1)

    # cuML random forest classifier: build and train
    forest = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X_train, y_train)

    # Predict through both back ends; GPU output is reshaped to match CPU.
    gpu_preds = forest.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cpu_preds = forest.predict(X_test, predict_model="CPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))

    cpu_acc = accuracy_score(y_test, cpu_preds)
    gpu_acc = accuracy_score(y_test, gpu_preds)

    if features.shape[0] < 500000:
        # scikit-learn baseline, only below the row-count limit
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_acc = accuracy_score(y_test, reference.predict(X_test))
        assert gpu_acc >= (reference_acc - tolerance)

    # to be changed to 0.02. see issue #3910:
    # https://github.com/rapidsai/cuml/issues/3910
    assert gpu_acc >= (cpu_acc - tolerance)
Esempio n. 25
0
def test_rf_classification(small_clf, datatype, split_algo, rows_sample,
                           max_features):
    """Compare GPU-path predictions of a cuML forest against two baselines.

    A 40-tree classifier is fitted on 80% of ``small_clf``. On the held-out
    rows, the accuracy of ``predict_model="GPU"`` predictions may be at most
    0.02 below the ``predict_model="CPU"`` accuracy and, when the dataset has
    fewer than 500000 rows, at most 0.07 below a scikit-learn forest.

    Parameters
    ----------
    small_clf :
        ``(X, y)`` pair holding the classification dataset.
    datatype :
        dtype the feature matrix is cast to (labels are cast to int32).
    split_algo, rows_sample, max_features :
        Forwarded to the cuML classifier; ``max_features`` is also given to
        the scikit-learn model.
    """
    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # Handle for the cuML model; the stream stays bound for the whole test.
    handle, stream = get_handle(True, n_streams=1)

    # cuML random forest classifier: build and train
    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X_train, y_train)

    # Predict through both back ends; GPU output is reshaped to match CPU.
    gpu_preds = forest.predict(
        X_test,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='auto',
    )
    cpu_preds = forest.predict(X_test, predict_model="CPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))

    cpu_acc = accuracy_score(y_test, cpu_preds)
    gpu_acc = accuracy_score(y_test, gpu_preds)

    if features.shape[0] < 500000:
        # scikit-learn baseline, only below the row-count limit
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_acc = accuracy_score(y_test, reference.predict(X_test))
        assert gpu_acc >= (reference_acc - 0.07)

    assert gpu_acc >= (cpu_acc - 0.02)
Esempio n. 26
0
def test_rf_regression(datatype, split_algo, rows_sample,
                       n_info, mode, ncols, max_features):
    """Compare R2 of the cuML regressor's GPU path against two baselines.

    The dataset depends on ``mode``: ``'quality'`` loads California housing,
    ``'unit'`` generates 100 synthetic rows and any other value generates
    100000. The first 80% of the rows train a 50-tree cuML forest; on the
    remaining rows the R2 of ``predict_model="GPU"`` predictions may be at
    most 0.02 below the ``predict_model="CPU"`` R2 and at most 0.07 below a
    scikit-learn forest.

    Parameters
    ----------
    datatype :
        dtype applied to every train/test array.
    split_algo, rows_sample, max_features :
        Forwarded to the cuML regressor; ``max_features`` is also given to
        the scikit-learn model.
    n_info, ncols :
        ``n_informative`` / ``n_features`` of the synthetic datasets.
    mode :
        Dataset selector, see above.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        row_count = 100 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=row_count, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    # Ordered 80/20 split: leading rows train, trailing rows test.
    train_rows = np.int32(X.shape[0]*0.8)
    head, tail = slice(0, train_rows), slice(train_rows, None)
    X_test = np.asarray(X[tail, :]).astype(datatype)
    y_test = np.asarray(y[tail]).astype(datatype)
    X_train = np.asarray(X[head, :]).astype(datatype)
    y_train = np.asarray(y[head]).astype(datatype)

    # Handle for the cuML model; the stream stays bound for the whole test.
    handle, stream = get_handle(True, n_streams=8)

    # cuML random forest regressor: build and train
    forest = curfr(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric='mse',
    )
    forest.fit(X_train, y_train)

    # GPU (FIL) predictions first, then the CPU path
    gpu_preds = forest.predict(X_test, predict_model="GPU")
    cpu_preds = forest.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cpu_preds)
    fil_r2 = r2_score(y_test, gpu_preds)

    # scikit-learn random forest regressor as the external reference
    reference = skrfr(
        n_estimators=50,
        max_depth=16,
        min_samples_split=2,
        max_features=max_features,
        random_state=10,
    )
    reference.fit(X_train, y_train)
    sk_r2 = r2_score(y_test, reference.predict(X_test))

    print(fil_r2, cu_r2, sk_r2)
    assert fil_r2 >= (cu_r2 - 0.02)
    assert fil_r2 >= (sk_r2 - 0.07)
Esempio n. 27
0
def test_entropy_random(n_samples, base, use_handle):
    """Check ``entropy`` of a random labelling against scipy's entropy.

    ``n_samples`` labels are drawn from ``[0, 1000)``. The reference value is
    ``sp_entropy`` applied to the per-label counts, while ``entropy`` receives
    the labels themselves (as int32); both must agree to 2 decimals.

    Parameters
    ----------
    n_samples :
        Number of random labels to draw.
    base :
        Logarithm base handed to both entropy implementations.
    use_handle :
        Forwarded to ``get_handle``.
    """
    handle, stream = get_handle(use_handle)

    def draw_labels(rng):
        return rng.randint(0, 1000, n_samples)

    clustering, _ = generate_random_labels(draw_labels)

    # Unnormalized probabilities: how often each label occurs.
    label_counts = np.bincount(clustering)

    # scipy's entropy works on (unnormalized) probabilities ...
    expected = sp_entropy(label_counts, base=base)
    # ... whereas the function under test takes the clustering directly.
    labels = np.array(clustering, dtype=np.int32)
    actual = entropy(labels, base, handle=handle)

    assert_almost_equal(actual, expected, decimal=2)
Esempio n. 28
0
def test_rf_regression(datatype, use_handle, split_algo,
                       n_info, mode, ncols,
                       rows_sample):
    """Compare the cuML regressor's score with scikit-learn's test MSE.

    The dataset depends on ``mode``: ``'quality'`` loads California housing,
    ``'unit'`` generates 30 synthetic rows and any other value generates
    100000. The first 80% of the rows train a 50-tree cuML forest created
    with ``accuracy_metric='mse'``; its ``score`` on the remaining rows may
    exceed the scikit-learn forest's mean squared error by at most 0.07.
    The comparison is skipped when ``mode == 'stress'``.

    Parameters
    ----------
    datatype :
        dtype applied to every train/test array.
    use_handle :
        Forwarded to ``get_handle``.
    split_algo, rows_sample :
        Forwarded to the cuML regressor.
    n_info, ncols :
        ``n_informative`` / ``n_features`` of the synthetic datasets.
    mode :
        Dataset selector, see above.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        row_count = 30 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=row_count, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    # Ordered 80/20 split: leading rows train, trailing rows test.
    train_rows = np.int32(X.shape[0]*0.8)
    head, tail = slice(0, train_rows), slice(train_rows, None)
    X_test = np.asarray(X[tail, :]).astype(datatype)
    y_test = np.asarray(y[tail]).astype(datatype)
    X_train = np.asarray(X[head, :]).astype(datatype)
    y_train = np.asarray(y[head]).astype(datatype)

    # Handle for the cuML model; the stream stays bound for the whole test.
    handle, stream = get_handle(use_handle)

    # cuML random forest regressor: build, train and score
    forest = curfr(
        max_features=1.0,
        rows_sample=rows_sample,
        n_bins=8,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=25,
        accuracy_metric='mse',
    )
    forest.fit(X_train, y_train)
    cu_mse = forest.score(X_test, y_test)

    if mode == 'stress':
        # No scikit-learn comparison for the stress dataset.
        return

    # scikit-learn random forest regressor: build, train and predict
    reference = skrfr(
        n_estimators=50,
        max_depth=50,
        min_samples_split=2,
        max_features=1.0,
        random_state=10,
    )
    reference.fit(X_train, y_train)
    sk_mse = mean_squared_error(y_test, reference.predict(X_test))

    # cuML's error may be at most 0.07 worse than scikit-learn's
    assert cu_mse <= (sk_mse + 0.07)
Esempio n. 29
0
def test_pca_fit(datatype, input_type, name, use_handle):
    """Fit cuPCA and skPCA on the same data and compare fitted attributes.

    Parameters
    ----------
    datatype :
        dtype of the small hand-written dataset (used when ``name`` is
        neither ``'blobs'`` nor ``'iris'``).
    input_type :
        ``'dataframe'`` feeds cuPCA a cudf DataFrame built from the data;
        any other value passes the array directly.
    name :
        Dataset selector: ``'blobs'`` (currently skipped), ``'iris'`` or
        anything else for the six-point dataset.
    use_handle :
        Forwarded to ``get_handle``.
    """
    if name == 'blobs':
        # pytest.skip raises, so nothing after it in this branch would run.
        # The dataset intended for this case was
        # make_blobs(n_samples=500000, n_features=1000, random_state=0).
        pytest.skip('fails when using blobs dataset')

    if name == 'iris':
        iris = datasets.load_iris()
        X = iris.data
    else:
        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]],
                     dtype=datatype)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    if input_type == 'dataframe':
        # One column per feature, named fea0, fea1, ...
        X = pd.DataFrame({'fea%d' % i: X[0:, i] for i in range(X.shape[1])})
        X_cudf = cudf.DataFrame.from_pandas(X)
        cupca.fit(X_cudf)
    else:
        cupca.fit(X)

    cupca.handle.sync()

    for attr in [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_', 'noise_variance_'
    ]:
        # components_ is the only attribute compared with with_sign=False
        with_sign = attr != 'components_'

        cuml_res = getattr(cupca, attr)
        skl_res = getattr(skpca, attr)
        print(attr)
        print(cuml_res)
        print(skl_res)

        # Convert the cuML result to a host array before comparing.
        if isinstance(cuml_res, cudf.Series):
            cuml_res = cuml_res.to_array()
        else:
            cuml_res = cuml_res.as_matrix()
        assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
Esempio n. 30
0
def test_dbscan_predict_numpy(datatype, use_handle):
    """Check cuDBSCAN labels against skDBSCAN on six 2-D points.

    cuDBSCAN is fitted on a cudf DataFrame holding the points column-wise,
    skDBSCAN on the equivalent numpy array; every label must match.

    Parameters
    ----------
    datatype :
        dtype of both the cudf columns and the numpy array.
    use_handle :
        Forwarded to ``get_handle``.
    """
    # Same six points in two layouts: cudf columns and a numpy matrix.
    gdf = cudf.DataFrame()
    gdf['0'] = np.asarray([1, 2, 2, 8, 8, 25], dtype=datatype)
    gdf['1'] = np.asarray([2, 2, 3, 7, 8, 80], dtype=datatype)
    X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]],
                 dtype=datatype)
    n_points = X.shape[0]

    print("Calling fit_predict")
    handle, stream = get_handle(use_handle)
    cudbscan = cuDBSCAN(handle=handle, eps=3, min_samples=2)
    cu_labels = cudbscan.fit_predict(gdf)

    sk_labels = skDBSCAN(eps=3, min_samples=2).fit_predict(X)
    print(n_points)

    # Synchronise the handle before reading the labels.
    cudbscan.handle.sync()
    for row in range(n_points):
        assert cu_labels[row] == sk_labels[row]