Example #1
def test_tsvd_fit_transform(datatype, name, use_handle):
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if name != 'blobs':
        skpca = skTSVD(n_components=1)
        Xsktsvd = skpca.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
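These snippets are shown without their imports or @pytest.mark.parametrize decorators, so the datatype, name and use_handle arguments appear unbound. Below is a hedged sketch of how such a test is typically wired up; the aliases and parameter values are illustrative assumptions, not the suite's exact configuration, and helpers such as get_handle and array_equal come from cuML's shared test utilities.

import numpy as np
import pytest
from sklearn.datasets import make_blobs
from sklearn.decomposition import TruncatedSVD as skTSVD   # assumed alias
from cuml import TruncatedSVD as cuTSVD                    # assumed alias


@pytest.mark.parametrize('datatype', [np.float32, np.float64])
@pytest.mark.parametrize('name', ['blobs', 'random', 'rand_small'])  # illustrative values
@pytest.mark.parametrize('use_handle', [True, False])
def test_tsvd_fit_transform(datatype, name, use_handle):
    ...  # body as in Example #1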
Example #2
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    X, y = make_blobs(n_samples,
                      centers=1,
                      cluster_std=8.0,
                      center_box=(-100.0, 100.0),
                      random_state=8)
    X = X.astype(datatype)

    handle, stream = get_handle(use_handle)
    eps = 0.5
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=5,
                           output_type='numpy')
    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    sk_dbscan = skDBSCAN(eps=eps, min_samples=5)
    sk_labels = sk_dbscan.fit_predict(X)

    # Check the core points are equal
    assert array_equal(cuml_dbscan.core_sample_indices_,
                       sk_dbscan.core_sample_indices_)

    # Check the labels are correct
    assert_dbscan_equal(sk_labels, cu_labels, X,
                        cuml_dbscan.core_sample_indices_, eps)
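The assert_dbscan_equal helper used above belongs to cuML's test utilities and compares the two clusterings while tolerating the arbitrary numbering of cluster labels. As a rough, swapped-in alternative (not what the suite actually uses), a partition-level comparison can be sketched with scikit-learn's adjusted Rand index:

from sklearn.metrics import adjusted_rand_score


def clusterings_agree(sk_labels, cu_labels, min_score=0.95):
    # 1.0 means the two label arrays describe identical partitions,
    # regardless of which integer id each cluster happened to receive.
    return adjusted_rand_score(sk_labels, cu_labels) >= min_score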
Example #3
def test_rf_regression(special_reg, datatype, max_features, max_samples,
                       n_bins):

    use_handle = True

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)
    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress"
        sk_model = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)
Example #4
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   threshold=0.5,
                                   algo="auto")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)
    if X.shape[0] < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (
        cuml_acc - 0.07
    )  # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
Example #5
def test_rf_get_text(n_estimators, detailed_text):

    X, y = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )

    X = X.astype(np.float32)
    y = y.astype(np.int32)

    # Create a handle for the cuml model
    handle, stream = get_handle(True, n_streams=1)

    # Initialize cuML Random Forest classification model
    cuml_model = curfc(
        handle=handle,
        max_features=1.0,
        max_samples=1.0,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )

    # Train model on the data
    cuml_model.fit(X, y)

    if detailed_text:
        text_output = cuml_model.get_detailed_text()
    else:
        text_output = cuml_model.get_summary_text()

    # Test 1: Output is non-zero
    assert "" != text_output

    # Count the number of trees printed
    tree_count = 0
    for line in text_output.split("\n"):
        if line.strip().startswith("Tree #"):
            tree_count += 1

    # Test 2: Correct number of trees are printed
    assert n_estimators == tree_count
Example #6
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    if nrows == 500000 and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            nrows = nrows * pytest.max_gpu_memory // 32
        else:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    n_samples = nrows
    n_feats = ncols
    X, y = make_blobs(n_samples=n_samples,
                      cluster_std=0.01,
                      n_features=n_feats,
                      random_state=0)

    handle, stream = get_handle(use_handle)

    eps = 1
    cuml_dbscan = cuDBSCAN(handle=handle,
                           eps=eps,
                           min_samples=2,
                           max_mbytes_per_batch=max_mbytes_per_batch,
                           output_type='numpy')

    cu_labels = cuml_dbscan.fit_predict(X, out_dtype=out_dtype)

    if nrows < 500000:
        sk_dbscan = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        sk_labels = sk_dbscan.fit_predict(X)

        # Check the core points are equal
        assert array_equal(cuml_dbscan.core_sample_indices_,
                           sk_dbscan.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(sk_labels, cu_labels, X,
                            cuml_dbscan.core_sample_indices_, eps)

    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels.dtype == np.int32
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels.dtype == np.int64
Example #7
def test_rf_memory_leakage(small_clf, datatype, fil_sparse_format, n_iter):
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Warmup. Some modules that are used in RF allocate space on the device
    # and consume memory. This is to make sure that the allocation is done
    # before the first call to get_memory_info.
    base_model = curfc(handle=handle)
    base_model.fit(X_train, y_train)
    handle.sync()  # just to be sure
    free_mem = cuda.current_context().get_memory_info()[0]

    def test_for_memory_leak():
        cuml_mods = curfc(handle=handle)
        cuml_mods.fit(X_train, y_train)
        handle.sync()  # just to be sure
        # Calculate the memory free after fitting the cuML model
        delta_mem = free_mem - cuda.current_context().get_memory_info()[0]
        assert delta_mem == 0

        for i in range(2):
            cuml_mods.predict(
                X_test,
                predict_model="GPU",
                fil_sparse_format=fil_sparse_format,
            )
            handle.sync()  # just to be sure
            # Calculate the memory free after predicting the cuML model
            delta_mem = free_mem - cuda.current_context().get_memory_info()[0]
            assert delta_mem == 0

    for i in range(n_iter):
        test_for_memory_leak()
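The warmup comment above captures the core idea: take a free-memory baseline after lazy allocations have happened, then assert that repeated fit/predict calls leave that baseline unchanged. A minimal standalone sketch of the same pattern, assuming only numba with a working CUDA context (the wrapped callable is hypothetical):

from numba import cuda


def assert_no_device_leak(fn, *args, **kwargs):
    fn(*args, **kwargs)                 # warm up so lazy allocations happen first
    cuda.synchronize()
    free_before = cuda.current_context().get_memory_info()[0]
    fn(*args, **kwargs)                 # the call being checked for leaks
    cuda.synchronize()
    free_after = cuda.current_context().get_memory_info()[0]
    assert free_before - free_after == 0   # no net device memory retained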
Example #8
def test_pca_inverse_transform(datatype, input_type, name, use_handle, nrows):
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    else:
        rng = np.random.RandomState(0)
        n, p = nrows, 3
        X = rng.randn(n, p)  # spherical data
        X[:, 1] *= .00001  # make middle component relatively small
        X += [3, 4, 2]  # make a large mean

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    X_cupca = cupca.fit_transform(X)

    input_gdf = cupca.inverse_transform(X_cupca)
    cupca.handle.sync()
    assert array_equal(input_gdf, X, 5e-5, with_sign=True)
Example #9
def test_pca_fit_then_transform(datatype, input_type, name, use_handle):
    blobs_n_samples = 500000
    if name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            blobs_n_samples = int(blobs_n_samples * pytest.max_gpu_memory / 32)
        else:
            pytest.skip("Insufficient GPU memory for this test."
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if name == 'blobs':
        X, y = make_blobs(n_samples=blobs_n_samples,
                          n_features=1000,
                          random_state=0)

    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    if name != 'blobs':
        skpca = skPCA(n_components=2)
        skpca.fit(X)
        Xskpca = skpca.transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)

    cupca.fit(X)
    X_cupca = cupca.transform(X)
    cupca.handle.sync()

    if name != 'blobs':
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=False)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
Example #10
def test_tsvd_fit(datatype, name, use_handle):

    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    if name != 'blobs':
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    cutsvd.fit(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        for attr in [
                'singular_values_', 'components_', 'explained_variance_ratio_'
        ]:
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr),
                               getattr(sktsvd, attr),
                               0.4,
                               with_sign=with_sign)
Example #11
def test_pca_fit(datatype, input_type, name, use_handle):

    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    skpca = skPCA(n_components=2)
    skpca.fit(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    cupca.handle.sync()

    for attr in [
            'singular_values_', 'components_', 'explained_variance_',
            'explained_variance_ratio_'
    ]:
        with_sign = attr != 'components_'
        print(attr)
        print(getattr(cupca, attr))
        print(getattr(skpca, attr))
        cuml_res = (getattr(cupca, attr))

        skl_res = getattr(skpca, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
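components_ is compared with with_sign=False because principal components are only defined up to a per-component sign flip. For illustration only (the real array_equal helper lives in cuML's test utilities and may differ), a sign-insensitive comparison could look like:

import numpy as np


def close_up_to_sign(a, b, tol=1e-3):
    a, b = np.asarray(a), np.asarray(b)
    # A sign flip of a whole component is not an error, so compare magnitudes.
    return np.allclose(np.abs(a), np.abs(b), atol=tol)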
Example #12
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    use_handle = True
    num_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        n_bins=16,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=num_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
        accuracy_metric="mse",
    )
    cuml_model.fit(X_train, y_train)

    # predict using FIL
    if (not fil_sparse_format
            or algo in ("tree_reorg", "batch_tree_reorg")
            or fil_sparse_format == "not_supported"):
        with pytest.raises(ValueError):
            fil_preds = cuml_model.predict(
                X_test,
                predict_model="GPU",
                fil_sparse_format=fil_sparse_format,
                algo=algo,
            )
    else:
        fil_preds = cuml_model.predict(
            X_test,
            predict_model="GPU",
            fil_sparse_format=fil_sparse_format,
            algo=algo,
        )
        fil_preds = np.reshape(fil_preds, np.shape(y_test))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        fil_model = cuml_model.convert_to_fil_model()

        with cuml.using_output_type("numpy"):
            fil_model_preds = fil_model.predict(X_test)
            fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
            fil_model_r2 = r2_score(y_test,
                                    fil_model_preds,
                                    convert_dtype=datatype)
            assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        assert X.shape[1] == tl_model.num_features

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if X.shape[0] < 1000:  # mode != "stress"
            sk_model = skrfr(
                n_estimators=50,
                max_depth=40,
                min_samples_split=2,
                random_state=10,
            )
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
Example #13
def test_rf_classification_sparse(small_clf, datatype, fil_sparse_format,
                                  algo):
    use_handle = True
    num_trees = 50

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=num_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
    )
    cuml_model.fit(X_train, y_train)

    if (not fil_sparse_format
            or algo in ("tree_reorg", "batch_tree_reorg")
            or fil_sparse_format == "not_supported"):
        with pytest.raises(ValueError):
            fil_preds = cuml_model.predict(
                X_test,
                predict_model="GPU",
                threshold=0.5,
                fil_sparse_format=fil_sparse_format,
                algo=algo,
            )
    else:
        fil_preds = cuml_model.predict(
            X_test,
            predict_model="GPU",
            threshold=0.5,
            fil_sparse_format=fil_sparse_format,
            algo=algo,
        )
        fil_preds = np.reshape(fil_preds, np.shape(y_test))
        fil_acc = accuracy_score(y_test, fil_preds)
        np.testing.assert_almost_equal(fil_acc,
                                       cuml_model.score(X_test, y_test))

        fil_model = cuml_model.convert_to_fil_model()

        with cuml.using_output_type("numpy"):
            fil_model_preds = fil_model.predict(X_test)
            fil_model_acc = accuracy_score(y_test, fil_model_preds)
            assert fil_acc == fil_model_acc

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        assert X.shape[1] == tl_model.num_features

        if X.shape[0] < 500000:
            sk_model = skrfc(
                n_estimators=50,
                max_depth=40,
                min_samples_split=2,
                random_state=10,
            )
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_acc = accuracy_score(y_test, sk_preds)
            assert fil_acc >= (sk_acc - 0.07)
Example #14
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    X, y = fixture
    X = X.astype(datatype[0])
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    X_test = X_test.astype(datatype[1])

    n_streams = 1
    handle, stream = get_handle(True, n_streams=n_streams)
    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        n_streams=n_streams,
    )
    if array_type == "dataframe":
        X_train_df = cudf.DataFrame(X_train)
        y_train_df = cudf.Series(y_train)
        X_test_df = cudf.DataFrame(X_test)
        cuml_model.fit(X_train_df, y_train_df)
        cu_proba_gpu = cuml_model.predict_proba(X_test_df).to_numpy()
        cu_preds_cpu = cuml_model.predict(X_test_df,
                                          predict_model="CPU").to_numpy()
        cu_preds_gpu = cuml_model.predict(X_test_df,
                                          predict_model="GPU").to_numpy()
    else:
        cuml_model.fit(X_train, y_train)
        cu_proba_gpu = cuml_model.predict_proba(X_test)
        cu_preds_cpu = cuml_model.predict(X_test, predict_model="CPU")
        cu_preds_gpu = cuml_model.predict(X_test, predict_model="GPU")
    np.testing.assert_array_equal(cu_preds_gpu, np.argmax(cu_proba_gpu,
                                                          axis=1))

    cu_acc_cpu = accuracy_score(y_test, cu_preds_cpu)
    cu_acc_gpu = accuracy_score(y_test, cu_preds_gpu)
    assert cu_acc_cpu == pytest.approx(cu_acc_gpu, abs=0.01, rel=0.1)

    # sklearn random forest classification model
    # initialization, fit and predict
    if y.size < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        sk_proba = sk_model.predict_proba(X_test)
        assert cu_acc_cpu >= sk_acc - 0.07
        assert cu_acc_gpu >= sk_acc - 0.07
        # 0.06 is the highest relative error observed on CI, within
        # 0.0061 absolute error boundaries seen previously
        check_predict_proba(cu_proba_gpu, sk_proba, y_test, 0.1)