Example #1
def test_tweedie_convergence(max_depth, split_criterion):
    np.random.seed(33)
    bootstrap = None
    max_features = 1.0
    n_estimators = 1
    min_impurity_decrease = 1e-5
    n_datapoints = 1000
    tweedie = {
        "poisson": {
            "power": 1,
            "gen": np.random.poisson,
            "args": [0.01]
        },
        "gamma": {
            "power": 2,
            "gen": np.random.gamma,
            "args": [2.0]
        },
        "inverse_gaussian": {
            "power": 3,
            "gen": np.random.wald,
            "args": [0.1, 2.0]
        }
    }
    # generating random dataset with tweedie distribution
    X = np.random.random((n_datapoints, 4)).astype(np.float32)
    y = tweedie[split_criterion]["gen"](*tweedie[split_criterion]["args"],
                                        size=n_datapoints).astype(np.float32)

    tweedie_preds = curfr(split_criterion=split_criterion,
                          max_depth=max_depth,
                          n_estimators=n_estimators,
                          bootstrap=bootstrap,
                          max_features=max_features,
                          min_impurity_decrease=min_impurity_decrease).fit(
                              X, y).predict(X)
    mse_preds = curfr(split_criterion=2,
                      max_depth=max_depth,
                      n_estimators=n_estimators,
                      bootstrap=bootstrap,
                      max_features=max_features,
                      min_impurity_decrease=min_impurity_decrease).fit(
                          X, y).predict(X)
    # mean_tweedie_deviance requires strictly positive predictions,
    # so mask out any non-positive ones
    mask = mse_preds > 0
    mse_tweedie_deviance = mean_tweedie_deviance(
        y[mask], mse_preds[mask], power=tweedie[split_criterion]["power"])
    tweedie_tweedie_deviance = mean_tweedie_deviance(
        y[mask], tweedie_preds[mask], power=tweedie[split_criterion]["power"])

    # A model trained with the matching Tweedie criterion must achieve a
    # lower (better) Tweedie deviance than one trained with the MSE criterion
    assert mse_tweedie_deviance >= tweedie_tweedie_deviance
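
The arguments to these test functions come from pytest parametrization. A minimal sketch of decorators that could drive test_tweedie_convergence (the max_depth values are illustrative assumptions; the split_criterion values follow the keys of the tweedie dict above):

import pytest

@pytest.mark.parametrize("max_depth", [2, 8])
@pytest.mark.parametrize("split_criterion",
                         ["poisson", "gamma", "inverse_gaussian"])
def test_tweedie_convergence(max_depth, split_criterion):
    ...  # body as in Example #1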
Example #2
def test_rf_host_memory_leak(large_clf, estimator_type):
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    process = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    if estimator_type == 'classification':
        base_model = curfc(max_depth=10, n_estimators=100, seed=123)
        y = y.astype(np.int32)
    else:
        base_model = curfr(max_depth=10, n_estimators=100, seed=123)
        y = y.astype(np.float32)

    # Pre-fit once - this is our baseline and memory usage
    # should not significantly exceed it after later fits
    base_model.fit(X, y)
    gc.collect()
    initial_baseline_mem = process.memory_info().rss

    for i in range(5):
        base_model.fit(X, y)
        gc.collect()
        final_mem = process.memory_info().rss

    # Some tiny allocations may occur, but we should not leak
    # memory without bound, as previously happened
    assert (final_mem - initial_baseline_mem) < 2e6
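
The large_clf argument is a pytest fixture that supplies a large classification dataset. A plausible sketch, assuming sklearn's make_classification (the fixture body and all sizes here are illustrative assumptions, not the original definition):

import pytest
from sklearn.datasets import make_classification

@pytest.fixture
def large_clf():
    # A dataset large enough that an unbounded leak across repeated
    # fits becomes visible in the process RSS
    X, y = make_classification(n_samples=100000, n_features=20,
                               n_informative=10, random_state=123)
    return X, y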
Example #3
def test_degenerate_cases():
    n_samples = 100
    cuml_model = curfr(max_features=1.0,
                       max_samples=0.1,
                       n_bins=128,
                       min_samples_leaf=2,
                       random_state=123,
                       n_streams=1,
                       n_estimators=10,
                       max_leaves=-1,
                       max_depth=16,
                       accuracy_metric="mse")
    # Attempting to explain an un-fitted model must raise NotFittedError
    with pytest.raises(NotFittedError):
        TreeExplainer(model=cuml_model)

    # Depth 0 trees
    rng = np.random.default_rng(seed=0)
    X = rng.standard_normal(size=(n_samples, 8), dtype=np.float32)
    y = np.ones(shape=(n_samples, ), dtype=np.float32)
    cuml_model.fit(X, y)
    explainer = TreeExplainer(model=cuml_model)
    out = explainer.shap_values(X)
    # Since the output is always 1.0 no matter the input, SHAP values for all
    # features are zero, as feature values don't have any effect on the output.
    # The bias (expected_value) is 1.0.
    assert np.all(out == 0)
    assert explainer.expected_value == 1.0
Example #4
def test_rf_regression_float64(large_reg, datatype):

    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest regression model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    fil_preds = cuml_model.predict(X_test,
                                   predict_model="GPU",
                                   convert_dtype=True)
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
    assert fil_r2 >= (cu_r2 - 0.02)
Example #5
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    use_handle = True
    num_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(n_bins=16, split_criterion=2,
                       min_rows_per_node=2, random_state=123, n_streams=1,
                       n_estimators=num_trees, handle=handle, max_leaves=-1,
                       max_depth=40, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)

    # predict using FIL
    if ((not fil_sparse_format or algo == 'tree_reorg' or
            algo == 'batch_tree_reorg') or
            fil_sparse_format == 'not_supported'):
        with pytest.raises(ValueError):
            fil_preds = cuml_model.predict(X_test, predict_model="GPU",
                                           fil_sparse_format=fil_sparse_format,
                                           algo=algo)
    else:
        fil_preds = cuml_model.predict(X_test, predict_model="GPU",
                                       fil_sparse_format=fil_sparse_format,
                                       algo=algo)
        fil_preds = np.reshape(fil_preds, np.shape(y_test))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        fil_model = cuml_model.convert_to_fil_model()

        input_type = 'numpy'
        fil_model_preds = fil_model.predict(X_test,
                                            output_type=input_type)
        fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
        fil_model_r2 = r2_score(y_test, fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        assert X.shape[1] == tl_model.num_features

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if X.shape[0] < 1000:  # mode != "stress"
            sk_model = skrfr(n_estimators=50, max_depth=40,
                             min_samples_split=2,
                             random_state=10)
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
Example #6
def test_concat_memory_leak(large_clf, estimator_type):
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    process = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    # Build a series of RF models
    n_models = 10
    if estimator_type == 'classification':
        base_models = [
            curfc(max_depth=10, n_estimators=100, random_state=123)
            for i in range(n_models)
        ]
        y = y.astype(np.int32)
    elif estimator_type == 'regression':
        base_models = [
            curfr(max_depth=10, n_estimators=100, random_state=123)
            for i in range(n_models)
        ]
        y = y.astype(np.float32)
    else:
        assert False

    # Fit every model once; after the first concatenation below sets the
    # baseline, repeated concatenations should not grow memory much further
    for model in base_models:
        model.fit(X, y)

    # Just concatenate over and over in a loop
    concat_models = base_models[1:]
    init_model = base_models[0]
    other_handles = [
        model._obtain_treelite_handle() for model in concat_models
    ]
    init_model._concatenate_treelite_handle(other_handles)

    gc.collect()
    initial_baseline_mem = process.memory_info().rss
    for i in range(10):
        init_model._concatenate_treelite_handle(other_handles)
        gc.collect()
        used_mem = process.memory_info().rss
        logger.debug("memory at rep %2d: %d MB" %
                     (i, (used_mem - initial_baseline_mem) / 1e6))

    gc.collect()
    used_mem = process.memory_info().rss
    logger.info("Final memory delta: %d MB" %
                ((used_mem - initial_baseline_mem) / 1e6))
    assert (used_mem - initial_baseline_mem) < 1e6
Example #7
def test_rf_regression(
    special_reg, datatype, max_features, max_samples, n_bins
):

    use_handle = True

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)
    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress"
        sk_model = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)
Example #8
def test_rf_regression_float64(large_reg, datatype):

    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest regression model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32:
        fil_preds = cuml_model.predict(
            X_test, predict_model="GPU", convert_dtype=True
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)

    # When the model was trained on np.float64 (datatype[0] != np.float32),
    # GPU-based predict cannot be used: a warning is displayed and
    # prediction reverts to CPU
    elif datatype[1] == np.float64:
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            fil_preds = cuml_model.predict(
                X_test, predict_model="GPU"
                )
            assert("GPU based predict only accepts "
                   "np.float32 data. The model was "
                   "trained on np.float64 data hence "
                   "cannot use GPU-based prediction! "
                   "\nDefaulting to CPU-based Prediction. "
                   "\nTo predict on float-64 data, set "
                   "parameter predict_model = 'CPU'"
                   in str(w[-1].message))
Example #9
def test_rf_regressor_gtil_integration(tmpdir):
    X, y = load_boston(return_X_y=True)
    X, y = X.astype(np.float32), y.astype(np.float32)
    clf = curfr(max_depth=3, random_state=0, n_estimators=10)
    clf.fit(X, y)
    expected_pred = clf.predict(X)

    checkpoint_path = os.path.join(tmpdir, 'checkpoint.tl')
    clf.convert_to_treelite_model().to_treelite_checkpoint(checkpoint_path)

    tl_model = treelite.Model.deserialize(checkpoint_path)
    out_pred = treelite.gtil.predict(tl_model, X)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)
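
The same checkpoint can be scored standalone with treelite's GTIL backend, outside the test. A minimal usage sketch, assuming the checkpoint file written in Example #9 (the 13 features match the Boston housing data used above):

import numpy as np
import treelite

tl_model = treelite.Model.deserialize("checkpoint.tl")
X_new = np.random.random((5, 13)).astype(np.float32)
pred = treelite.gtil.predict(tl_model, X_new)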
Example #10
def test_rf_regression(datatype, split_algo, rows_sample,
                       n_info, mode, ncols, max_features):
    use_handle = True

    if mode == 'unit':
        X, y = make_regression(n_samples=100, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    train_rows = np.int32(X.shape[0]*0.8)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:, ]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=8)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features, rows_sample=rows_sample,
                       n_bins=16, split_algo=split_algo, split_criterion=2,
                       min_rows_per_node=2,
                       n_estimators=50, handle=handle, max_leaves=-1,
                       max_depth=16, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds)
    fil_r2 = r2_score(y_test, fil_preds)
    # Initialize, fit and predict using
    # sklearn's random forest regression model
    sk_model = skrfr(n_estimators=50, max_depth=16,
                     min_samples_split=2, max_features=max_features,
                     random_state=10)
    sk_model.fit(X_train, y_train)
    sk_predict = sk_model.predict(X_test)
    sk_r2 = r2_score(y_test, sk_predict)
    print(fil_r2, cu_r2, sk_r2)
    assert fil_r2 >= (cu_r2 - 0.02)
    assert fil_r2 >= (sk_r2 - 0.07)
Example #11
def test_rf_regression_with_identical_labels(split_criterion,
                                             use_experimental_backend):
    X = np.array([[-1, 0], [0, 1], [2, 0], [0, 3], [-2, 0]], dtype=np.float32)
    y = np.array([1, 1, 1, 1, 1], dtype=np.float32)
    # Degenerate case: all labels are identical.
    # RF Regressor must not create any split. It must yield an empty tree
    # with only the root node.
    clf = curfr(max_features=1.0, rows_sample=1.0, n_bins=5, split_algo=1,
                bootstrap=False, split_criterion=split_criterion,
                min_samples_leaf=1, min_samples_split=2, random_state=0,
                n_streams=1, n_estimators=1, max_depth=1,
                use_experimental_backend=use_experimental_backend)
    clf.fit(X, y)
    model_dump = json.loads(clf.get_json())
    assert len(model_dump) == 1
    assert model_dump[0] == {'nodeid': 0, 'leaf_value': 1.0}
Example #12
def test_rf_regression_float64(datatype, column_info, nrows, convert_dtype):

    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows,
                           n_features=ncols,
                           n_informative=n_info,
                           random_state=123)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest regression model
    # initialization, fit and predict
    if nrows < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32 and convert_dtype:
        fil_preds = cuml_model.predict(X_test,
                                       predict_model="GPU",
                                       convert_dtype=convert_dtype)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)
    else:
        with pytest.raises(TypeError):
            fil_preds = cuml_model.predict(X_test,
                                           predict_model="GPU",
                                           convert_dtype=convert_dtype)
Example #13
def test_rf_regression(datatype, use_handle, split_algo,
                       n_info, mode, ncols,
                       rows_sample):

    if mode == 'unit':
        X, y = make_regression(n_samples=30, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    train_rows = np.int32(X.shape[0]*0.8)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:, ]).astype(datatype)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle)

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr(max_features=1.0, rows_sample=rows_sample,
                       n_bins=8, split_algo=split_algo, split_criterion=2,
                       min_rows_per_node=2,
                       n_estimators=50, handle=handle, max_leaves=-1,
                       max_depth=25, accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    cu_mse = cuml_model.score(X_test, y_test)
    if mode != 'stress':
        # sklearn random forest regression model
        # initialization, fit and predict
        sk_model = skrfr(n_estimators=50, max_depth=50,
                         min_samples_split=2, max_features=1.0,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_mse = mean_squared_error(y_test, sk_predict)

        # compare the mean squared error of the two models
        assert cu_mse <= (sk_mse + 0.07)
Example #14
def test_rf_regression_default(datatype, column_info, nrows):

    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows,
                           n_features=ncols,
                           n_informative=n_info,
                           random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)

    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)

    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # score() with GPU predict should match the MSE of the FIL predictions
    score_mse = cuml_model.score(X_test, y_test, predict_model="GPU")
    sk_mse = mean_squared_error(y_test, fil_preds)
    assert sk_mse == pytest.approx(score_mse)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if nrows < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        # XXX Accuracy gap exists with default parameters, requires
        # further investigation for next release
        assert fil_r2 >= (sk_r2 - 0.08)

    assert fil_r2 >= (cu_r2 - 0.02)
Example #15
def test_cuml_rf_regressor(input_type):
    n_samples = 100
    X, y = make_regression(n_samples=n_samples,
                           n_features=8,
                           n_informative=8,
                           n_targets=1,
                           random_state=2021)
    X, y = X.astype(np.float32), y.astype(np.float32)
    if input_type == 'cupy':
        X, y = cp.array(X), cp.array(y)
    elif input_type == 'cudf':
        X, y = cudf.DataFrame(X), cudf.Series(y)
    cuml_model = curfr(max_features=1.0,
                       max_samples=0.1,
                       n_bins=128,
                       min_samples_leaf=2,
                       random_state=123,
                       n_streams=1,
                       n_estimators=10,
                       max_leaves=-1,
                       max_depth=16,
                       accuracy_metric="mse")
    cuml_model.fit(X, y)
    pred = cuml_model.predict(X)

    explainer = TreeExplainer(model=cuml_model)
    out = explainer.shap_values(X)
    if input_type == 'cupy':
        pred = pred.get()
        out = out.get()
        expected_value = explainer.expected_value.get()
    elif input_type == 'cudf':
        pred = pred.to_numpy()
        out = out.get()
        expected_value = explainer.expected_value.get()
    else:
        expected_value = explainer.expected_value
    # SHAP values should add up to predicted score
    shap_sum = np.sum(out, axis=1) + expected_value
    np.testing.assert_almost_equal(shap_sum, pred, decimal=4)
Example #16
def test_multiple_fits_regression(column_info, nrows, n_estimators, n_bins):
    datatype = np.float32
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows,
                           n_features=ncols,
                           n_informative=n_info,
                           random_state=123)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    cuml_model = curfr(n_bins=n_bins, n_estimators=n_estimators, max_depth=10)

    # Calling multiple fits
    cuml_model.fit(X, y)

    cuml_model.fit(X, y)

    cuml_model.fit(X, y)

    # Check if params are still intact
    params = cuml_model.get_params()
    assert params['n_estimators'] == n_estimators
    assert params['n_bins'] == n_bins
Example #17
def test_rf_regression_with_identical_labels(split_criterion):
    X = np.array([[-1, 0], [0, 1], [2, 0], [0, 3], [-2, 0]], dtype=np.float32)
    y = np.array([1, 1, 1, 1, 1], dtype=np.float32)
    # Degenerate case: all labels are identical.
    # RF Regressor must not create any split. It must yield an empty tree
    # with only the root node.
    clf = curfr(
        max_features=1.0,
        max_samples=1.0,
        n_bins=5,
        bootstrap=False,
        split_criterion=split_criterion,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=0,
        n_streams=1,
        n_estimators=1,
        max_depth=1,
    )
    clf.fit(X, y)
    model_dump = json.loads(clf.get_json())
    assert len(model_dump) == 1
    expected_dump = {"nodeid": 0, "leaf_value": [1.0], "instance_count": 5}
    assert model_dump[0] == expected_dump
Example #18
def test_rf_get_json(estimator_type, max_depth, n_estimators):
    X, y = make_classification(n_samples=350, n_features=20,
                               n_clusters_per_class=1, n_informative=10,
                               random_state=123, n_classes=2)
    X = X.astype(np.float32)
    if estimator_type == 'classification':
        cuml_model = curfc(max_features=1.0, max_samples=1.0,
                           n_bins=16, split_algo=0, split_criterion=0,
                           min_samples_leaf=2, seed=23707, n_streams=1,
                           n_estimators=n_estimators, max_leaves=-1,
                           max_depth=max_depth)
        y = y.astype(np.int32)
    elif estimator_type == 'regression':
        cuml_model = curfr(max_features=1.0, max_samples=1.0,
                           n_bins=16, split_algo=0,
                           min_samples_leaf=2, seed=23707, n_streams=1,
                           n_estimators=n_estimators, max_leaves=-1,
                           max_depth=max_depth)
        y = y.astype(np.float32)
    else:
        assert False

    # Train model on the data
    cuml_model.fit(X, y)

    json_out = cuml_model.get_json()
    json_obj = json.loads(json_out)

    # Test 1: Output is non-empty
    assert '' != json_out

    # Test 2: JSON object contains correct number of trees
    assert isinstance(json_obj, list)
    assert len(json_obj) == n_estimators

    # Test 3: Traverse JSON trees and get the same predictions as cuML RF
    def predict_with_json_tree(tree, x):
        if 'children' not in tree:
            assert 'leaf_value' in tree
            return tree['leaf_value']
        assert 'split_feature' in tree
        assert 'split_threshold' in tree
        assert 'yes' in tree
        assert 'no' in tree
        if x[tree['split_feature']] <= tree['split_threshold']:
            return predict_with_json_tree(tree['children'][0], x)
        return predict_with_json_tree(tree['children'][1], x)

    def predict_with_json_rf_classifier(rf, x):
        # Returns the class with the highest vote. If there is a tie, return
        # the list of all classes with the highest vote.
        vote = []
        for tree in rf:
            vote.append(predict_with_json_tree(tree, x))
        vote = np.bincount(vote)
        max_vote = np.max(vote)
        majority_vote = np.nonzero(np.equal(vote, max_vote))[0]
        return majority_vote

    def predict_with_json_rf_regressor(rf, x):
        pred = 0.
        for tree in rf:
            pred += predict_with_json_tree(tree, x)
        return pred / len(rf)

    if estimator_type == 'classification':
        expected_pred = cuml_model.predict(X).astype(np.int32)
        for idx, row in enumerate(X):
            majority_vote = predict_with_json_rf_classifier(json_obj, row)
            assert expected_pred[idx] in majority_vote
    elif estimator_type == 'regression':
        expected_pred = cuml_model.predict(X).astype(np.float32)
        pred = []
        for idx, row in enumerate(X):
            pred.append(predict_with_json_rf_regressor(json_obj, row))
        pred = np.array(pred, dtype=np.float32)
        np.testing.assert_almost_equal(pred, expected_pred, decimal=6)
Example #19
def test_rf_regression_sparse(datatype, split_algo, mode, column_info,
                              max_features, rows_sample, fil_sparse_format,
                              algo):

    ncols, n_info = column_info
    use_handle = True
    num_trees = 50

    if mode == 'unit':
        X, y = make_regression(n_samples=500,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=2,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=num_trees,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=40,
                       accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    # predict using FIL
    if ((not fil_sparse_format or algo == 'tree_reorg'
         or algo == 'batch_tree_reorg')
            or fil_sparse_format == 'not_supported'):
        with pytest.raises(ValueError):
            fil_preds = cuml_model.predict(X_test,
                                           predict_model="GPU",
                                           fil_sparse_format=fil_sparse_format,
                                           algo=algo)
    else:
        fil_preds = cuml_model.predict(X_test,
                                       predict_model="GPU",
                                       fil_sparse_format=fil_sparse_format,
                                       algo=algo)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        fil_model = cuml_model.convert_to_fil_model()

        input_type = 'numpy'
        fil_model_preds = fil_model.predict(X_test, output_type=input_type)
        fil_model_preds = np.reshape(fil_model_preds, np.shape(cu_preds))
        fil_model_r2 = r2_score(y_test,
                                fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert num_trees == tl_model.num_trees
        assert ncols == tl_model.num_features
        del tl_model

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if mode != "stress":
            sk_model = skrfr(n_estimators=50,
                             max_depth=40,
                             min_samples_split=2,
                             max_features=max_features,
                             random_state=10)
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
        assert fil_r2 >= (cu_r2 - 0.02)
Example #20
def test_rf_get_json(estimator_type, max_depth, n_estimators):
    X, y = make_classification(
        n_samples=350,
        n_features=20,
        n_clusters_per_class=1,
        n_informative=10,
        random_state=123,
        n_classes=2,
    )
    X = X.astype(np.float32)
    if estimator_type == "classification":
        cuml_model = curfc(
            max_features=1.0,
            max_samples=1.0,
            n_bins=16,
            split_criterion=0,
            min_samples_leaf=2,
            random_state=23707,
            n_streams=1,
            n_estimators=n_estimators,
            max_leaves=-1,
            max_depth=max_depth,
        )
        y = y.astype(np.int32)
    elif estimator_type == "regression":
        cuml_model = curfr(
            max_features=1.0,
            max_samples=1.0,
            n_bins=16,
            min_samples_leaf=2,
            random_state=23707,
            n_streams=1,
            n_estimators=n_estimators,
            max_leaves=-1,
            max_depth=max_depth,
        )
        y = y.astype(np.float32)
    else:
        assert False

    # Train model on the data
    cuml_model.fit(X, y)

    json_out = cuml_model.get_json()
    json_obj = json.loads(json_out)

    # Test 1: Output is non-empty
    assert "" != json_out

    # Test 2: JSON object contains correct number of trees
    assert isinstance(json_obj, list)
    assert len(json_obj) == n_estimators

    # Test 3: Traverse JSON trees and get the same predictions as cuML RF
    def predict_with_json_tree(tree, x):
        if "children" not in tree:
            assert "leaf_value" in tree
            return tree["leaf_value"]
        assert "split_feature" in tree
        assert "split_threshold" in tree
        assert "yes" in tree
        assert "no" in tree
        if x[tree["split_feature"]] <= tree["split_threshold"] + 1e-5:
            return predict_with_json_tree(tree["children"][0], x)
        return predict_with_json_tree(tree["children"][1], x)

    def predict_with_json_rf_classifier(rf, x):
        # Sum the per-tree leaf outputs (class scores) and return the
        # class with the highest aggregated score
        predictions = []
        for tree in rf:
            predictions.append(np.array(predict_with_json_tree(tree, x)))
        predictions = np.sum(predictions, axis=0)
        return np.argmax(predictions)

    def predict_with_json_rf_regressor(rf, x):
        pred = 0.0
        for tree in rf:
            pred += predict_with_json_tree(tree, x)[0]
        return pred / len(rf)

    if estimator_type == "classification":
        expected_pred = cuml_model.predict(X).astype(np.int32)
        for idx, row in enumerate(X):
            majority_vote = predict_with_json_rf_classifier(json_obj, row)
            assert expected_pred[idx] == majority_vote
    elif estimator_type == "regression":
        expected_pred = cuml_model.predict(X).astype(np.float32)
        pred = []
        for idx, row in enumerate(X):
            pred.append(predict_with_json_rf_regressor(json_obj, row))
        pred = np.array(pred, dtype=np.float32)
        print(json_obj)
        for i in range(len(pred)):
            assert np.isclose(pred[i], expected_pred[i]), X[i, 19]
        np.testing.assert_almost_equal(pred, expected_pred, decimal=6)
Example #21
def test_rf_regression(datatype, split_algo, mode, column_info, max_features,
                       rows_sample):

    ncols, n_info = column_info
    use_handle = True

    if mode == 'unit':
        X, y = make_regression(n_samples=500,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=2,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=50,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=16,
                       accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    # predict using FIL
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)
    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if mode != "stress":
        sk_model = skrfr(n_estimators=50,
                         max_depth=16,
                         min_samples_split=2,
                         max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_predict, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
    assert fil_r2 >= (cu_r2 - 0.02)