Ejemplo n.º 1
0
def test_rf_regression_float64(large_reg, datatype):
    """Compare cuML random forest regression with sklearn on mixed dtypes.

    ``large_reg`` is unpacked into ``(X, y)``.  ``datatype`` is indexed as a
    pair: ``datatype[0]`` is applied to the training split and is also the
    ``convert_dtype`` handed to ``r2_score``; ``datatype[1]`` is applied to
    the test split.
    """
    data, target = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    train_dtype, test_dtype = datatype[0], datatype[1]
    X_train, y_train = X_train.astype(train_dtype), y_train.astype(train_dtype)
    X_test, y_test = X_test.astype(test_dtype), y_test.astype(test_dtype)

    # cuML random forest regressor with default parameters; the CPU
    # predictions are the baseline for the checks below.
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=train_dtype)

    # The sklearn reference model is only trained for inputs with fewer
    # than 500000 rows.
    if data.shape[0] < 500000:
        reference = skrfr(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(
            y_test, reference.predict(X_test), convert_dtype=train_dtype
        )
        assert cpu_r2 >= (reference_r2 - 0.09)

    # GPU prediction, reshaped to the CPU output's shape before scoring.
    gpu_preds = cuml_model.predict(
        X_test, predict_model="GPU", convert_dtype=True
    )
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=train_dtype)
    assert gpu_r2 >= (cpu_r2 - 0.02)
Ejemplo n.º 2
0
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """Exercise GPU prediction of a cuML RF regressor across storage/algo combos.

    ``special_reg`` is unpacked into ``(X, y)`` and both are cast to
    ``datatype``.  When ``fil_sparse_format`` is falsy or equals
    ``'not_supported'``, or ``algo`` is ``'tree_reorg'`` or
    ``'batch_tree_reorg'``, the GPU predict call is expected to raise
    ``ValueError``.  Otherwise its R2 score is compared with the converted
    FIL model, the treelite model metadata is checked and, for inputs with
    fewer than 1000 rows, the score is compared with sklearn.
    """
    n_trees = 50

    X, y = special_reg
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Handle for the cuML model (the returned stream is not needed here).
    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfr(
        n_bins=16,
        split_criterion=2,
        min_rows_per_node=2,
        random_state=123,
        n_streams=1,
        n_estimators=n_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)

    gpu_kwargs = dict(
        predict_model="GPU",
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )

    expect_error = (
        not fil_sparse_format
        or algo in ('tree_reorg', 'batch_tree_reorg')
        or fil_sparse_format == 'not_supported'
    )
    if expect_error:
        # These combinations must be rejected by predict().
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **gpu_kwargs)
        return

    target_shape = np.shape(y_test)

    gpu_preds = cuml_model.predict(X_test, **gpu_kwargs)
    gpu_preds = np.reshape(gpu_preds, target_shape)
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # The standalone FIL model must score exactly like the GPU predict path.
    fil_model = cuml_model.convert_to_fil_model()
    standalone_preds = fil_model.predict(X_test, output_type='numpy')
    standalone_preds = np.reshape(standalone_preds, target_shape)
    standalone_r2 = r2_score(y_test, standalone_preds, convert_dtype=datatype)
    assert gpu_r2 == standalone_r2

    # The treelite conversion must keep the tree and feature counts.
    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # sklearn reference, skipped for large inputs (mode != "stress").
    if X.shape[0] < 1000:
        reference = skrfr(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_preds = reference.predict(X_test)
        reference_r2 = r2_score(y_test, reference_preds, convert_dtype=datatype)
        assert gpu_r2 >= (reference_r2 - 0.07)
Ejemplo n.º 3
0
def test_rf_regression(
    special_reg, datatype, max_features, max_samples, n_bins
):
    """Check GPU vs CPU vs sklearn R2 for a cuML random forest regressor.

    ``special_reg`` is unpacked into ``(X, y)`` and both are cast to
    ``datatype``.  ``max_features``, ``max_samples`` and ``n_bins`` are
    forwarded to the cuML model; ``max_features`` is also forwarded to the
    sklearn reference model.
    """
    X, y = special_reg
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Handle for the cuML model (the returned stream is not needed here).
    handle, _ = get_handle(True, n_streams=1)

    cuml_params = dict(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model = curfr(**cuml_params)
    cuml_model.fit(X_train, y_train)

    # GPU prediction first, then CPU; the GPU output is reshaped to match
    # the CPU output before scoring.
    gpu_preds = cuml_model.predict(X_test, predict_model="GPU")
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))

    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=datatype)
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # sklearn reference, skipped for large inputs (mode != "stress").
    if X.shape[0] < 1000:
        reference = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(
            y_test, reference.predict(X_test), convert_dtype=datatype
        )
        assert gpu_r2 >= (reference_r2 - 0.07)

    assert gpu_r2 >= (cpu_r2 - 0.02)
Ejemplo n.º 4
0
def test_rf_regression_float64(large_reg, datatype):
    """Check cuML RF regression accuracy and its GPU-predict dtype handling.

    ``large_reg`` is unpacked into ``(X, y)``.  ``datatype`` is indexed as a
    pair: ``datatype[0]`` is the dtype of the training split (and the
    ``convert_dtype`` handed to ``r2_score``); ``datatype[1]`` is the dtype
    of the test split.

    * Training dtype ``np.float32``: GPU predictions must score within 0.02
      R2 of the CPU predictions.
    * Otherwise, with ``np.float64`` test data: a GPU predict call must emit
      the warning announcing the fallback to CPU-based prediction.
    """
    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest regression model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32:
        fil_preds = cuml_model.predict(
            X_test, predict_model="GPU", convert_dtype=True
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)

    # Reached when the model was not trained on float32 and the test data
    # is float64: GPU predict is expected to warn about the CPU fallback.
    elif datatype[1] == np.float64:
        expected_msg = ("GPU based predict only accepts "
                        "np.float32 data. The model was "
                        "trained on np.float64 data hence "
                        "cannot use GPU-based prediction! "
                        "\nDefaulting to CPU-based Prediction. "
                        "\nTo predict on float-64 data, set "
                        "parameter predict_model = 'CPU'")
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            cuml_model.predict(X_test, predict_model="GPU")

        # Search every captured warning rather than only the last one, so
        # an unrelated later warning cannot mask the expected one and an
        # empty capture fails as an assertion instead of an IndexError.
        messages = [str(item.message) for item in caught]
        assert any(expected_msg in text for text in messages), messages
Ejemplo n.º 5
0
def test_rf_regression(datatype, split_algo, rows_sample,
                       n_info, mode, ncols, max_features):
    """Check GPU vs CPU vs sklearn R2 for a cuML random forest regressor.

    ``mode`` selects the data: ``'quality'`` loads the California housing
    set; ``'unit'`` generates 100 samples and any other value generates
    100000 samples with ``ncols`` features and ``n_info`` informative ones.
    The first 80% of the rows are used for training, the rest for testing,
    all cast to ``datatype``.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 100 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    # Ordered (unshuffled) 80/20 split.
    split_at = np.int32(X.shape[0]*0.8)
    X_train = np.asarray(X[0:split_at, :]).astype(datatype)
    y_train = np.asarray(y[0:split_at, ]).astype(datatype)
    X_test = np.asarray(X[split_at:, :]).astype(datatype)
    y_test = np.asarray(y[split_at:, ]).astype(datatype)

    # Handle for the cuML model (the returned stream is not needed here).
    handle, _ = get_handle(True, n_streams=8)

    cuml_model = curfr(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)

    # GPU prediction first, then CPU.
    gpu_preds = cuml_model.predict(X_test, predict_model="GPU")
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds)
    gpu_r2 = r2_score(y_test, gpu_preds)

    # sklearn reference model, trained in every mode.
    reference = skrfr(
        n_estimators=50,
        max_depth=16,
        min_samples_split=2,
        max_features=max_features,
        random_state=10,
    )
    reference.fit(X_train, y_train)
    reference_r2 = r2_score(y_test, reference.predict(X_test))

    print(gpu_r2, cpu_r2, reference_r2)
    assert gpu_r2 >= (cpu_r2 - 0.02)
    assert gpu_r2 >= (reference_r2 - 0.07)
Ejemplo n.º 6
0
def test_rf_regression_float64(datatype, column_info, nrows, convert_dtype):
    """Check cuML RF regression with mixed dtypes and ``convert_dtype``.

    ``column_info`` is unpacked into ``(ncols, n_info)`` and used with
    ``nrows`` to generate the regression data.  ``datatype[0]`` is the
    training dtype, ``datatype[1]`` the test dtype.  GPU prediction is
    expected to work only when the training dtype is ``np.float32`` and
    ``convert_dtype`` is truthy; otherwise it must raise ``TypeError``.
    """
    ncols, n_info = column_info
    X, y = make_regression(
        n_samples=nrows,
        n_features=ncols,
        n_informative=n_info,
        random_state=123,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    train_dtype, test_dtype = datatype[0], datatype[1]
    X_train, y_train = X_train.astype(train_dtype), y_train.astype(train_dtype)
    X_test, y_test = X_test.astype(test_dtype), y_test.astype(test_dtype)

    # cuML random forest regressor with default parameters; the CPU
    # predictions are the baseline for the checks below.
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=train_dtype)

    # The sklearn reference model is only trained below 500000 rows.
    if nrows < 500000:
        reference = skrfr(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(
            y_test, reference.predict(X_test), convert_dtype=train_dtype
        )
        assert cpu_r2 >= (reference_r2 - 0.09)

    gpu_kwargs = dict(predict_model="GPU", convert_dtype=convert_dtype)

    gpu_predict_allowed = datatype[0] == np.float32 and convert_dtype
    if not gpu_predict_allowed:
        # Any other combination must be rejected by GPU predict.
        with pytest.raises(TypeError):
            cuml_model.predict(X_test, **gpu_kwargs)
        return

    gpu_preds = cuml_model.predict(X_test, **gpu_kwargs)
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=train_dtype)
    assert gpu_r2 >= (cpu_r2 - 0.02)
Ejemplo n.º 7
0
def test_rf_regression(datatype, use_handle, split_algo,
                       n_info, mode, ncols,
                       rows_sample):
    """Compare the cuML RF regressor's ``score`` with sklearn's MSE.

    ``mode`` selects the data: ``'quality'`` loads the California housing
    set; ``'unit'`` generates 30 samples and any other value generates
    100000 samples with ``ncols`` features and ``n_info`` informative ones.
    The sklearn comparison is skipped when ``mode`` is ``'stress'``.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 30 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    # Ordered (unshuffled) 80/20 split.
    split_at = np.int32(X.shape[0]*0.8)
    X_train = np.asarray(X[0:split_at, :]).astype(datatype)
    y_train = np.asarray(y[0:split_at, ]).astype(datatype)
    X_test = np.asarray(X[split_at:, :]).astype(datatype)
    y_test = np.asarray(y[split_at:, ]).astype(datatype)

    # Handle for the cuML model (the returned stream is not needed here).
    handle, _ = get_handle(use_handle)

    # cuML random forest regression model: fit, then score the test split.
    cuml_model = curfr(
        max_features=1.0,
        rows_sample=rows_sample,
        n_bins=8,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=25,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)
    cuml_mse = cuml_model.score(X_test, y_test)

    if mode == 'stress':
        return

    # sklearn random forest regression reference model.
    reference = skrfr(
        n_estimators=50,
        max_depth=50,
        min_samples_split=2,
        max_features=1.0,
        random_state=10,
    )
    reference.fit(X_train, y_train)
    reference_mse = mean_squared_error(y_test, reference.predict(X_test))

    # cuML's error may exceed sklearn's by at most 0.07.
    assert cuml_mse <= (reference_mse + 0.07)
Ejemplo n.º 8
0
def test_rf_regression_default(datatype, column_info, nrows):
    """Check a default-parameter cuML RF regressor against sklearn.

    ``column_info`` is unpacked into ``(ncols, n_info)`` and used with
    ``nrows`` to generate regression data, which is cast to ``datatype``
    before an 80/20 split.  The test also checks that ``score`` with GPU
    prediction matches ``mean_squared_error`` on the GPU predictions.
    """
    ncols, n_info = column_info
    X, y = make_regression(
        n_samples=nrows,
        n_features=ncols,
        n_informative=n_info,
        random_state=123,
    )
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # cuML random forest regression model with default parameters.
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)

    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=datatype)

    # GPU prediction, reshaped to the CPU output's shape before scoring.
    gpu_preds = cuml_model.predict(X_test, predict_model="GPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # score() with GPU prediction should equal the MSE of the GPU output.
    model_mse = cuml_model.score(X_test, y_test, predict_model="GPU")
    expected_mse = mean_squared_error(y_test, gpu_preds)
    assert expected_mse == pytest.approx(model_mse)

    # The sklearn reference model is only trained below 500000 rows.
    if nrows < 500000:
        reference = skrfr(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(
            y_test, reference.predict(X_test), convert_dtype=datatype
        )
        # XXX Accuracy gap exists with default parameters, requires
        # further investigation for next release
        assert gpu_r2 >= (reference_r2 - 0.08)

    assert gpu_r2 >= (cpu_r2 - 0.02)
Ejemplo n.º 9
0
def test_rf_regression(datatype, split_algo, mode, column_info, max_features,
                       rows_sample):
    """Check GPU vs CPU vs sklearn R2 for a cuML random forest regressor.

    ``column_info`` is unpacked into ``(ncols, n_info)``.  ``mode`` selects
    the data: ``'quality'`` loads the California housing set; ``'unit'``
    generates 500 samples and any other value generates 100000 samples.
    The sklearn comparison is skipped when ``mode`` is ``"stress"``.
    """
    ncols, n_info = column_info

    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 500 if mode == 'unit' else 100000
        X, y = make_regression(
            n_samples=n_samples,
            n_features=ncols,
            n_informative=n_info,
            random_state=123,
        )
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Handle for the cuML model (the returned stream is not needed here).
    handle, _ = get_handle(True, n_streams=1)

    cuml_params = dict(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric='mse',
    )
    cuml_model = curfr(**cuml_params)
    cuml_model.fit(X_train, y_train)

    # GPU prediction first, then CPU.
    gpu_preds = cuml_model.predict(X_test, predict_model="GPU")
    cpu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=datatype)
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # sklearn random forest regression reference model.
    if mode != "stress":
        reference = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_r2 = r2_score(
            y_test, reference.predict(X_test), convert_dtype=datatype
        )
        assert gpu_r2 >= (reference_r2 - 0.07)

    assert gpu_r2 >= (cpu_r2 - 0.02)
Ejemplo n.º 10
0
def test_rf_regression_sparse(datatype, split_algo, mode, column_info,
                              max_features, rows_sample, fil_sparse_format,
                              algo):
    """Exercise GPU prediction of a cuML RF regressor across storage/algo combos.

    ``column_info`` is unpacked into ``(ncols, n_info)``.  ``mode`` selects
    the data: ``'quality'`` loads the California housing set; ``'unit'``
    generates 500 samples and any other value generates 100000 samples.

    When ``fil_sparse_format`` is falsy or equals ``'not_supported'``, or
    ``algo`` is ``'tree_reorg'`` or ``'batch_tree_reorg'``, the GPU predict
    call is expected to raise ``ValueError``.  Otherwise the GPU score is
    compared with the converted FIL model, the treelite model metadata, the
    CPU predictions and (unless ``mode`` is ``"stress"``) sklearn.
    """
    ncols, n_info = column_info
    use_handle = True
    n_trees = 50

    if mode == 'unit':
        X, y = make_regression(n_samples=500,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)

    elif mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)

    else:
        X, y = make_regression(n_samples=100000,
                               n_features=ncols,
                               n_informative=n_info,
                               random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.8,
                                                        random_state=0)

    # Create a handle for the cuml model (the stream is not needed here)
    handle, _ = get_handle(use_handle, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(max_features=max_features,
                       rows_sample=rows_sample,
                       n_bins=16,
                       split_algo=split_algo,
                       split_criterion=2,
                       min_rows_per_node=2,
                       seed=123,
                       n_streams=1,
                       n_estimators=n_trees,
                       handle=handle,
                       max_leaves=-1,
                       max_depth=40,
                       accuracy_metric='mse')
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    # predict using FIL
    if ((not fil_sparse_format or algo == 'tree_reorg'
         or algo == 'batch_tree_reorg')
            or fil_sparse_format == 'not_supported'):
        # These combinations must be rejected by predict()
        with pytest.raises(ValueError):
            cuml_model.predict(X_test,
                               predict_model="GPU",
                               fil_sparse_format=fil_sparse_format,
                               algo=algo)
    else:
        fil_preds = cuml_model.predict(X_test,
                                       predict_model="GPU",
                                       fil_sparse_format=fil_sparse_format,
                                       algo=algo)
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

        fil_model = cuml_model.convert_to_fil_model()

        input_type = 'numpy'
        fil_model_preds = fil_model.predict(X_test, output_type=input_type)
        fil_model_preds = np.reshape(fil_model_preds, np.shape(cu_preds))
        fil_model_r2 = r2_score(y_test,
                                fil_model_preds,
                                convert_dtype=datatype)
        assert fil_r2 == fil_model_r2

        tl_model = cuml_model.convert_to_treelite_model()
        assert n_trees == tl_model.num_trees
        # Compare against the width of the data actually used: in 'quality'
        # mode X comes from fetch_california_housing and its column count
        # is unrelated to the parametrised ``ncols``.
        assert X.shape[1] == tl_model.num_features
        del tl_model

        # Initialize, fit and predict using
        # sklearn's random forest regression model
        if mode != "stress":
            sk_model = skrfr(n_estimators=n_trees,
                             max_depth=40,
                             min_samples_split=2,
                             max_features=max_features,
                             random_state=10)
            sk_model.fit(X_train, y_train)
            sk_preds = sk_model.predict(X_test)
            sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
            assert fil_r2 >= (sk_r2 - 0.07)
        assert fil_r2 >= (cu_r2 - 0.02)