def test_tweedie_convergence(max_depth, split_criterion):
    """Check that a Tweedie criterion is no worse on its own deviance.

    Targets are sampled from the distribution matching ``split_criterion``
    (Poisson, gamma or inverse Gaussian).  Two single-tree regressors are
    fitted on the same data: one with ``split_criterion`` and one with
    ``split_criterion=2`` (the model this test calls the MSE model).  The
    mean Tweedie deviance of the former must not exceed that of the latter.
    """
    np.random.seed(33)
    n_datapoints = 1000

    # Tweedie power, NumPy sampler and the sampler's positional arguments
    # for every criterion this test supports.
    tweedie = {
        "poisson": {
            "power": 1,
            "gen": np.random.poisson,
            "args": [0.01],
        },
        "gamma": {
            "power": 2,
            "gen": np.random.gamma,
            "args": [2.0],
        },
        "inverse_gaussian": {
            "power": 3,
            "gen": np.random.wald,
            "args": [0.1, 2.0],
        },
    }
    spec = tweedie[split_criterion]
    power = spec["power"]

    # Features are drawn before targets so that the seeded global random
    # stream is always consumed in the same order.
    X = np.random.random((n_datapoints, 4)).astype(np.float32)
    y = spec["gen"](*spec["args"], size=n_datapoints).astype(np.float32)

    # Every hyper-parameter except the split criterion is shared.
    common = dict(
        max_depth=max_depth,
        n_estimators=1,
        bootstrap=None,
        max_features=1.0,
        min_impurity_decrease=1e-5,
    )
    tweedie_model = curfr(split_criterion=split_criterion, **common)
    tweedie_preds = tweedie_model.fit(X, y).predict(X)
    mse_model = curfr(split_criterion=2, **common)
    mse_preds = mse_model.fit(X, y).predict(X)

    # Only rows whose baseline prediction is strictly positive are scored;
    # the same rows are used for both models.
    mask = mse_preds > 0
    y_scored = y[mask]
    mse_tweedie_deviance = mean_tweedie_deviance(
        y_scored, mse_preds[mask], power=power)
    tweedie_tweedie_deviance = mean_tweedie_deviance(
        y_scored, tweedie_preds[mask], power=power)

    # A model trained on Tweedie data with the Tweedie criterion must do
    # at least as well on the Tweedie loss.
    assert mse_tweedie_deviance >= tweedie_tweedie_deviance
def test_rf_host_memory_leak(large_clf, estimator_type):
    """Check that refitting a forest does not keep growing host memory.

    The model is fitted once to establish a resident-set-size baseline,
    then fitted five more times.  The RSS growth relative to the baseline
    must stay below 2e6 bytes.  The test is skipped when ``psutil`` is not
    installed.
    """
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    process = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    # Any estimator type other than 'classification' is treated as a
    # regression run.
    if estimator_type == 'classification':
        estimator_cls, label_dtype = curfc, np.int32
    else:
        estimator_cls, label_dtype = curfr, np.float32
    base_model = estimator_cls(max_depth=10, n_estimators=100, seed=123)
    y = y.astype(label_dtype)

    # The first fit is the baseline: later fits should not use
    # significantly more memory than this one did.
    base_model.fit(X, y)
    gc.collect()
    initial_baseline_mem = process.memory_info().rss

    for _ in range(5):
        base_model.fit(X, y)

    gc.collect()
    final_mem = process.memory_info().rss

    # A few small allocations are tolerated, unbounded growth is not.
    growth = final_mem - initial_baseline_mem
    assert growth < 2e6
def test_degenerate_cases():
    """Cover TreeExplainer on an unfitted model and on constant targets.

    An unfitted regressor must be rejected with ``NotFittedError``.  After
    fitting on targets that are all 1.0, every SHAP value must be zero and
    the explainer's expected value must be 1.0.
    """
    n_samples = 100
    rf_params = dict(
        max_features=1.0,
        max_samples=0.1,
        n_bins=128,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=10,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model = curfr(**rf_params)

    # Explaining a model that has not been fitted yet must fail.
    with pytest.raises(NotFittedError):
        TreeExplainer(model=cuml_model)

    # Constant targets: random features, every label equal to one.
    rng = np.random.default_rng(seed=0)
    X = rng.standard_normal(size=(n_samples, 8), dtype=np.float32)
    y = np.ones(shape=(n_samples,), dtype=np.float32)
    cuml_model.fit(X, y)

    explainer = TreeExplainer(model=cuml_model)
    out = explainer.shap_values(X)

    # The output is 1.0 whatever the input, so no feature contributes
    # anything and the bias (expected_value) carries the whole prediction.
    assert np.all(out == 0)
    assert explainer.expected_value == 1.0
def test_rf_regression_float64(large_reg, datatype):
    """Fit and predict a default regressor with mixed train/test dtypes.

    ``datatype[0]`` is the dtype of the training data and ``datatype[1]``
    the dtype of the test data.  CPU predictions are compared with
    scikit-learn, and GPU predictions (with ``convert_dtype=True``) are
    compared with the CPU ones.
    """
    train_dtype = datatype[0]
    test_dtype = datatype[1]

    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)
    X_train, y_train = X_train.astype(train_dtype), y_train.astype(train_dtype)
    X_test, y_test = X_test.astype(test_dtype), y_test.astype(test_dtype)

    # cuML random forest regressor with default parameters, CPU prediction.
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=train_dtype)

    # The scikit-learn reference is only built for smaller datasets.
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=train_dtype)
        assert cu_r2 >= (sk_r2 - 0.09)

    # GPU prediction, reshaped to the CPU result so the scores compare.
    fil_preds = cuml_model.predict(
        X_test, predict_model="GPU", convert_dtype=True)
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=train_dtype)
    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """Check GPU (FIL) prediction for each storage format / algorithm pair.

    Combinations that are expected to be rejected must raise ``ValueError``
    from ``predict``.  For the accepted ones the GPU prediction must score
    exactly like a converted FIL model, the treelite conversion must report
    the right number of trees and features, and (for small datasets) the
    score must be close to scikit-learn's.
    """
    n_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Handle shared with the cuML model.
    handle, _ = get_handle(True, n_streams=1)

    cuml_model = curfr(
        n_bins=16,
        split_criterion=2,
        min_rows_per_node=2,
        random_state=123,
        n_streams=1,
        n_estimators=n_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)

    gpu_predict_kwargs = dict(
        predict_model="GPU",
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )

    # These parameter combinations must be refused by predict().
    must_be_rejected = (
        not fil_sparse_format
        or algo == 'tree_reorg'
        or algo == 'batch_tree_reorg'
        or fil_sparse_format == 'not_supported'
    )
    if must_be_rejected:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **gpu_predict_kwargs)
        return

    target_shape = np.shape(y_test)

    fil_preds = cuml_model.predict(X_test, **gpu_predict_kwargs)
    fil_preds = np.reshape(fil_preds, target_shape)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Predicting through an explicitly converted FIL model must give the
    # very same score.
    fil_model = cuml_model.convert_to_fil_model()
    fil_model_preds = fil_model.predict(X_test, output_type='numpy')
    fil_model_preds = np.reshape(fil_model_preds, target_shape)
    fil_model_r2 = r2_score(y_test, fil_model_preds, convert_dtype=datatype)
    assert fil_r2 == fil_model_r2

    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # scikit-learn reference, only for small datasets
    # (the original note here reads: mode != "stress").
    if X.shape[0] < 1000:
        sk_model = skrfr(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
def test_concat_memory_leak(large_clf, estimator_type):
    """Check that concatenating treelite handles does not leak host memory.

    Ten forests are fitted, the handles of nine of them are concatenated
    into the first one once to set a resident-set-size baseline, and the
    concatenation is then repeated ten times.  The final RSS growth must
    stay below 1e6 bytes.  The test is skipped when ``psutil`` is not
    installed.
    """
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    process = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    # Pick the estimator and label dtype for the requested task.
    n_models = 10
    if estimator_type == 'classification':
        estimator_cls, label_dtype = curfc, np.int32
    elif estimator_type == 'regression':
        estimator_cls, label_dtype = curfr, np.float32
    else:
        assert False
    base_models = [
        estimator_cls(max_depth=10, n_estimators=100, random_state=123)
        for _ in range(n_models)
    ]
    y = y.astype(label_dtype)

    for model in base_models:
        model.fit(X, y)

    # The first model receives the handles of all the others.
    init_model, concat_models = base_models[0], base_models[1:]
    other_handles = [
        model._obtain_treelite_handle() for model in concat_models
    ]

    # One concatenation up front is the baseline: repeating it should not
    # use significantly more memory.
    init_model._concatenate_treelite_handle(other_handles)
    gc.collect()
    initial_baseline_mem = process.memory_info().rss

    for i in range(10):
        init_model._concatenate_treelite_handle(other_handles)
        gc.collect()
        used_mem = process.memory_info().rss
        logger.debug("memory at rep %2d: %d m" %
                     (i, (used_mem - initial_baseline_mem) / 1e6))

    gc.collect()
    used_mem = process.memory_info().rss
    logger.info("Final memory delta: %d" %
                ((used_mem - initial_baseline_mem) / 1e6))
    assert (used_mem - initial_baseline_mem) < 1e6
def test_rf_regression(
    special_reg, datatype, max_features, max_samples, n_bins
):
    """Compare GPU (FIL), CPU and scikit-learn regression scores.

    The GPU R2 score must be within 0.02 of the CPU score and, for small
    datasets, within 0.07 of the scikit-learn score.
    """
    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Handle shared with the cuML model.
    handle, _ = get_handle(True, n_streams=1)

    rf_params = dict(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model = curfr(**rf_params)
    cuml_model.fit(X_train, y_train)

    # GPU (FIL) prediction first, then CPU; the GPU result is reshaped to
    # the CPU result before scoring.
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # scikit-learn reference, only for small datasets
    # (the original note here reads: mode != "stress").
    if X.shape[0] < 1000:
        sk_model = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)

    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression_float64(large_reg, datatype):
    """Fit and predict a default regressor with mixed train/test dtypes.

    ``datatype[0]`` is the dtype of the training data and ``datatype[1]``
    the dtype of the test data.  CPU predictions are compared with
    scikit-learn.  When the model was trained on float32 data, GPU
    predictions must score within 0.02 of the CPU ones.  Otherwise, for
    float64 test data, a GPU prediction request must emit the warning
    announcing the fall back to CPU prediction.
    """
    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # cuML random forest regressor with default parameters, CPU prediction.
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # The scikit-learn reference is only built for smaller datasets.
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    if datatype[0] == np.float32:
        # GPU prediction is available for a model trained on float32.
        fil_preds = cuml_model.predict(
            X_test, predict_model="GPU", convert_dtype=True
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)
    elif datatype[1] == np.float64:
        # GPU prediction cannot be used here; the model must warn and
        # revert to CPU prediction.
        expected_warning = (
            "GPU based predict only accepts "
            "np.float32 data. The model was "
            "trained on np.float64 data hence "
            "cannot use GPU-based prediction! "
            "\nDefaulting to CPU-based Prediction. "
            "\nTo predict on float-64 data, set "
            "parameter predict_model = 'CPU'"
        )
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            cuml_model.predict(X_test, predict_model="GPU")

        # Search every recorded warning: indexing the last one would raise
        # IndexError when nothing was emitted and would miss the expected
        # warning whenever an unrelated one is emitted after it.
        messages = [str(item.message) for item in caught]
        assert any(expected_warning in text for text in messages), (
            "expected the GPU-to-CPU fallback warning, got: %r" % (messages,)
        )
def test_rf_regressor_gtil_integration(tmpdir):
    """Round-trip a regressor through a treelite checkpoint and GTIL.

    The forest is written to ``checkpoint.tl`` inside ``tmpdir``, loaded
    back with treelite, and the treelite GTIL predictions must match the
    cuML predictions to five decimals.
    """
    # NOTE(review): load_boston was removed from scikit-learn in 1.2, so
    # this test presumably needs an older scikit-learn - confirm.
    features, targets = load_boston(return_X_y=True)
    X = features.astype(np.float32)
    y = targets.astype(np.float32)

    model = curfr(max_depth=3, random_state=0, n_estimators=10)
    model.fit(X, y)
    expected_pred = model.predict(X)

    # Serialise the forest and read it back through treelite.
    checkpoint_path = os.path.join(tmpdir, 'checkpoint.tl')
    treelite_export = model.convert_to_treelite_model()
    treelite_export.to_treelite_checkpoint(checkpoint_path)
    tl_model = treelite.Model.deserialize(checkpoint_path)

    out_pred = treelite.gtil.predict(tl_model, X)
    np.testing.assert_almost_equal(out_pred, expected_pred, decimal=5)
def test_rf_regression(datatype, split_algo, rows_sample, n_info, mode,
                       ncols, max_features):
    """Compare GPU (FIL), CPU and scikit-learn regression scores.

    ``mode`` selects the dataset: 'unit' uses 100 synthetic samples,
    'quality' uses the California housing data and anything else uses
    100000 synthetic samples.  The GPU R2 score must be within 0.02 of the
    CPU score and within 0.07 of the scikit-learn score.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 100 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)

    # First 80% of the rows are used for training, the rest for testing.
    train_rows = np.int32(X.shape[0]*0.8)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows]).astype(datatype)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:]).astype(datatype)

    # Handle shared with the cuML model.
    handle, _ = get_handle(True, n_streams=8)

    rf_params = dict(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric='mse',
    )
    cuml_model = curfr(**rf_params)
    cuml_model.fit(X_train, y_train)

    # GPU (FIL) prediction first, then CPU.
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds)
    fil_r2 = r2_score(y_test, fil_preds)

    # scikit-learn reference model.
    sk_model = skrfr(n_estimators=50, max_depth=16, min_samples_split=2,
                     max_features=max_features, random_state=10)
    sk_model.fit(X_train, y_train)
    sk_predict = sk_model.predict(X_test)
    sk_r2 = r2_score(y_test, sk_predict)

    print(fil_r2, cu_r2, sk_r2)
    assert fil_r2 >= (cu_r2 - 0.02)
    assert fil_r2 >= (sk_r2 - 0.07)
def test_rf_regression_with_identical_labels(split_criterion,
                                             use_experimental_backend):
    """Check that identical labels produce a single-node tree.

    With every label equal to 1 the regressor must not split: the JSON
    dump must hold exactly one tree made of a root leaf with value 1.0.
    """
    X = np.array([[-1, 0], [0, 1], [2, 0], [0, 3], [-2, 0]],
                 dtype=np.float32)
    y = np.array([1, 1, 1, 1, 1], dtype=np.float32)

    rf_params = dict(
        max_features=1.0,
        rows_sample=1.0,
        n_bins=5,
        split_algo=1,
        bootstrap=False,
        split_criterion=split_criterion,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=0,
        n_streams=1,
        n_estimators=1,
        max_depth=1,
        use_experimental_backend=use_experimental_backend,
    )
    model = curfr(**rf_params)
    model.fit(X, y)

    # Degenerate case: no split may be created, only the root remains.
    model_dump = json.loads(model.get_json())
    assert len(model_dump) == 1
    root = model_dump[0]
    assert root == {'nodeid': 0, 'leaf_value': 1.0}
def test_rf_regression_float64(datatype, column_info, nrows, convert_dtype):
    """Fit and predict a default regressor with mixed train/test dtypes.

    ``datatype[0]`` is the dtype of the training data and ``datatype[1]``
    the dtype of the test data.  GPU prediction is expected to work only
    for a model trained on float32 data with ``convert_dtype`` enabled;
    every other combination must raise ``TypeError``.
    """
    ncols, n_info = column_info
    train_dtype = datatype[0]
    test_dtype = datatype[1]

    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=123)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)
    X_train, y_train = X_train.astype(train_dtype), y_train.astype(train_dtype)
    X_test, y_test = X_test.astype(test_dtype), y_test.astype(test_dtype)

    # cuML random forest regressor with default parameters, CPU prediction.
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=train_dtype)

    # The scikit-learn reference is only built for smaller datasets.
    if nrows < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=train_dtype)
        assert cu_r2 >= (sk_r2 - 0.09)

    gpu_predict_kwargs = dict(predict_model="GPU",
                              convert_dtype=convert_dtype)
    gpu_predict_supported = train_dtype == np.float32 and convert_dtype
    if not gpu_predict_supported:
        with pytest.raises(TypeError):
            cuml_model.predict(X_test, **gpu_predict_kwargs)
        return

    # GPU prediction, reshaped to the CPU result so the scores compare.
    fil_preds = cuml_model.predict(X_test, **gpu_predict_kwargs)
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=train_dtype)
    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression(datatype, use_handle, split_algo, n_info, mode,
                       ncols, rows_sample):
    """Compare the cuML regressor's score with scikit-learn's MSE.

    ``mode`` selects the dataset: 'unit' uses 30 synthetic samples,
    'quality' uses the California housing data and anything else uses
    100000 synthetic samples.  Outside 'stress' mode the cuML score must
    not exceed the scikit-learn mean squared error by more than 0.07.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 30 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)

    # First 80% of the rows are used for training, the rest for testing.
    train_rows = np.int32(X.shape[0]*0.8)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows]).astype(datatype)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:]).astype(datatype)

    # Handle shared with the cuML model.
    handle, _ = get_handle(use_handle)

    rf_params = dict(
        max_features=1.0,
        rows_sample=rows_sample,
        n_bins=8,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=25,
        accuracy_metric='mse',
    )
    cuml_model = curfr(**rf_params)
    cuml_model.fit(X_train, y_train)
    cu_mse = cuml_model.score(X_test, y_test)

    if mode == 'stress':
        # No scikit-learn reference is built for the stress dataset.
        return

    # scikit-learn reference model and its mean squared error.
    sk_model = skrfr(n_estimators=50, max_depth=50, min_samples_split=2,
                     max_features=1.0, random_state=10)
    sk_model.fit(X_train, y_train)
    sk_predict = sk_model.predict(X_test)
    sk_mse = mean_squared_error(y_test, sk_predict)

    # Compare the accuracy of the two models.
    assert cu_mse <= (sk_mse + 0.07)
def test_rf_regression_default(datatype, column_info, nrows):
    """Check a default-parameter regressor on CPU, GPU and via ``score``.

    The GPU R2 score must be within 0.02 of the CPU score and, for smaller
    datasets, within 0.08 of scikit-learn's.  ``score`` with GPU prediction
    must also agree with the mean squared error of the GPU predictions.
    """
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=123)
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # cuML random forest regressor with default parameters.
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)

    # CPU prediction.
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)

    # GPU (FIL) prediction, reshaped to the CPU result before scoring.
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # score() should be equivalent to the MSE of the GPU predictions.
    score_mse = cuml_model.score(X_test, y_test, predict_model="GPU")
    sk_mse = mean_squared_error(y_test, fil_preds)
    assert sk_mse == pytest.approx(score_mse)

    # The scikit-learn reference is only built for smaller datasets.
    if nrows < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        # XXX Accuracy gap exists with default parameters, requires
        # further investigation for next release
        assert fil_r2 >= (sk_r2 - 0.08)

    assert fil_r2 >= (cu_r2 - 0.02)
def test_cuml_rf_regressor(input_type):
    """Check that SHAP values plus the bias reproduce the predictions.

    The data is passed as NumPy, CuPy ('cupy') or cuDF ('cudf') depending
    on ``input_type``; results are brought back to the host before the
    comparison.
    """
    n_samples = 100
    X, y = make_regression(n_samples=n_samples, n_features=8,
                           n_informative=8, n_targets=1, random_state=2021)
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    # Move the data into the requested container.
    if input_type == 'cupy':
        X, y = cp.array(X), cp.array(y)
    elif input_type == 'cudf':
        X, y = cudf.DataFrame(X), cudf.Series(y)

    rf_params = dict(
        max_features=1.0,
        max_samples=0.1,
        n_bins=128,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=10,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model = curfr(**rf_params)
    cuml_model.fit(X, y)
    pred = cuml_model.predict(X)

    explainer = TreeExplainer(model=cuml_model)
    out = explainer.shap_values(X)

    # Fetch the results for the 'cupy' and 'cudf' inputs; only the
    # prediction container differs between the two.
    if input_type in ('cupy', 'cudf'):
        pred = pred.get() if input_type == 'cupy' else pred.to_numpy()
        out = out.get()
        expected_value = explainer.expected_value.get()
    else:
        expected_value = explainer.expected_value

    # SHAP values should add up to the predicted score.
    shap_sum = np.sum(out, axis=1) + expected_value
    np.testing.assert_almost_equal(shap_sum, pred, decimal=4)
def test_multiple_fits_regression(column_info, nrows, n_estimators, n_bins):
    """Check that repeated fits leave the regressor's parameters intact.

    The same regressor is fitted three times on the same data, after which
    ``get_params`` must still report the ``n_estimators`` and ``n_bins``
    it was constructed with.
    """
    datatype = np.float32
    ncols, n_info = column_info
    X, y = make_regression(n_samples=nrows, n_features=ncols,
                           n_informative=n_info, random_state=123)
    X = X.astype(datatype)
    # Regression targets are continuous: keep them in the same float dtype
    # as the features rather than truncating them to integers.
    y = y.astype(datatype)

    cuml_model = curfr(n_bins=n_bins, n_estimators=n_estimators,
                       max_depth=10)

    # Fit several times in a row on the same data.
    for _ in range(3):
        cuml_model.fit(X, y)

    # The constructor arguments must have survived the refits.
    params = cuml_model.get_params()
    assert params['n_estimators'] == n_estimators
    assert params['n_bins'] == n_bins
def test_rf_regression_with_identical_labels(split_criterion):
    """Check that identical labels produce a single-node tree.

    With every label equal to 1 the regressor must not split: the JSON
    dump must hold exactly one tree whose root is a leaf with value
    ``[1.0]`` covering all five training rows.
    """
    X = np.array(
        [[-1, 0], [0, 1], [2, 0], [0, 3], [-2, 0]], dtype=np.float32
    )
    y = np.array([1, 1, 1, 1, 1], dtype=np.float32)

    rf_params = dict(
        max_features=1.0,
        max_samples=1.0,
        n_bins=5,
        bootstrap=False,
        split_criterion=split_criterion,
        min_samples_leaf=1,
        min_samples_split=2,
        random_state=0,
        n_streams=1,
        n_estimators=1,
        max_depth=1,
    )
    model = curfr(**rf_params)
    model.fit(X, y)

    # Degenerate case: no split may be created, only the root remains.
    model_dump = json.loads(model.get_json())
    assert len(model_dump) == 1
    expected_dump = {"nodeid": 0, "leaf_value": [1.0], "instance_count": 5}
    assert model_dump[0] == expected_dump
def test_rf_get_json(estimator_type, max_depth, n_estimators):
    """Check the JSON dump of a forest against the forest's predictions.

    The dump must be a non-empty JSON list with one entry per tree, and
    walking those trees by hand must reproduce what ``predict`` returns
    for both the classifier and the regressor.
    """
    X, y = make_classification(n_samples=350, n_features=20,
                               n_clusters_per_class=1, n_informative=10,
                               random_state=123, n_classes=2)
    X = X.astype(np.float32)

    # Parameters common to both estimators; the classifier additionally
    # sets split_criterion.
    shared_params = dict(
        max_features=1.0,
        max_samples=1.0,
        n_bins=16,
        split_algo=0,
        min_samples_leaf=2,
        seed=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=max_depth,
    )
    if estimator_type == 'classification':
        cuml_model = curfc(split_criterion=0, **shared_params)
        y = y.astype(np.int32)
    elif estimator_type == 'regression':
        cuml_model = curfr(**shared_params)
        y = y.astype(np.float32)
    else:
        assert False

    # Train the model and dump it.
    cuml_model.fit(X, y)
    json_out = cuml_model.get_json()
    json_obj = json.loads(json_out)

    # Test 1: the dump is not empty.
    assert json_out != ''

    # Test 2: the dump lists the right number of trees.
    assert isinstance(json_obj, list)
    assert len(json_obj) == n_estimators

    # Test 3: walking the JSON trees gives the same predictions as cuML.
    def predict_with_json_tree(tree, x):
        # Descend until a node without children is reached: the first
        # child is taken when the feature value is <= the threshold.
        node = tree
        while 'children' in node:
            for key in ('split_feature', 'split_threshold', 'yes', 'no'):
                assert key in node
            goes_left = x[node['split_feature']] <= node['split_threshold']
            node = node['children'][0 if goes_left else 1]
        assert 'leaf_value' in node
        return node['leaf_value']

    def predict_with_json_rf_classifier(rf, x):
        # Returns every class that reaches the highest vote count, so a
        # tie yields all of the tied classes.
        counts = np.bincount([predict_with_json_tree(tree, x) for tree in rf])
        return np.nonzero(np.equal(counts, np.max(counts)))[0]

    def predict_with_json_rf_regressor(rf, x):
        # Average of the per-tree leaf values.
        return sum((predict_with_json_tree(tree, x) for tree in rf), 0.) \
            / len(rf)

    if estimator_type == 'classification':
        expected_pred = cuml_model.predict(X).astype(np.int32)
        for idx, row in enumerate(X):
            winners = predict_with_json_rf_classifier(json_obj, row)
            assert expected_pred[idx] in winners
    elif estimator_type == 'regression':
        expected_pred = cuml_model.predict(X).astype(np.float32)
        pred = np.array(
            [predict_with_json_rf_regressor(json_obj, row) for row in X],
            dtype=np.float32)
        np.testing.assert_almost_equal(pred, expected_pred, decimal=6)
def test_rf_regression_sparse(datatype, split_algo, mode, column_info,
                              max_features, rows_sample,
                              fil_sparse_format, algo):
    """Check GPU (FIL) prediction for each storage format / algorithm pair.

    ``mode`` selects the dataset: 'unit' uses 500 synthetic samples,
    'quality' uses the California housing data and anything else uses
    100000 synthetic samples.  Combinations that are expected to be
    rejected must raise ``ValueError`` from ``predict``.  For the accepted
    ones the GPU prediction must score exactly like a converted FIL model,
    the treelite conversion must report the right number of trees and
    features, and the score must be close to the CPU score and (outside
    'stress' mode) to scikit-learn's.
    """
    ncols, n_info = column_info
    n_trees = 50

    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 500 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Handle shared with the cuML model.
    handle, _ = get_handle(True, n_streams=1)

    rf_params = dict(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=n_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
        accuracy_metric='mse',
    )
    cuml_model = curfr(**rf_params)
    cuml_model.fit(X_train, y_train)

    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)

    gpu_predict_kwargs = dict(
        predict_model="GPU",
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )

    # These parameter combinations must be refused by predict().
    must_be_rejected = (
        not fil_sparse_format
        or algo == 'tree_reorg'
        or algo == 'batch_tree_reorg'
        or fil_sparse_format == 'not_supported'
    )
    if must_be_rejected:
        with pytest.raises(ValueError):
            cuml_model.predict(X_test, **gpu_predict_kwargs)
        return

    target_shape = np.shape(cu_preds)

    fil_preds = cuml_model.predict(X_test, **gpu_predict_kwargs)
    fil_preds = np.reshape(fil_preds, target_shape)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Predicting through an explicitly converted FIL model must give the
    # very same score.
    fil_model = cuml_model.convert_to_fil_model()
    fil_model_preds = fil_model.predict(X_test, output_type='numpy')
    fil_model_preds = np.reshape(fil_model_preds, target_shape)
    fil_model_r2 = r2_score(y_test, fil_model_preds, convert_dtype=datatype)
    assert fil_r2 == fil_model_r2

    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert ncols == tl_model.num_features
    del tl_model

    # scikit-learn reference, skipped for the stress dataset.
    if mode != "stress":
        sk_model = skrfr(n_estimators=50, max_depth=40,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)

    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_get_json(estimator_type, max_depth, n_estimators):
    """Check the JSON dump of a forest against the forest's predictions.

    The dump must be a non-empty JSON list with one entry per tree, and
    walking those trees by hand must reproduce what ``predict`` returns
    for both the classifier and the regressor.
    """
    X, y = make_classification(
        n_samples=350,
        n_features=20,
        n_clusters_per_class=1,
        n_informative=10,
        random_state=123,
        n_classes=2,
    )
    X = X.astype(np.float32)
    if estimator_type == "classification":
        cuml_model = curfc(
            max_features=1.0,
            max_samples=1.0,
            n_bins=16,
            split_criterion=0,
            min_samples_leaf=2,
            random_state=23707,
            n_streams=1,
            n_estimators=n_estimators,
            max_leaves=-1,
            max_depth=max_depth,
        )
        y = y.astype(np.int32)
    elif estimator_type == "regression":
        cuml_model = curfr(
            max_features=1.0,
            max_samples=1.0,
            n_bins=16,
            min_samples_leaf=2,
            random_state=23707,
            n_streams=1,
            n_estimators=n_estimators,
            max_leaves=-1,
            max_depth=max_depth,
        )
        y = y.astype(np.float32)
    else:
        assert False

    # Train model on the data
    cuml_model.fit(X, y)

    json_out = cuml_model.get_json()
    json_obj = json.loads(json_out)

    # Test 1: Output is non-zero
    assert "" != json_out

    # Test 2: JSON object contains correct number of trees
    assert isinstance(json_obj, list)
    assert len(json_obj) == n_estimators

    # Test 3: Traverse JSON trees and get the same predictions as cuML RF
    def predict_with_json_tree(tree, x):
        """Return the leaf value reached by sample ``x`` in one JSON tree."""
        if "children" not in tree:
            assert "leaf_value" in tree
            return tree["leaf_value"]
        assert "split_feature" in tree
        assert "split_threshold" in tree
        assert "yes" in tree
        assert "no" in tree
        # NOTE(review): the 1e-5 slack presumably absorbs rounding of the
        # threshold in the JSON dump - confirm.
        if x[tree["split_feature"]] <= tree["split_threshold"] + 1e-5:
            return predict_with_json_tree(tree["children"][0], x)
        return predict_with_json_tree(tree["children"][1], x)

    def predict_with_json_rf_classifier(rf, x):
        """Return the index with the largest summed leaf value."""
        # Leaf values are summed element-wise over all trees; np.argmax
        # returns the first index on a tie.
        predictions = [
            np.array(predict_with_json_tree(tree, x)) for tree in rf
        ]
        return np.argmax(np.sum(predictions, axis=0))

    def predict_with_json_rf_regressor(rf, x):
        """Return the mean over trees of each leaf value's first entry."""
        pred = 0.0
        for tree in rf:
            pred += predict_with_json_tree(tree, x)[0]
        return pred / len(rf)

    if estimator_type == "classification":
        expected_pred = cuml_model.predict(X).astype(np.int32)
        for idx, row in enumerate(X):
            majority_vote = predict_with_json_rf_classifier(json_obj, row)
            assert expected_pred[idx] == majority_vote
    elif estimator_type == "regression":
        expected_pred = cuml_model.predict(X).astype(np.float32)
        pred = np.array(
            [predict_with_json_rf_regressor(json_obj, row) for row in X],
            dtype=np.float32,
        )

        # Per-row check first, so that a failure names the offending row.
        for idx, (got, want) in enumerate(zip(pred, expected_pred)):
            assert np.isclose(got, want), (
                f"row {idx}: JSON traversal gave {got}, predict gave {want}"
            )
        np.testing.assert_almost_equal(pred, expected_pred, decimal=6)
def test_rf_regression(datatype, split_algo, mode, column_info,
                       max_features, rows_sample):
    """Compare GPU (FIL), CPU and scikit-learn regression scores.

    ``mode`` selects the dataset: 'unit' uses 500 synthetic samples,
    'quality' uses the California housing data and anything else uses
    100000 synthetic samples.  The GPU R2 score must be within 0.02 of the
    CPU score and, outside 'stress' mode, within 0.07 of scikit-learn's.
    """
    ncols, n_info = column_info

    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 500 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    # Handle shared with the cuML model.
    handle, _ = get_handle(True, n_streams=1)

    rf_params = dict(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric='mse',
    )
    cuml_model = curfr(**rf_params)
    cuml_model.fit(X_train, y_train)

    # GPU (FIL) prediction first, then CPU.
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # scikit-learn reference, skipped for the stress dataset.
    if mode != "stress":
        sk_model = skrfr(n_estimators=50, max_depth=16,
                         min_samples_split=2, max_features=max_features,
                         random_state=10)
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_predict, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)

    assert fil_r2 >= (cu_r2 - 0.02)