def test_rf_regression_float64(large_reg, datatype):
    """Compare cuML RF regression CPU/GPU predictions across dtypes.

    ``large_reg`` is unpacked as an ``(X, y)`` pair. ``datatype`` is indexed
    as a pair: element 0 is the dtype the training split is cast to and
    element 1 is the dtype the test split is cast to.
    """
    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Train and test splits are cast independently so they may differ.
    X_train, y_train = (
        X_train.astype(datatype[0]),
        y_train.astype(datatype[0]),
    )
    X_test, y_test = (
        X_test.astype(datatype[1]),
        y_test.astype(datatype[1]),
    )

    # cuML random forest regressor with default parameters, CPU prediction.
    cu_rf = curfr()
    cu_rf.fit(X_train, y_train)
    cu_preds = cu_rf.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # The sklearn regression baseline is only built for smaller inputs.
    if X.shape[0] < 500000:
        sk_rf = skrfr(max_depth=16, random_state=10)
        sk_rf.fit(X_train, y_train)
        sk_preds = sk_rf.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # GPU-based prediction, reshaped to line up with the CPU predictions.
    gpu_preds = cu_rf.predict(
        X_test, predict_model="GPU", convert_dtype=True
    )
    fil_preds = np.reshape(gpu_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format, algo):
    """Exercise FIL prediction options on a cuML RF regressor.

    ``special_reg`` is unpacked as an ``(X, y)`` pair and cast to
    ``datatype``. For combinations of ``fil_sparse_format`` and ``algo``
    matched by the condition below, GPU predict is expected to raise
    ``ValueError``; otherwise FIL, the converted FIL model, the treelite
    model and (for small inputs) sklearn are cross-checked.
    """
    n_trees = 50

    X, y = special_reg
    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        n_bins=16,
        split_criterion=2,
        min_rows_per_node=2,
        random_state=123,
        n_streams=1,
        n_estimators=n_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)

    expect_value_error = (
        not fil_sparse_format
        or algo == 'tree_reorg'
        or algo == 'batch_tree_reorg'
        or fil_sparse_format == 'not_supported'
    )
    if expect_value_error:
        with pytest.raises(ValueError):
            cuml_model.predict(
                X_test,
                predict_model="GPU",
                fil_sparse_format=fil_sparse_format,
                algo=algo,
            )
        # Every remaining check needs a successful FIL prediction.
        return

    # predict using FIL
    fil_preds = cuml_model.predict(
        X_test,
        predict_model="GPU",
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )
    fil_preds = np.reshape(fil_preds, np.shape(y_test))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # The standalone FIL model must score exactly like predict(GPU).
    fil_model = cuml_model.convert_to_fil_model()
    fil_model_preds = fil_model.predict(X_test, output_type='numpy')
    fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
    fil_model_r2 = r2_score(
        y_test, fil_model_preds, convert_dtype=datatype
    )
    assert fil_r2 == fil_model_r2

    # The treelite export must report the same tree and feature counts.
    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert X.shape[1] == tl_model.num_features

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress":
        sk_model = skrfr(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)
def test_rf_regression(
    special_reg, datatype, max_features, max_samples, n_bins
):
    """Check cuML RF regression R2 against its CPU path and sklearn.

    ``special_reg`` is unpacked as an ``(X, y)`` pair and cast to
    ``datatype``. ``max_features``, ``max_samples`` and ``n_bins`` are
    forwarded to the cuML regressor; ``max_features`` is also forwarded to
    the sklearn baseline.
    """
    X, y = special_reg
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cu_params = dict(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    cuml_model = curfr(**cu_params)
    cuml_model.fit(X_train, y_train)

    # GPU (FIL) prediction first, then CPU; FIL output is reshaped to match.
    gpu_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(gpu_preds, np.shape(cu_preds))

    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if X.shape[0] < 1000:  # mode != "stress"
        sk_model = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)

    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression_float64(large_reg, datatype):
    """Check cuML RF regression when train and test dtypes may differ.

    ``large_reg`` is unpacked as an ``(X, y)`` pair. ``datatype`` is indexed
    as a pair: element 0 is the dtype the training split is cast to and
    element 1 is the dtype the test split is cast to.

    When the model was trained on float32, GPU prediction is compared with
    CPU prediction. Otherwise, when the test data is float64, GPU prediction
    is expected to emit a warning about falling back to CPU prediction.
    """
    X, y = large_reg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_train = X_train.astype(datatype[0])
    y_train = y_train.astype(datatype[0])
    X_test = X_test.astype(datatype[1])
    y_test = y_test.astype(datatype[1])

    # Initialize, fit and predict using cuML's
    # random forest regression model
    cuml_model = curfr()
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # sklearn random forest regression model
    # initialization, fit and predict
    if X.shape[0] < 500000:
        sk_model = skrfr(max_depth=16, random_state=10)
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    if datatype[0] == np.float32:
        fil_preds = cuml_model.predict(
            X_test, predict_model="GPU", convert_dtype=True
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
        assert fil_r2 >= (cu_r2 - 0.02)

    # Reached only when the model was not trained on float32: GPU predict
    # should warn that it cannot be used and fall back to CPU predict.
    # NOTE(review): a non-float32 model with non-float64 test data gets no
    # GPU-path check at all -- confirm that is intended.
    elif datatype[1] == np.float64:
        expected_msg = (
            "GPU based predict only accepts "
            "np.float32 data. The model was "
            "trained on np.float64 data hence "
            "cannot use GPU-based prediction! "
            "\nDefaulting to CPU-based Prediction. "
            "\nTo predict on float-64 data, set "
            "parameter predict_model = 'CPU'"
        )
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            cuml_model.predict(X_test, predict_model="GPU")

        # Scan every recorded warning rather than only the last one: an
        # empty list would otherwise raise IndexError, and an unrelated
        # warning emitted afterwards would hide the expected one.
        recorded = [str(item.message) for item in w]
        assert any(expected_msg in text for text in recorded), (
            "expected CPU-fallback warning was not raised; "
            "recorded warnings: %r" % (recorded,)
        )
def test_rf_regression(datatype, split_algo, rows_sample, n_info, mode,
                       ncols, max_features):
    """Compare cuML RF regression (GPU and CPU predict) with sklearn.

    ``mode`` selects the dataset: ``'unit'`` generates 100 samples,
    ``'quality'`` loads the California housing data, anything else
    generates 100000 samples. The first 80% of rows are used for training.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 100 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)

    # Positional 80/20 split (no shuffling).
    train_rows = np.int32(X.shape[0]*0.8)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:, ]).astype(datatype)

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=8)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)

    # predict using FIL (GPU) and then the CPU path
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds)
    fil_r2 = r2_score(y_test, fil_preds)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    sk_model = skrfr(
        n_estimators=50,
        max_depth=16,
        min_samples_split=2,
        max_features=max_features,
        random_state=10,
    )
    sk_model.fit(X_train, y_train)
    sk_r2 = r2_score(y_test, sk_model.predict(X_test))

    print(fil_r2, cu_r2, sk_r2)
    assert fil_r2 >= (cu_r2 - 0.02)
    assert fil_r2 >= (sk_r2 - 0.07)
def test_rf_regression_float64(datatype, column_info, nrows, convert_dtype):
    """Check GPU predict dtype handling for cuML RF regression.

    ``column_info`` is unpacked as ``(ncols, n_info)``. ``datatype`` is
    indexed as a pair: element 0 is the training dtype, element 1 the test
    dtype. GPU predict is expected to work only when the model was trained
    on float32 and ``convert_dtype`` is truthy; otherwise it is expected to
    raise ``TypeError``.
    """
    ncols, n_info = column_info
    X, y = make_regression(
        n_samples=nrows,
        n_features=ncols,
        n_informative=n_info,
        random_state=123,
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_train, y_train = (
        X_train.astype(datatype[0]),
        y_train.astype(datatype[0]),
    )
    X_test, y_test = (
        X_test.astype(datatype[1]),
        y_test.astype(datatype[1]),
    )

    # cuML random forest regressor with default parameters, CPU prediction.
    cu_rf = curfr()
    cu_rf.fit(X_train, y_train)
    cu_preds = cu_rf.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype[0])

    # The sklearn regression baseline is only built for smaller inputs.
    if nrows < 500000:
        sk_rf = skrfr(max_depth=16, random_state=10)
        sk_rf.fit(X_train, y_train)
        sk_preds = sk_rf.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype[0])
        assert cu_r2 >= (sk_r2 - 0.09)

    # predict using cuML's GPU based prediction
    gpu_predict_supported = datatype[0] == np.float32 and convert_dtype
    if not gpu_predict_supported:
        with pytest.raises(TypeError):
            cu_rf.predict(
                X_test, predict_model="GPU", convert_dtype=convert_dtype
            )
        return

    fil_preds = cu_rf.predict(
        X_test, predict_model="GPU", convert_dtype=convert_dtype
    )
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype[0])
    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression(datatype, use_handle, split_algo,
                       n_info, mode, ncols, rows_sample):
    """Compare the cuML RF regressor's score with sklearn's MSE.

    ``mode`` selects the dataset: ``'unit'`` generates 30 samples,
    ``'quality'`` loads the California housing data, anything else
    generates 100000 samples. The sklearn comparison is skipped when
    ``mode`` is ``'stress'``.
    """
    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 30 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)

    # Positional 80/20 split (no shuffling).
    train_rows = np.int32(X.shape[0]*0.8)
    X_train = np.asarray(X[0:train_rows, :]).astype(datatype)
    y_train = np.asarray(y[0:train_rows, ]).astype(datatype)
    X_test = np.asarray(X[train_rows:, :]).astype(datatype)
    y_test = np.asarray(y[train_rows:, ]).astype(datatype)

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle)

    # Initialize, fit and score cuML's random forest regression model
    cuml_model = curfr(
        max_features=1.0,
        rows_sample=rows_sample,
        n_bins=8,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=25,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)
    cu_mse = cuml_model.score(X_test, y_test)

    if mode == 'stress':
        return

    # sklearn random forest regression model
    # initialization, fit and predict
    sk_model = skrfr(
        n_estimators=50,
        max_depth=50,
        min_samples_split=2,
        max_features=1.0,
        random_state=10,
    )
    sk_model.fit(X_train, y_train)
    sk_predict = sk_model.predict(X_test)
    sk_mse = mean_squared_error(y_test, sk_predict)

    # compare the accuracy of the two models
    assert cu_mse <= (sk_mse + 0.07)
def test_rf_regression_default(datatype, column_info, nrows):
    """Check a default-parameter cuML RF regressor.

    ``column_info`` is unpacked as ``(ncols, n_info)``. Verifies that GPU
    (FIL) predictions track CPU predictions, that ``score`` with GPU
    predict agrees with the MSE computed from the FIL predictions, and
    (for smaller inputs) that R2 stays close to sklearn's.
    """
    ncols, n_info = column_info
    X, y = make_regression(
        n_samples=nrows,
        n_features=ncols,
        n_informative=n_info,
        random_state=123,
    )
    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # cuML random forest regressor with default parameters, CPU prediction.
    cu_rf = curfr()
    cu_rf.fit(X_train, y_train)
    cu_preds = cu_rf.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)

    # predict using FIL
    fil_preds = cu_rf.predict(X_test, predict_model="GPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # score function should be equivalent
    score_mse = cu_rf.score(X_test, y_test, predict_model="GPU")
    fil_mse = mean_squared_error(y_test, fil_preds)
    assert fil_mse == pytest.approx(score_mse)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if nrows < 500000:
        sk_rf = skrfr(max_depth=16, random_state=10)
        sk_rf.fit(X_train, y_train)
        sk_preds = sk_rf.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        # XXX Accuracy gap exists with default parameters, requires
        # further investigation for next release
        assert fil_r2 >= (sk_r2 - 0.08)

    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression(datatype, split_algo, mode, column_info,
                       max_features, rows_sample):
    """Compare cuML RF regression (GPU and CPU predict) with sklearn.

    ``column_info`` is unpacked as ``(ncols, n_info)``. ``mode`` selects
    the dataset: ``'unit'`` generates 500 samples, ``'quality'`` loads the
    California housing data, anything else generates 100000 samples. The
    sklearn comparison is skipped when ``mode`` is ``"stress"``.
    """
    ncols, n_info = column_info

    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 500 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)

    X, y = X.astype(datatype), y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)

    # predict using FIL (GPU) and then the CPU path
    fil_preds = cuml_model.predict(X_test, predict_model="GPU")
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if mode != "stress":
        sk_model = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_predict = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_predict, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)

    assert fil_r2 >= (cu_r2 - 0.02)
def test_rf_regression_sparse(datatype, split_algo, mode, column_info,
                              max_features, rows_sample,
                              fil_sparse_format, algo):
    """Exercise FIL prediction options on a cuML RF regressor.

    ``column_info`` is unpacked as ``(ncols, n_info)``. ``mode`` selects
    the dataset: ``'unit'`` generates 500 samples, ``'quality'`` loads the
    California housing data, anything else generates 100000 samples. For
    combinations of ``fil_sparse_format`` and ``algo`` matched by the
    condition below, GPU predict is expected to raise ``ValueError``;
    otherwise FIL, the converted FIL model, the treelite model, sklearn
    (unless ``mode`` is ``"stress"``) and the CPU path are cross-checked.
    """
    ncols, n_info = column_info
    n_trees = 50

    if mode == 'quality':
        X, y = fetch_california_housing(return_X_y=True)
    else:
        n_samples = 500 if mode == 'unit' else 100000
        X, y = make_regression(n_samples=n_samples, n_features=ncols,
                               n_informative=n_info, random_state=123)

    X = X.astype(datatype)
    y = y.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=1)

    # Initialize and fit using cuML's random forest regression model
    cuml_model = curfr(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=2,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=n_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
        accuracy_metric='mse',
    )
    cuml_model.fit(X_train, y_train)
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    cu_r2 = r2_score(y_test, cu_preds, convert_dtype=datatype)

    expect_value_error = (
        not fil_sparse_format
        or algo == 'tree_reorg'
        or algo == 'batch_tree_reorg'
        or fil_sparse_format == 'not_supported'
    )
    if expect_value_error:
        with pytest.raises(ValueError):
            cuml_model.predict(
                X_test,
                predict_model="GPU",
                fil_sparse_format=fil_sparse_format,
                algo=algo,
            )
        # Every remaining check needs a successful FIL prediction.
        return

    # predict using FIL
    fil_preds = cuml_model.predict(
        X_test,
        predict_model="GPU",
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    fil_r2 = r2_score(y_test, fil_preds, convert_dtype=datatype)

    # The standalone FIL model must score exactly like predict(GPU).
    fil_model = cuml_model.convert_to_fil_model()
    fil_model_preds = fil_model.predict(X_test, output_type='numpy')
    fil_model_preds = np.reshape(fil_model_preds, np.shape(cu_preds))
    fil_model_r2 = r2_score(
        y_test, fil_model_preds, convert_dtype=datatype
    )
    assert fil_r2 == fil_model_r2

    # The treelite export must report the same tree and feature counts.
    tl_model = cuml_model.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert ncols == tl_model.num_features
    del tl_model

    # Initialize, fit and predict using
    # sklearn's random forest regression model
    if mode != "stress":
        sk_model = skrfr(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_r2 = r2_score(y_test, sk_preds, convert_dtype=datatype)
        assert fil_r2 >= (sk_r2 - 0.07)

    assert fil_r2 >= (cu_r2 - 0.02)