def test_rf_classification_default(datatype, column_info, nrows):
    """Check a default-constructed cuML random forest classifier.

    A two-class synthetic dataset is generated, cast to ``datatype`` and
    split 80/20. The fitted model is scored through both its GPU and CPU
    predict paths. GPU accuracy must stay within 0.02 of the CPU accuracy
    and, for datasets under 500000 rows, within 0.07 of scikit-learn's.

    ``column_info`` is unpacked as ``(n_features, n_informative)``.
    """
    n_features, n_informative = column_info
    data, target = make_classification(
        n_samples=nrows,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=0,
        n_classes=2,
    )
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # cuML random forest with default hyper-parameters.
    forest = curfc()
    forest.fit(train_x, train_y)
    gpu_labels = forest.predict(test_x, predict_model="GPU")
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    gpu_accuracy = accuracy_score(test_y, gpu_labels)
    cpu_accuracy = accuracy_score(test_y, cpu_labels)

    # The scikit-learn reference is only trained for smaller datasets.
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert gpu_accuracy >= (reference_accuracy - 0.07)

    assert gpu_accuracy >= (cpu_accuracy - 0.02)
def test_rf_classification(small_clf, datatype, split_algo, max_samples,
                           max_features, use_experimental_backend):
    """Fit a cuML random forest classifier and validate its output.

    Two things are verified:

    * what ``fit`` prints to stdout, depending on
      ``use_experimental_backend`` and ``split_algo``;
    * GPU accuracy relative to the CPU predict path (tolerance 0.02) and,
      for inputs under 500000 rows, relative to scikit-learn (0.07).
    """
    data, target = small_clf
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        use_experimental_backend=use_experimental_backend,
    )

    # Capture whatever fit() writes to stdout so it can be inspected.
    buffer = io.StringIO()
    with redirect_stdout(buffer):
        forest.fit(train_x, train_y)
    printed = buffer.getvalue()

    if not use_experimental_backend:
        assert printed == ''
    elif split_algo != 1:
        # Fallback path: both the reason and the fallback notice appear.
        assert ('Experimental backend does not yet support histogram ' +
                'split algorithm' in printed)
        assert ('Not using the experimental backend due to above ' +
                'mentioned reason(s)' in printed)
    else:
        assert ('Using experimental backend for growing trees'
                in printed)

    gpu_labels = forest.predict(
        test_x,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='auto',
    )
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))

    cpu_accuracy = accuracy_score(test_y, cpu_labels)
    gpu_accuracy = accuracy_score(test_y, gpu_labels)

    if data.shape[0] < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert gpu_accuracy >= (reference_accuracy - 0.07)

    assert gpu_accuracy >= (cpu_accuracy - 0.02)
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    """Shared body for random forest classification checks.

    ``fixture`` supplies ``(X, y)``. ``datatype[0]`` is the dtype used for
    the full dataset and ``datatype[1]`` the dtype the test split is cast
    to. When ``array_type == 'dataframe'`` the data is wrapped in cuDF
    objects before being passed to the model.

    Verifies that GPU class predictions equal the argmax of the predicted
    probabilities, that CPU and GPU accuracies agree, and (below 500000
    labels) that both are within 0.07 of scikit-learn and that the
    probabilities pass ``check_predict_proba``.
    """
    data, target = fixture
    data = data.astype(datatype[0])
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )
    test_x = test_x.astype(datatype[1])

    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )

    if array_type == 'dataframe':
        train_x_df = cudf.DataFrame(train_x)
        train_y_df = cudf.Series(train_y)
        test_x_df = cudf.DataFrame(test_x)
        forest.fit(train_x_df, train_y_df)
        gpu_proba = np.array(
            forest.predict_proba(test_x_df).as_gpu_matrix()
        )
        cpu_labels = forest.predict(
            test_x_df, predict_model="CPU"
        ).to_array()
        gpu_labels = forest.predict(
            test_x_df, predict_model="GPU"
        ).to_array()
    else:
        forest.fit(train_x, train_y)
        gpu_proba = forest.predict_proba(test_x)
        cpu_labels = forest.predict(test_x, predict_model="CPU")
        gpu_labels = forest.predict(test_x, predict_model="GPU")

    # Class predictions must agree with the most probable class.
    np.testing.assert_array_equal(gpu_labels, np.argmax(gpu_proba, axis=1))

    cpu_accuracy = accuracy_score(test_y, cpu_labels)
    gpu_accuracy = accuracy_score(test_y, gpu_labels)
    assert cpu_accuracy == pytest.approx(gpu_accuracy, abs=0.01, rel=0.1)

    # scikit-learn reference, only for smaller inputs.
    if target.size < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_labels = reference.predict(test_x)
        reference_accuracy = accuracy_score(test_y, reference_labels)
        reference_proba = reference.predict_proba(test_x)
        assert cpu_accuracy >= reference_accuracy - 0.07
        assert gpu_accuracy >= reference_accuracy - 0.07
        # 0.06 is the highest relative error observed on CI, within
        # the 0.0061 absolute error boundaries seen previously.
        check_predict_proba(gpu_proba, reference_proba, test_y, 0.1)
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
    """Compare cuML random forest GPU predictions with CPU and sklearn.

    A two-class synthetic dataset of ``nrows`` samples is generated and
    split 80/20. GPU accuracy must be within 0.02 of the CPU predict path
    and, below 500000 rows, within 0.07 of scikit-learn.

    ``column_info`` is unpacked as ``(n_features, n_informative)``.
    """
    n_features, n_informative = column_info
    data, target = make_classification(
        n_samples=nrows,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=123,
        n_classes=2,
    )
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(train_x, train_y)

    gpu_labels = forest.predict(
        test_x,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='BATCH_TREE_REORG',
    )
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    cpu_accuracy = accuracy_score(test_y, cpu_labels)
    gpu_accuracy = accuracy_score(test_y, gpu_labels)

    if nrows < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert gpu_accuracy >= (reference_accuracy - 0.07)

    assert gpu_accuracy >= (cpu_accuracy - 0.02)
def test_rf_classification_proba(datatype, split_algo, rows_sample, nrows,
                                 column_info, max_features):
    """Compare ``predict_proba`` of the cuML forest against scikit-learn.

    The ground-truth labels are expanded into a two-column probability
    matrix and the mean squared error of each model's probabilities is
    computed against it. For datasets under 500000 rows the cuML error may
    exceed scikit-learn's by at most 0.0061.

    ``column_info`` is unpacked as ``(n_features, n_informative)``.
    """
    n_features, n_informative = column_info
    data, target = make_classification(
        n_samples=nrows,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=123,
        n_classes=2,
    )
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(train_x, train_y)

    gpu_proba = forest.predict_proba(
        test_x, output_class=True, threshold=0.5, algo='auto'
    )

    # Column 1 holds the label itself, column 0 its complement.
    true_proba = np.zeros(np.shape(gpu_proba))
    true_proba[:, 1] = test_y
    true_proba[:, 0] = 1.0 - test_y
    gpu_mse = mean_squared_error(true_proba, gpu_proba)

    if nrows < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_proba = reference.predict_proba(test_x)
        reference_mse = mean_squared_error(true_proba, reference_proba)

        # A maximum difference of 0.0061 has been seen between the MSE of
        # the FIL and scikit-learn predict_proba outputs.
        assert gpu_mse <= (reference_mse + 0.0061)
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    """Compare cuML random forest GPU predictions with CPU and sklearn.

    ``small_clf`` supplies ``(X, y)``; the data is cast to ``datatype``
    and split 80/20. GPU accuracy must be within 0.07 of the CPU predict
    path and, below 500000 rows, within 0.07 of scikit-learn.
    """
    data, target = small_clf
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True, n_streams=1)

    forest_params = dict(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest = curfc(**forest_params)
    forest.fit(train_x, train_y)

    gpu_labels = forest.predict(
        test_x, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))

    cpu_accuracy = accuracy_score(test_y, cpu_labels)
    gpu_accuracy = accuracy_score(test_y, gpu_labels)

    if data.shape[0] < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert gpu_accuracy >= (reference_accuracy - 0.07)

    # Tolerance to be changed to 0.02; see issue #3910:
    # https://github.com/rapidsai/cuml/issues/3910
    assert gpu_accuracy >= (cpu_accuracy - 0.07)
def test_rf_classification_dask_fil_predict_proba(partitions_per_worker,
                                                  cluster):
    """Check ``predict_proba`` of the ``cuRFC_mg`` model against sklearn.

    A client is opened on ``cluster`` and always closed at the end. The
    mean squared error of the predicted probabilities against the true
    labels may exceed scikit-learn's by at most 0.022.
    """
    client = Client(cluster)

    try:
        data, target = make_classification(
            n_samples=1000,
            n_features=30,
            n_clusters_per_class=1,
            n_informative=20,
            random_state=123,
            n_classes=2,
        )
        data = data.astype(np.float32)
        target = target.astype(np.int32)

        train_x, test_x, train_y, test_y = train_test_split(
            data, target, test_size=100, random_state=123
        )

        train_x_df, train_y_df = _prep_training_data(
            client, train_x, train_y, partitions_per_worker
        )
        test_x_df, _ = _prep_training_data(
            client, test_x, test_y, partitions_per_worker
        )

        forest = cuRFC_mg(
            n_bins=16,
            n_streams=1,
            n_estimators=40,
            max_depth=16,
        )
        forest.fit(train_x_df, train_y_df)

        gpu_proba = forest.predict_proba(test_x_df).compute()
        gpu_proba = cp.asnumpy(gpu_proba.to_gpu_matrix())

        # Column 1 holds the label itself, column 0 its complement.
        true_proba = np.zeros(np.shape(gpu_proba))
        true_proba[:, 1] = test_y
        true_proba[:, 0] = 1.0 - test_y
        gpu_mse = mean_squared_error(true_proba, gpu_proba)

        reference = skrfc(n_estimators=40, max_depth=16, random_state=10)
        reference.fit(train_x, train_y)
        reference_proba = reference.predict_proba(test_x)
        reference_mse = mean_squared_error(true_proba, reference_proba)

        # The threshold is required because the test would otherwise fail
        # intermittently, with a maximum difference of 0.022 between the
        # two MSE values.
        assert gpu_mse <= reference_mse + 0.022
    finally:
        client.close()
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Exercise the cuML forest with mixed train/test dtypes.

    ``datatype[0]`` is the dtype of the full dataset and ``datatype[1]``
    the dtype the test split is cast to. CPU accuracy is compared with
    scikit-learn (tolerance 0.07, below 500000 rows). GPU prediction is
    then either scored against the CPU result, or, when the test split is
    float64, expected to emit a warning about defaulting to CPU
    prediction.
    """
    data, target = small_clf
    data = data.astype(datatype[0])
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )
    test_x = test_x.astype(datatype[1])

    # cuML random forest with default hyper-parameters.
    forest = curfc()
    forest.fit(train_x, train_y)
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    cpu_accuracy = accuracy_score(test_y, cpu_labels)

    # scikit-learn reference, only for smaller inputs.
    if data.shape[0] < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert cpu_accuracy >= (reference_accuracy - 0.07)

    trained_on_float32 = datatype[0] == np.float32
    if trained_on_float32 and convert_dtype:
        # GPU based prediction is usable: score it against the CPU path.
        gpu_labels = forest.predict(
            test_x, predict_model="GPU", convert_dtype=convert_dtype
        )
        gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))
        gpu_accuracy = accuracy_score(test_y, gpu_labels)
        # Tolerance to be changed to 0.02; see issue #3910:
        # https://github.com/rapidsai/cuml/issues/3910
        assert gpu_accuracy >= (cpu_accuracy - 0.07)
    elif datatype[1] == np.float64:
        # GPU predict cannot be used: a warning is expected instead.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            forest.predict(
                test_x, predict_model="GPU", convert_dtype=convert_dtype
            )
            expected_message = (
                "GPU based predict only accepts "
                "np.float32 data. The model was "
                "trained on np.float64 data hence "
                "cannot use GPU-based prediction! "
                "\nDefaulting to CPU-based Prediction. "
                "\nTo predict on float-64 data, set "
                "parameter predict_model = 'CPU'"
            )
            assert expected_message in str(caught[-1].message)
def test_rf_classification_dask_fil_predict_proba(partitions_per_worker,
                                                  client):
    """Check ``predict`` and ``predict_proba`` of the ``cuRFC_mg`` model.

    Dataset size scales with the number of workers reported by the
    client's scheduler (1500 samples per worker, 150 of them for
    testing). Predicted classes must equal the argmax of the predicted
    probabilities, and the probability MSE may exceed scikit-learn's by
    at most 0.029.
    """
    worker_count = len(client.scheduler_info()['workers'])

    data, target = make_classification(
        n_samples=worker_count * 1500,
        n_features=30,
        n_clusters_per_class=1,
        n_informative=20,
        random_state=123,
        n_classes=2,
    )
    data = data.astype(np.float32)
    target = target.astype(np.int32)

    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=worker_count * 150, random_state=123
    )

    train_x_df, train_y_df = _prep_training_data(
        client, train_x, train_y, partitions_per_worker
    )
    test_x_df, _ = _prep_training_data(
        client, test_x, test_y, partitions_per_worker
    )

    forest = cuRFC_mg(
        n_bins=16,
        n_streams=1,
        n_estimators=40,
        max_depth=16,
    )
    forest.fit(train_x_df, train_y_df)

    gpu_labels = forest.predict(test_x_df).compute()
    gpu_labels = gpu_labels.to_numpy()
    gpu_proba = forest.predict_proba(test_x_df).compute()
    gpu_proba = gpu_proba.to_numpy()

    # Class predictions must agree with the most probable class.
    np.testing.assert_equal(gpu_labels, np.argmax(gpu_proba, axis=1))

    # Column 1 holds the label itself, column 0 its complement.
    true_proba = np.zeros(np.shape(gpu_proba))
    true_proba[:, 1] = test_y
    true_proba[:, 0] = 1.0 - test_y
    gpu_mse = mean_squared_error(true_proba, gpu_proba)

    reference = skrfc(n_estimators=40, max_depth=16, random_state=10)
    reference.fit(train_x, train_y)
    reference_proba = reference.predict_proba(test_x)
    reference_mse = mean_squared_error(true_proba, reference_proba)

    # The threshold is required because the test would otherwise fail
    # intermittently, with a maximum difference of 0.029 between the two
    # MSE values.
    assert gpu_mse <= reference_mse + 0.029
def test_rf_classification(small_clf, datatype, split_algo, rows_sample,
                           max_features):
    """Compare cuML random forest GPU predictions with CPU and sklearn.

    ``small_clf`` supplies ``(X, y)``; the data is cast to ``datatype``
    and split 80/20. GPU accuracy must be within 0.02 of the CPU predict
    path and, below 500000 rows, within 0.07 of scikit-learn.
    """
    data, target = small_clf
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(train_x, train_y)

    gpu_labels = forest.predict(
        test_x,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='auto',
    )
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))

    cpu_accuracy = accuracy_score(test_y, cpu_labels)
    gpu_accuracy = accuracy_score(test_y, gpu_labels)

    if data.shape[0] < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert gpu_accuracy >= (reference_accuracy - 0.07)

    assert gpu_accuracy >= (cpu_accuracy - 0.02)
def test_rf_classification_float64(datatype, column_info, nrows,
                                   convert_dtype):
    """Exercise the cuML forest with mixed train/test dtypes.

    ``datatype[0]`` is the dtype of the generated dataset and
    ``datatype[1]`` the dtype the test split is cast to. CPU accuracy is
    compared with scikit-learn (tolerance 0.07, below 500000 rows). GPU
    prediction is scored against the CPU result when the data is float32
    and ``convert_dtype`` is set; otherwise it must raise ``TypeError``.

    ``column_info`` is unpacked as ``(n_features, n_informative)``.
    """
    n_features, n_informative = column_info
    data, target = make_classification(
        n_samples=nrows,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=0,
        n_classes=2,
    )
    data = data.astype(datatype[0])
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )
    test_x = test_x.astype(datatype[1])

    # cuML random forest with default hyper-parameters.
    forest = curfc()
    forest.fit(train_x, train_y)
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    cpu_accuracy = accuracy_score(test_y, cpu_labels)

    # scikit-learn reference, only for smaller inputs.
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert cpu_accuracy >= (reference_accuracy - 0.07)

    gpu_predict_expected = datatype[0] == np.float32 and convert_dtype
    if gpu_predict_expected:
        gpu_labels = forest.predict(
            test_x, predict_model="GPU", convert_dtype=convert_dtype
        )
        gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))
        gpu_accuracy = accuracy_score(test_y, gpu_labels)
        assert gpu_accuracy >= (cpu_accuracy - 0.02)
    else:
        with pytest.raises(TypeError):
            forest.predict(
                test_x, predict_model="GPU", convert_dtype=convert_dtype
            )
def test_rf_predict_numpy(datatype, use_handle, split_algo, n_info, nrows,
                          ncols):
    """Fit and predict with the cuML forest on NumPy arrays.

    A five-class synthetic dataset is generated; the first 80% of the
    rows are used for training and the rest for testing. Below 500000
    rows the cuML accuracy must be within 0.07 of scikit-learn's.
    """
    n_train = np.int32(nrows * 0.8)
    data, target = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )

    # Positional (non-shuffled) split at the 80% mark.
    train_x = np.asarray(data[0:n_train, :]).astype(datatype)
    train_y = np.asarray(target[0:n_train, ]).astype(np.int32)
    test_x = np.asarray(data[n_train:, 0:]).astype(datatype)
    test_y = np.asarray(target[n_train:, ]).astype(np.int32)

    # Handle handed to the cuML model.
    handle, _stream = get_handle(use_handle)

    forest = curfc(
        max_features=1.0,
        n_bins=8,
        split_algo=split_algo,
        min_rows_per_node=2,
        n_estimators=30,
        handle=handle,
        max_leaves=-1,
    )
    forest.fit(train_x, train_y)
    cuml_accuracy = accuracy_score(test_y, forest.predict(test_x))

    if nrows < 500000:
        # scikit-learn reference model.
        reference = skrfc(
            n_estimators=40,
            max_depth=None,
            min_samples_split=2,
            max_features=1.0,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )

        # Compare the accuracy of the two models.
        assert cuml_accuracy >= (reference_accuracy - 0.07)
def test_rf_classification(datatype, split_algo, n_info, nrows, ncols,
                           max_depth, rows_sample):
    """Compare the cuML forest against scikit-learn at a given depth.

    The combination ``split_algo == 1`` with a negative ``max_depth`` is
    marked as an expected failure. A five-class synthetic dataset is split
    positionally 80/20. Below 500000 rows, and only when
    ``max_depth > 1``, the cuML accuracy must be within 0.07 of
    scikit-learn's.
    """
    if split_algo == 1 and max_depth < 0:
        pytest.xfail("Unlimited depth not supported with quantile")

    n_train = np.int32(nrows*0.8)
    data, target = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )

    # Positional (non-shuffled) split at the 80% mark.
    train_x = np.asarray(data[0:n_train, :]).astype(datatype)
    train_y = np.asarray(target[0:n_train, ]).astype(np.int32)
    test_x = np.asarray(data[n_train:, 0:]).astype(datatype)
    test_y = np.asarray(target[n_train:, ]).astype(np.int32)

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True)

    forest = curfc(
        max_features=1.0,
        rows_sample=rows_sample,
        n_bins=8,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=max_depth,
    )
    forest.fit(train_x, train_y)
    cuml_accuracy = accuracy_score(test_y, forest.predict(test_x))

    if nrows < 500000:
        # scikit-learn has no depth limit when max_depth is None.
        if max_depth > 0:
            reference_depth = max_depth
        else:
            reference_depth = None

        reference = skrfc(
            n_estimators=40,
            max_depth=reference_depth,
            min_samples_split=2,
            max_features=1.0,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )

        # Compare the accuracy of the two models.
        if max_depth > 1:
            assert cuml_accuracy >= (reference_accuracy - 0.07)
def test_rf_classification_multi_class(datatype, column_info, nrows,
                                       n_classes, type):
    """Check CPU predictions of the cuML forest on multi-class data.

    ``datatype[0]`` is the dtype of the generated dataset and
    ``datatype[1]`` the dtype the test split is cast to. When
    ``type == 'dataframe'`` the inputs are wrapped in cuDF objects.
    Below 500000 rows the CPU accuracy must be within 0.07 of
    scikit-learn's.

    ``column_info`` is unpacked as ``(n_features, n_informative)``.
    """
    n_features, n_informative = column_info
    data, target = make_classification(
        n_samples=nrows,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=0,
        n_classes=n_classes,
    )
    data = data.astype(datatype[0])
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )
    test_x = test_x.astype(datatype[1])

    # cuML random forest with default hyper-parameters.
    forest = curfc()
    if type == 'dataframe':
        train_x_df = cudf.DataFrame.from_gpu_matrix(rmm.to_device(train_x))
        train_y_df = cudf.Series(train_y)
        test_x_df = cudf.DataFrame.from_gpu_matrix(rmm.to_device(test_x))
        forest.fit(train_x_df, train_y_df)
        cpu_labels = forest.predict(
            test_x_df, predict_model="CPU"
        ).to_array()
    else:
        forest.fit(train_x, train_y)
        cpu_labels = forest.predict(test_x, predict_model="CPU")

    cpu_accuracy = accuracy_score(test_y, cpu_labels)

    # scikit-learn reference, only for smaller inputs.
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert cpu_accuracy >= (reference_accuracy - 0.07)
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Exercise the cuML forest with mixed train/test dtypes.

    ``datatype[0]`` is the dtype of the full dataset and ``datatype[1]``
    the dtype the test split is cast to. CPU accuracy is compared with
    scikit-learn (tolerance 0.07, below 500000 rows). GPU prediction is
    scored against the CPU result when the data is float32 and
    ``convert_dtype`` is set; otherwise it must raise ``TypeError``.
    """
    data, target = small_clf
    data = data.astype(datatype[0])
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )
    test_x = test_x.astype(datatype[1])

    # cuML random forest with default hyper-parameters.
    forest = curfc()
    forest.fit(train_x, train_y)
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    cpu_accuracy = accuracy_score(test_y, cpu_labels)

    # scikit-learn reference, only for smaller inputs.
    if data.shape[0] < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert cpu_accuracy >= (reference_accuracy - 0.07)

    gpu_predict_expected = datatype[0] == np.float32 and convert_dtype
    if gpu_predict_expected:
        gpu_labels = forest.predict(
            test_x, predict_model="GPU", convert_dtype=convert_dtype
        )
        gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))
        gpu_accuracy = accuracy_score(test_y, gpu_labels)
        assert gpu_accuracy >= (cpu_accuracy - 0.02)
    else:
        with pytest.raises(TypeError):
            forest.predict(
                test_x, predict_model="GPU", convert_dtype=convert_dtype
            )
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Exercise the cuML forest with mixed train/test dtypes.

    ``datatype[0]`` is the dtype of the full dataset and ``datatype[1]``
    the dtype the test split is cast to. CPU accuracy is compared with
    scikit-learn (tolerance 0.07, below 500000 rows), and the GPU predict
    path is always run and must stay within 0.07 of the CPU accuracy.
    """
    data, target = small_clf
    data = data.astype(datatype[0])
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )
    test_x = test_x.astype(datatype[1])

    # cuML random forest with default hyper-parameters.
    forest = curfc()
    forest.fit(train_x, train_y)
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    cpu_accuracy = accuracy_score(test_y, cpu_labels)

    # scikit-learn reference, only for smaller inputs.
    if data.shape[0] < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert cpu_accuracy >= (reference_accuracy - 0.07)

    # GPU based prediction, reshaped to match the CPU output.
    gpu_labels = forest.predict(
        test_x, predict_model="GPU", convert_dtype=convert_dtype
    )
    gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))
    gpu_accuracy = accuracy_score(test_y, gpu_labels)

    # Tolerance to be changed to 0.02; see issue #3910:
    # https://github.com/rapidsai/cuml/issues/3910
    assert gpu_accuracy >= (cpu_accuracy - 0.07)
def test_rf_classification_sparse(small_clf, datatype, fil_sparse_format,
                                  algo):
    """Check GPU prediction for different ``fil_sparse_format`` values.

    Combinations with a falsy or ``'not_supported'`` sparse format, or
    with ``algo`` set to ``'tree_reorg'``/``'batch_tree_reorg'``, must
    raise ``ValueError``. Otherwise the GPU accuracy must equal that of
    the converted FIL model, the converted treelite model must report the
    expected tree and feature counts, and (below 500000 rows) the GPU
    accuracy must be within 0.07 of scikit-learn's.
    """
    tree_count = 50
    data, target = small_clf
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=tree_count,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
    )
    forest.fit(train_x, train_y)

    predict_kwargs = dict(
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )

    expects_error = (
        not fil_sparse_format
        or algo == 'tree_reorg'
        or algo == 'batch_tree_reorg'
        or fil_sparse_format == 'not_supported'
    )
    if expects_error:
        with pytest.raises(ValueError):
            forest.predict(test_x, **predict_kwargs)
        return

    gpu_labels = forest.predict(test_x, **predict_kwargs)
    gpu_labels = np.reshape(gpu_labels, np.shape(test_y))
    gpu_accuracy = accuracy_score(test_y, gpu_labels)

    # The standalone FIL model must reproduce the forest's GPU accuracy.
    fil_model = forest.convert_to_fil_model()
    with cuml.using_output_type("numpy"):
        fil_model_labels = fil_model.predict(test_x)
        fil_model_accuracy = accuracy_score(test_y, fil_model_labels)
        assert gpu_accuracy == fil_model_accuracy

    treelite_model = forest.convert_to_treelite_model()
    assert tree_count == treelite_model.num_trees
    assert data.shape[1] == treelite_model.num_features

    if data.shape[0] < 500000:
        reference = skrfc(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert gpu_accuracy >= (reference_accuracy - 0.07)
def test_rf_classification_sparse(datatype, split_algo, rows_sample, nrows,
                                  column_info, max_features,
                                  fil_sparse_format, algo):
    """Check GPU prediction for different ``fil_sparse_format`` values.

    Combinations with a falsy or ``'not_supported'`` sparse format, or
    with ``algo`` set to ``'tree_reorg'``/``'batch_tree_reorg'``, must
    raise ``ValueError``. Otherwise the GPU accuracy must equal that of
    the converted FIL model, the converted treelite model must report the
    expected tree and feature counts, and the GPU accuracy must be within
    0.02 of the CPU path and (below 500000 rows) within 0.07 of
    scikit-learn's.

    ``column_info`` is unpacked as ``(n_features, n_informative)``.
    """
    n_features, n_informative = column_info
    tree_count = 50

    data, target = make_classification(
        n_samples=nrows,
        n_features=n_features,
        n_clusters_per_class=1,
        n_informative=n_informative,
        random_state=123,
        n_classes=2,
    )
    data = data.astype(datatype)
    target = target.astype(np.int32)
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, train_size=0.8, random_state=0
    )

    # Handle handed to the cuML model.
    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=tree_count,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
    )
    forest.fit(train_x, train_y)
    cpu_labels = forest.predict(test_x, predict_model="CPU")
    cpu_accuracy = accuracy_score(test_y, cpu_labels)

    predict_kwargs = dict(
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )

    expects_error = (
        not fil_sparse_format
        or algo == 'tree_reorg'
        or algo == 'batch_tree_reorg'
        or fil_sparse_format == 'not_supported'
    )
    if expects_error:
        with pytest.raises(ValueError):
            forest.predict(test_x, **predict_kwargs)
        return

    gpu_labels = forest.predict(test_x, **predict_kwargs)
    gpu_labels = np.reshape(gpu_labels, np.shape(cpu_labels))
    gpu_accuracy = accuracy_score(test_y, gpu_labels)

    # The standalone FIL model must reproduce the forest's GPU accuracy.
    fil_model = forest.convert_to_fil_model()
    output_kind = 'numpy'
    fil_model_labels = fil_model.predict(test_x, output_type=output_kind)
    fil_model_accuracy = accuracy_score(test_y, fil_model_labels)
    assert gpu_accuracy == fil_model_accuracy

    treelite_model = forest.convert_to_treelite_model()
    assert tree_count == treelite_model.num_trees
    assert n_features == treelite_model.num_features
    del treelite_model

    if nrows < 500000:
        reference = skrfc(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(train_x, train_y)
        reference_accuracy = accuracy_score(
            test_y, reference.predict(test_x)
        )
        assert gpu_accuracy >= (reference_accuracy - 0.07)

    assert gpu_accuracy >= (cpu_accuracy - 0.02)