def test_rf_host_memory_leak(large_clf, estimator_type):
    """Check that re-fitting one forest does not keep growing host memory.

    The model is fitted once to establish a resident-set-size (RSS)
    baseline, then fitted five more times. The RSS reported by ``psutil``
    after those fits must exceed the baseline by less than ``2e6``.
    The test is skipped when ``psutil`` cannot be imported.
    """
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    this_process = psutil.Process(os.getpid())

    features, targets = large_clf
    features = features.astype(np.float32)

    # Anything other than 'classification' is treated as regression.
    is_classifier = estimator_type == 'classification'
    forest_cls = curfc if is_classifier else curfr
    forest = forest_cls(max_depth=10, n_estimators=100, seed=123)
    targets = targets.astype(np.int32 if is_classifier else np.float32)

    # The first fit is the baseline: memory usage after the later fits
    # should not significantly exceed it.
    forest.fit(features, targets)
    gc.collect()
    baseline_rss = this_process.memory_info().rss

    for _ in range(5):
        forest.fit(features, targets)

    gc.collect()
    final_rss = this_process.memory_info().rss

    # Some tiny allocations may occur, but we should not leak
    # without bounds, which previously happened.
    assert (final_rss - baseline_rss) < 2e6
def test_cuml_classifier(self):
    """
    Validate cuML classifier with wrapper
    """
    # NOTE: this is currently untested as I wasn't able to install cuML
    dataset_kwargs = dict(
        n_samples=400,
        n_features=10,
        n_informative=2,
        n_redundant=3,
        n_classes=2,
        n_clusters_per_class=2,
        random_state=8311982,
    )
    X, y = make_classification(**dataset_kwargs)
    X_train, X_test, y_train, y_test = tts(X, y)

    # Move every split into cudf containers before handing them over.
    X_train = cudf.DataFrame(X_train)
    y_train = cudf.Series(y_train)
    X_test = cudf.DataFrame(X_test)
    y_test = cudf.Series(y_test)

    wrapped = classifier(curfc(n_estimators=40, max_depth=8, max_features=1))
    visualizer = classification_report(
        wrapped, X_train, y_train, X_test, y_test, show=False
    )
    assert is_fitted(visualizer)
def test_rf_classification_default(datatype, column_info, nrows):
    """Fit a default-parameter cuML forest and compare prediction accuracy.

    GPU ("GPU" predict_model) accuracy must be within 0.02 of the CPU
    prediction accuracy and, for fewer than 500000 rows, within 0.07 of a
    scikit-learn forest trained on the same split.
    """
    ncols, n_info = column_info
    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=0,
        n_classes=2,
    )
    X, y = X.astype(datatype), y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # cuML random forest with default hyper-parameters
    forest = curfc()
    forest.fit(X_train, y_train)
    gpu_preds = forest.predict(X_test, predict_model="GPU")
    cpu_preds = forest.predict(X_test, predict_model="CPU")
    fil_acc = accuracy_score(y_test, gpu_preds)
    cu_acc = accuracy_score(y_test, cpu_preds)

    # scikit-learn reference, only for moderately sized inputs
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)

    assert fil_acc >= (cu_acc - 0.02)
def test_cuml_rf_classifier(n_classes):
    """Expect ``TreeExplainer`` to raise ``RuntimeError`` for a cuML RF classifier."""
    n_samples = 100
    X, y = make_classification(
        n_samples=n_samples,
        n_features=8,
        n_informative=8,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        random_state=2021,
    )
    X = X.astype(np.float32)
    y = y.astype(np.float32)

    forest = curfc(
        max_features=1.0,
        max_samples=0.1,
        n_bins=128,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=10,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    forest.fit(X, y)

    with pytest.raises(RuntimeError):
        # cuML RF classifier is not supported yet
        TreeExplainer(model=forest).shap_values(X)
def test_accuracy(nrows, ncols, n_info, datatype):
    """Check that the cuML and scikit-learn accuracy scorers agree.

    A cuML forest is trained on the first 80% of a five-class dataset and
    its predictions on the remainder are scored with both ``cu_acc_score``
    and ``sk_acc_score``; the two scores must compare equal.
    """
    use_handle = True
    split_at = np.int32(nrows * 0.8)

    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )

    # Leading rows are used for training, the tail for evaluation.
    X_train = np.asarray(X[:split_at]).astype(datatype)
    y_train = np.asarray(y[:split_at]).astype(np.int32)
    X_test = np.asarray(X[split_at:]).astype(datatype)
    y_test = np.asarray(y[split_at:]).astype(np.int32)

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle)

    forest = curfc(
        max_features=1.0,
        n_bins=8,
        split_algo=0,
        split_criterion=0,
        min_rows_per_node=2,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=-1,
    )
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    # Score the same predictions with both implementations.
    score_from_cuml = cu_acc_score(y_test, predictions)
    score_from_sklearn = sk_acc_score(y_test, predictions)
    assert array_equal(score_from_cuml, score_from_sklearn)
def test_create_classification_model(max_features, max_depth, n_estimators,
                                     n_bins):
    """Round-trip hyper-parameters through ``get_params``/``set_params``.

    Parameters read from a configured classifier are applied to a second,
    default-constructed classifier; the four configured values must be
    reported unchanged by the second model.
    """
    configured = curfc(
        max_features=max_features,
        n_bins=n_bins,
        n_estimators=n_estimators,
        max_depth=max_depth,
    )
    params = configured.get_params()

    clone = curfc()
    clone.set_params(**params)
    verified_params = clone.get_params()

    for key in ('max_features', 'max_depth', 'n_estimators', 'n_bins'):
        assert params[key] == verified_params[key]
def test_rf_classification_seed(datatype, column_info, nrows):
    """Check that two forests built with the same seed predict identically.

    For 20 randomly drawn seeds, two classifiers are constructed with that
    seed, fitted on the same training split and asked to predict on the
    same test split with both the "GPU" and "CPU" predict models. The
    predictions (and therefore the accuracies) of the two runs must match
    exactly.
    """
    ncols, n_info = column_info
    X, y = make_classification(n_samples=nrows, n_features=ncols,
                               n_clusters_per_class=1,
                               n_informative=n_info,
                               random_state=0, n_classes=2)
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    def fit_and_predict(seed):
        """Fit a fresh seeded forest; return (FIL/GPU preds, CPU preds)."""
        model = curfc(seed=seed, n_streams=1)
        model.fit(X_train, y_train)
        # predict using FIL
        fil_preds = model.predict(X_test, predict_model="GPU")
        cu_preds = model.predict(X_test, predict_model="CPU")
        # Bring the GPU output to the CPU output's shape before comparing.
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        return fil_preds, cu_preds

    for _ in range(20):
        # randint needs integer bounds: a float such as 1e5 is deprecated
        # since Python 3.10 and raises TypeError from Python 3.12 on.
        seed = random.randint(100, 100000)

        fil_preds_orig, cu_preds_orig = fit_and_predict(seed)
        fil_preds_rerun, cu_preds_rerun = fit_and_predict(seed)

        fil_acc_orig = accuracy_score(y_test, fil_preds_orig)
        cu_acc_orig = accuracy_score(y_test, cu_preds_orig)
        fil_acc_rerun = accuracy_score(y_test, fil_preds_rerun)
        cu_acc_rerun = accuracy_score(y_test, cu_preds_rerun)

        assert fil_acc_orig == fil_acc_rerun
        assert cu_acc_orig == cu_acc_rerun
        assert (fil_preds_orig == fil_preds_rerun).all()
        assert (cu_preds_orig == cu_preds_rerun).all()
def test_concat_memory_leak(large_clf, estimator_type):
    """Check that repeated treelite-handle concatenation does not leak.

    Ten forests are fitted, the handles of nine of them are concatenated
    into the first one once to establish an RSS baseline, and then ten
    more times. The final RSS must exceed the baseline by less than
    ``1e6``. The test is skipped when ``psutil`` cannot be imported.
    """
    import gc
    import os

    try:
        import psutil
    except ImportError:
        pytest.skip("psutil not installed")

    process = psutil.Process(os.getpid())

    X, y = large_clf
    X = X.astype(np.float32)

    # Pick the estimator class and the label dtype that goes with it.
    n_models = 10
    if estimator_type == 'classification':
        forest_cls, label_dtype = curfc, np.int32
    elif estimator_type == 'regression':
        forest_cls, label_dtype = curfr, np.float32
    else:
        assert False

    # Build a series of RF models
    base_models = [
        forest_cls(max_depth=10, n_estimators=100, random_state=123)
        for _ in range(n_models)
    ]
    y = y.astype(label_dtype)

    for forest in base_models:
        forest.fit(X, y)

    # The first model absorbs the handles of all the others.
    init_model, *concat_models = base_models
    other_handles = [
        forest._obtain_treelite_handle() for forest in concat_models
    ]

    # Concatenate once up front - this is our baseline and memory usage
    # should not significantly exceed it after later concatenations.
    init_model._concatenate_treelite_handle(other_handles)
    gc.collect()
    initial_baseline_mem = process.memory_info().rss

    # Just concatenate over and over in a loop
    for rep in range(10):
        init_model._concatenate_treelite_handle(other_handles)
        gc.collect()
        used_mem = process.memory_info().rss
        logger.debug("memory at rep %2d: %d m" % (
            rep, (used_mem - initial_baseline_mem) / 1e6))

    gc.collect()
    used_mem = process.memory_info().rss
    logger.info("Final memory delta: %d" % (
        (used_mem - initial_baseline_mem) / 1e6))
    assert (used_mem - initial_baseline_mem) < 1e6
def test_rf_classification(small_clf, datatype, split_algo, max_samples,
                           max_features, use_experimental_backend):
    """Train a cuML forest, check backend messages and prediction accuracy.

    Standard output produced during ``fit`` is captured and checked against
    the messages expected for the requested backend / split algorithm.
    Afterwards GPU accuracy must be within 0.02 of CPU accuracy and, for
    fewer than 500000 samples, within 0.07 of a scikit-learn forest.
    """
    use_handle = True

    X, y = small_clf
    X, y = X.astype(datatype), y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle, n_streams=1)

    forest = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        use_experimental_backend=use_experimental_backend,
    )

    # Capture whatever fit() prints so the backend selection can be checked.
    buffer = io.StringIO()
    with redirect_stdout(buffer):
        forest.fit(X_train, y_train)
    captured_stdout = buffer.getvalue()

    if not use_experimental_backend:
        assert captured_stdout == ''
    elif split_algo != 1:
        # Unsupported split algorithm: a reason and a fallback notice
        # are both expected.
        assert ('Experimental backend does not yet support histogram ' +
                'split algorithm' in captured_stdout)
        assert ('Not using the experimental backend due to above ' +
                'mentioned reason(s)' in captured_stdout)
    else:
        assert ('Using experimental backend for growing trees'
                in captured_stdout)

    fil_preds = forest.predict(
        X_test,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='auto',
    )
    cu_preds = forest.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    if X.shape[0] < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)

    assert fil_acc >= (cuml_acc - 0.02)
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    """Shared body for RF classification checks on arrays or cudf inputs.

    ``datatype`` is indexed as ``datatype[0]`` (dtype applied to the full
    feature matrix before the split) and ``datatype[1]`` (dtype applied to
    the test features). With ``array_type == 'dataframe'`` the data is
    wrapped in cudf objects before fitting and predicting; otherwise the
    arrays are used directly. The helper checks that GPU class predictions
    equal the argmax of the GPU probabilities, that CPU and GPU accuracies
    are approximately equal, and (for fewer than 500000 labels) compares
    against a scikit-learn forest.
    """
    X, y = fixture
    X = X.astype(datatype[0])
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_test = X_test.astype(datatype[1])

    handle, _ = get_handle(True, n_streams=1)

    forest = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )

    if array_type == 'dataframe':
        train_frame = cudf.DataFrame(X_train)
        train_labels = cudf.Series(y_train)
        test_frame = cudf.DataFrame(X_test)

        forest.fit(train_frame, train_labels)
        # Convert the cudf outputs to host arrays for the comparisons below.
        cu_proba_gpu = np.array(
            forest.predict_proba(test_frame).as_gpu_matrix()
        )
        cu_preds_cpu = forest.predict(
            test_frame, predict_model="CPU"
        ).to_array()
        cu_preds_gpu = forest.predict(
            test_frame, predict_model="GPU"
        ).to_array()
    else:
        forest.fit(X_train, y_train)
        cu_proba_gpu = forest.predict_proba(X_test)
        cu_preds_cpu = forest.predict(X_test, predict_model="CPU")
        cu_preds_gpu = forest.predict(X_test, predict_model="GPU")

    # Predicted class must be the most probable class.
    np.testing.assert_array_equal(
        cu_preds_gpu, np.argmax(cu_proba_gpu, axis=1)
    )

    cu_acc_cpu = accuracy_score(y_test, cu_preds_cpu)
    cu_acc_gpu = accuracy_score(y_test, cu_preds_gpu)
    assert cu_acc_cpu == pytest.approx(cu_acc_gpu, abs=0.01, rel=0.1)

    # scikit-learn reference model, only for moderately sized inputs
    if y.size < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        sk_proba = reference.predict_proba(X_test)

        assert cu_acc_cpu >= sk_acc - 0.07
        assert cu_acc_gpu >= sk_acc - 0.07
        # 0.06 is the highest relative error observed on CI, within
        # 0.0061 absolute error boundaries seen previously
        check_predict_proba(cu_proba_gpu, sk_proba, y_test, 0.1)
def test_rf_classification_seed(small_clf, datatype):
    """Check that two forests built with the same random_state agree.

    For 8 randomly drawn seeds, two classifiers are constructed with that
    ``random_state``, fitted on the same training split and asked to
    predict on the same test split with both the "GPU" and "CPU" predict
    models. The predictions (and therefore the accuracies) of the two runs
    must match exactly.
    """
    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8,
                                                        random_state=0)

    def fit_and_predict(seed):
        """Fit a fresh seeded forest; return (FIL/GPU preds, CPU preds)."""
        model = curfc(random_state=seed, n_streams=1)
        model.fit(X_train, y_train)
        # predict using FIL
        fil_preds = model.predict(X_test, predict_model="GPU")
        cu_preds = model.predict(X_test, predict_model="CPU")
        # Bring the GPU output to the CPU output's shape before comparing.
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        return fil_preds, cu_preds

    for _ in range(8):
        # randint needs integer bounds: a float such as 1e5 is deprecated
        # since Python 3.10 and raises TypeError from Python 3.12 on.
        seed = random.randint(100, 100000)

        fil_preds_orig, cu_preds_orig = fit_and_predict(seed)
        fil_preds_rerun, cu_preds_rerun = fit_and_predict(seed)

        fil_acc_orig = accuracy_score(y_test, fil_preds_orig)
        cu_acc_orig = accuracy_score(y_test, cu_preds_orig)
        fil_acc_rerun = accuracy_score(y_test, fil_preds_rerun)
        cu_acc_rerun = accuracy_score(y_test, cu_preds_rerun)

        assert fil_acc_orig == fil_acc_rerun
        assert cu_acc_orig == cu_acc_rerun
        assert (fil_preds_orig == fil_preds_rerun).all()
        assert (cu_preds_orig == cu_preds_rerun).all()
def _setup(self, config):
    """Build the cuML model and stage the train/test data on the instance.

    ``config`` may supply ``"estimators"`` (default 40) and ``"depth"``
    (default 16). The data splits are fetched via ``get_pinned_object``
    using ``data_id``, a name that is not defined in this block.
    """
    train_features, test_features, train_labels, test_labels = \
        get_pinned_object(data_id)

    n_estimators = config.get("estimators", 40)
    max_depth = config.get("depth", 16)
    self.cuml_model = curfc(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=1.0,
    )

    # Features and training labels are converted to cudf objects;
    # the test labels are stored as received.
    self.X_cudf_train = cudf.DataFrame.from_pandas(train_features)
    self.X_cudf_test = cudf.DataFrame.from_pandas(test_features)
    self.y_cudf_train = cudf.Series(train_labels.values)
    self.y_test = test_labels
def test_rf_classification_proba(datatype, split_algo, rows_sample, nrows,
                                 column_info, max_features):
    """Compare ``predict_proba`` of a cuML forest with scikit-learn.

    The mean squared error between the predicted probabilities and the
    one-hot encoded test labels is computed for both libraries; for fewer
    than 500000 rows the cuML error may exceed the scikit-learn error by
    at most 0.0061.
    """
    use_handle = True
    ncols, n_info = column_info

    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=2,
    )
    X, y = X.astype(datatype), y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle, n_streams=1)

    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X_train, y_train)

    fil_preds_proba = forest.predict_proba(
        X_test, output_class=True, threshold=0.5, algo='auto'
    )

    # One-hot encode the binary labels: column 1 holds the label itself,
    # column 0 its complement.
    y_proba = np.zeros(np.shape(fil_preds_proba))
    y_proba[:, 1] = y_test
    y_proba[:, 0] = 1.0 - y_test
    fil_mse = mean_squared_error(y_proba, fil_preds_proba)

    if nrows < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_mse = mean_squared_error(y_proba, reference.predict_proba(X_test))

        # Max difference of 0.0061 is seen between the mse values of
        # predict proba function of fil and sklearn
        assert fil_mse <= (sk_mse + 0.0061)
def test_rf_classification(datatype, split_algo, rows_sample, nrows,
                           column_info, max_features):
    """Train a seeded cuML forest and compare GPU, CPU and sklearn accuracy.

    GPU predictions use ``algo='BATCH_TREE_REORG'``. GPU accuracy must be
    within 0.02 of CPU accuracy and, for fewer than 500000 rows, within
    0.07 of a scikit-learn forest trained on the same split.
    """
    use_handle = True
    ncols, n_info = column_info

    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=2,
    )
    X, y = X.astype(datatype), y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle, n_streams=1)

    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        seed=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X_train, y_train)

    fil_preds = forest.predict(
        X_test,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='BATCH_TREE_REORG',
    )
    cu_preds = forest.predict(X_test, predict_model="CPU")

    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    if nrows < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)

    assert fil_acc >= (cuml_acc - 0.02)
def test_rf_nbins_small(small_clf):
    """Fit a default cuML forest on only three training samples.

    The test passes as long as ``fit`` completes without raising.
    """
    X, y = small_clf
    X, y = X.astype(np.float32), y.astype(np.int32)
    X_train, _, y_train, _ = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Only the first three rows of the training split are used.
    tiny_X = X_train[0:3, :]
    tiny_y = y_train[0:3]

    forest = curfc()
    forest.fit(tiny_X, tiny_y)
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    """Train a cuML forest and compare GPU, CPU and sklearn accuracy.

    GPU accuracy must be within 0.07 of CPU accuracy and, for fewer than
    500000 samples, within 0.07 of a scikit-learn forest trained on the
    same split.
    """
    use_handle = True

    X, y = small_clf
    X, y = X.astype(datatype), y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle, n_streams=1)

    forest_kwargs = dict(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest = curfc(**forest_kwargs)
    forest.fit(X_train, y_train)

    cu_preds_gpu = forest.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cu_preds = forest.predict(X_test, predict_model="CPU")
    # Bring the GPU output to the CPU output's shape before scoring.
    cu_preds_gpu = np.reshape(cu_preds_gpu, np.shape(cu_preds))

    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, cu_preds_gpu)

    if X.shape[0] < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)

    # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
    assert fil_acc >= (cuml_acc - 0.07)
def test_rf_memory_leakage(fil_sparse_format, column_info, nrows):
    """Check that repeated GPU predictions do not reduce free device memory.

    After a warm-up fit the free device memory is recorded. Two more
    models are fitted and the last one predicts ``n_iter`` times; after
    every prediction the free memory must equal the recorded value.

    NOTE(review): ``fil_sparse_format`` is accepted but not used anywhere
    in this body - confirm whether it should be forwarded to ``predict``.
    """
    n_iter = 30
    datatype = np.float32
    use_handle = True
    ncols, n_info = column_info

    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=0,
        n_classes=2,
    )
    X, y = X.astype(datatype), y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle, n_streams=1)

    def free_device_memory():
        return cuda.current_context().get_memory_info()[0]

    # Warmup. Some modules that are used in RF allocate space on the device
    # and consume memory. This is to make sure that the allocation is done
    # before the first call to get_memory_info.
    warmup_model = curfc(handle=handle)
    warmup_model.fit(X_train, y_train)
    free_mem = free_device_memory()

    second_model = curfc(handle=handle)
    second_model.fit(X_train, y_train)

    # Memory consumed after fitting the cuML RF model.
    # NOTE(review): this value is overwritten before it is ever checked.
    delta_mem = free_mem - free_device_memory()

    predictor = curfc(handle=handle)
    predictor.fit(X_train, y_train)

    for _ in range(n_iter):
        predictor.predict(X_train, predict_model="GPU")
        handle.sync()
        delta_mem = free_mem - free_device_memory()
        assert delta_mem == 0
def test_rf_multiclass_classifier_gtil_integration(tmpdir):
    """Round-trip a multiclass forest through a treelite checkpoint.

    The forest's ``predict_proba`` output on the iris data must match, to
    5 decimals, what ``treelite.gtil.predict`` returns for the model
    deserialized from the checkpoint written to ``tmpdir``.
    """
    features, labels = load_iris(return_X_y=True)
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)

    forest = curfc(max_depth=3, random_state=0, n_estimators=10)
    forest.fit(features, labels)
    expected_prob = forest.predict_proba(features)

    # Serialize the trained forest and load it back through treelite.
    checkpoint_path = os.path.join(tmpdir, 'checkpoint.tl')
    exported = forest.convert_to_treelite_model()
    exported.to_treelite_checkpoint(checkpoint_path)
    restored = treelite.Model.deserialize(checkpoint_path)

    out_prob = treelite.gtil.predict(restored, features, pred_margin=True)
    np.testing.assert_almost_equal(out_prob, expected_prob, decimal=5)
def test_rf_printing(capfd, n_estimators, detailed_printing):
    """Check that printing a forest emits one 'Tree #' line per estimator.

    Depending on ``detailed_printing`` either ``print_detailed`` or
    ``print_summary`` is called; the captured output must be non-empty and
    contain exactly ``n_estimators`` lines that start with ``Tree #``.
    """
    X, y = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )
    X, y = X.astype(np.float32), y.astype(np.int32)

    # Create a handle for the cuml model
    handle, _ = get_handle(True, n_streams=1)

    forest = curfc(
        handle=handle,
        max_features=1.0,
        rows_sample=1.0,
        n_bins=16,
        split_algo=0,
        split_criterion=0,
        min_rows_per_node=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X, y)

    printer = forest.print_detailed if detailed_printing \
        else forest.print_summary
    printer()

    # Read the captured output
    printed_output = capfd.readouterr().out

    # Test 1: Output is non-zero
    assert printed_output != ''

    # Test 2: Correct number of trees are printed
    tree_count = sum(
        1
        for line in printed_output.split('\n')
        if line.strip().startswith('Tree #')
    )
    assert n_estimators == tree_count
def test_rf_classification_float64(small_clf, datatype, convert_dtype):
    """Exercise prediction with mixed float32/float64 train and test data.

    ``datatype[0]`` is applied to the full feature matrix before the
    split and ``datatype[1]`` to the test features. CPU prediction is
    always compared with scikit-learn (for fewer than 500000 samples).
    GPU prediction is scored when training data is float32 and
    ``convert_dtype`` is truthy; otherwise, for float64 test data, the
    last recorded warning must contain the CPU-fallback message.
    """
    X, y = small_clf
    X = X.astype(datatype[0])
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_test = X_test.astype(datatype[1])

    # cuML random forest with default hyper-parameters
    forest = curfc()
    forest.fit(X_train, y_train)
    cu_preds = forest.predict(X_test, predict_model="CPU")
    cu_acc = accuracy_score(y_test, cu_preds)

    # scikit-learn reference, only for moderately sized inputs
    if X.shape[0] < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        assert cu_acc >= (sk_acc - 0.07)

    gpu_predict_possible = datatype[0] == np.float32 and convert_dtype
    if gpu_predict_possible:
        # predict using cuML's GPU based prediction
        fil_preds = forest.predict(
            X_test, predict_model="GPU", convert_dtype=convert_dtype
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_acc = accuracy_score(y_test, fil_preds)
        # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
        assert fil_acc >= (cu_acc - 0.07)
    elif datatype[1] == np.float64:
        # if GPU predict cannot be used, display warning and use CPU predict
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter("always")
            fil_preds = forest.predict(
                X_test, predict_model="GPU", convert_dtype=convert_dtype
            )
        expected_message = ("GPU based predict only accepts "
                            "np.float32 data. The model was "
                            "trained on np.float64 data hence "
                            "cannot use GPU-based prediction! "
                            "\nDefaulting to CPU-based Prediction. "
                            "\nTo predict on float-64 data, set "
                            "parameter predict_model = 'CPU'")
        assert expected_message in str(caught[-1].message)
def test_rf_get_text(n_estimators, detailed_text):
    """Check that the forest's text dump has one 'Tree #' line per estimator.

    Depending on ``detailed_text`` either ``get_detailed_text`` or
    ``get_summary_text`` is used; the text must be non-empty and contain
    exactly ``n_estimators`` lines that start with ``Tree #``.
    """
    X, y = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )
    X, y = X.astype(np.float32), y.astype(np.int32)

    # Create a handle for the cuml model
    handle, _ = get_handle(True, n_streams=1)

    forest = curfc(
        handle=handle,
        max_features=1.0,
        max_samples=1.0,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X, y)

    text_getter = forest.get_detailed_text if detailed_text \
        else forest.get_summary_text
    text_output = text_getter()

    # Test 1: Output is non-zero
    assert text_output != ""

    # Test 2: Correct number of trees are printed
    tree_count = sum(
        1
        for line in text_output.split("\n")
        if line.strip().startswith("Tree #")
    )
    assert n_estimators == tree_count
def test_rf_classification(small_clf, datatype, split_algo, rows_sample,
                           max_features):
    """Train a cuML forest and compare GPU, CPU and sklearn accuracy.

    GPU accuracy must be within 0.02 of CPU accuracy and, for fewer than
    500000 samples, within 0.07 of a scikit-learn forest trained on the
    same split.
    """
    use_handle = True

    X, y = small_clf
    X, y = X.astype(datatype), y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle, n_streams=1)

    forest = curfc(
        max_features=max_features,
        rows_sample=rows_sample,
        n_bins=16,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X_train, y_train)

    fil_preds = forest.predict(
        X_test,
        predict_model="GPU",
        output_class=True,
        threshold=0.5,
        algo='auto',
    )
    cu_preds = forest.predict(X_test, predict_model="CPU")
    # Bring the GPU output to the CPU output's shape before scoring.
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))

    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)

    if X.shape[0] < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        assert fil_acc >= (sk_acc - 0.07)

    assert fil_acc >= (cuml_acc - 0.02)
def test_for_memory_leak():
    """Fit and predict once more and require free device memory unchanged.

    Relies on ``handle``, ``X_train``, ``y_train``, ``X_test``,
    ``free_mem`` and ``fil_sparse_format`` being provided by an enclosing
    scope that is not part of this block.
    """
    def consumed_memory():
        # Difference between the reference free memory and the current one.
        return free_mem - cuda.current_context().get_memory_info()[0]

    forest = curfc(handle=handle)
    forest.fit(X_train, y_train)
    handle.sync()  # just to be sure

    # Nothing may remain allocated after fitting the cuML model
    delta_mem = consumed_memory()
    assert delta_mem == 0

    for _ in range(2):
        forest.predict(
            X_test,
            predict_model="GPU",
            fil_sparse_format=fil_sparse_format,
        )
        handle.sync()  # just to be sure

        # Nothing may remain allocated after predicting with the cuML model
        delta_mem = consumed_memory()
        assert delta_mem == 0
def test_rf_classification_float64(datatype, column_info, nrows,
                                   convert_dtype):
    """Exercise prediction with mixed float32/float64 train and test data.

    ``datatype[0]`` is applied to the full feature matrix before the
    split and ``datatype[1]`` to the test features. CPU prediction is
    always compared with scikit-learn (for fewer than 500000 rows). GPU
    prediction is scored when training data is float32 and
    ``convert_dtype`` is truthy; in every other case GPU prediction is
    expected to raise ``TypeError``.
    """
    ncols, n_info = column_info
    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=0,
        n_classes=2,
    )
    X = X.astype(datatype[0])
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    X_test = X_test.astype(datatype[1])

    # cuML random forest with default hyper-parameters
    forest = curfc()
    forest.fit(X_train, y_train)
    cu_preds = forest.predict(X_test, predict_model="CPU")
    cu_acc = accuracy_score(y_test, cu_preds)

    # scikit-learn reference, only for moderately sized inputs
    if nrows < 500000:
        reference = skrfc(max_depth=16, random_state=10)
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))
        assert cu_acc >= (sk_acc - 0.07)

    gpu_predict_possible = datatype[0] == np.float32 and convert_dtype
    if not gpu_predict_possible:
        with pytest.raises(TypeError):
            fil_preds = forest.predict(
                X_test, predict_model="GPU", convert_dtype=convert_dtype
            )
    else:
        # predict using cuML's GPU based prediction
        fil_preds = forest.predict(
            X_test, predict_model="GPU", convert_dtype=convert_dtype
        )
        fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
        fil_acc = accuracy_score(y_test, fil_preds)
        assert fil_acc >= (cu_acc - 0.02)
def test_multiple_fits_classification(large_clf, n_estimators, n_bins):
    """Check that fitting twice leaves ``n_estimators``/``n_bins`` intact."""
    datatype = np.float32

    X, y = large_clf
    X, y = X.astype(datatype), y.astype(np.int32)

    forest = curfc(n_bins=n_bins, n_estimators=n_estimators, max_depth=10)

    # Calling multiple fits
    for _ in range(2):
        forest.fit(X, y)

    # Check if params are still intact
    reported = forest.get_params()
    assert reported['n_estimators'] == n_estimators
    assert reported['n_bins'] == n_bins
def test_rf_predict_numpy(datatype, use_handle, split_algo, n_info, nrows,
                          ncols):
    """Train on numpy arrays and compare accuracy with scikit-learn.

    The first 80% of a five-class dataset is used for training and the
    rest for evaluation; for fewer than 500000 rows the cuML accuracy
    must be within 0.07 of the scikit-learn accuracy.
    """
    split_at = np.int32(nrows * 0.8)
    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )

    # Leading rows are used for training, the tail for evaluation.
    X_train = np.asarray(X[:split_at]).astype(datatype)
    y_train = np.asarray(y[:split_at]).astype(np.int32)
    X_test = np.asarray(X[split_at:]).astype(datatype)
    y_test = np.asarray(y[split_at:]).astype(np.int32)

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle)

    forest = curfc(
        max_features=1.0,
        n_bins=8,
        split_algo=split_algo,
        min_rows_per_node=2,
        n_estimators=30,
        handle=handle,
        max_leaves=-1,
    )
    forest.fit(X_train, y_train)
    cu_acc = accuracy_score(y_test, forest.predict(X_test))

    if nrows < 500000:
        # scikit-learn reference model
        reference = skrfc(
            n_estimators=40,
            max_depth=None,
            min_samples_split=2,
            max_features=1.0,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))

        # compare the accuracy of the two models
        assert cu_acc >= (sk_acc - 0.07)
def test_cuml_rf_classifier(n_classes, input_type):
    """Check that SHAP values plus expected value reproduce predict_proba.

    The data is optionally converted to cupy or cudf containers according
    to ``input_type`` before fitting. For every class, the SHAP values
    summed over the last axis plus the explainer's expected value must
    match the predicted probability to 4 decimals.
    """
    n_samples = 100
    X, y = make_classification(
        n_samples=n_samples,
        n_features=8,
        n_informative=8,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        random_state=2021,
    )
    X, y = X.astype(np.float32), y.astype(np.float32)

    if input_type == 'cupy':
        X, y = cp.array(X), cp.array(y)
    elif input_type == 'cudf':
        X, y = cudf.DataFrame(X), cudf.Series(y)

    forest = curfc(
        max_features=1.0,
        max_samples=0.1,
        n_bins=128,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=10,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    forest.fit(X, y)
    pred = forest.predict_proba(X)

    explainer = TreeExplainer(model=forest)
    out = explainer.shap_values(X)

    # Bring everything back to host numpy arrays for the comparison.
    if input_type == 'cupy':
        pred = pred.get()
        out = out.get()
        expected_value = explainer.expected_value.get()
    elif input_type == 'cudf':
        pred = pred.to_numpy()
        out = out.get()
        expected_value = explainer.expected_value.get()
    else:
        expected_value = explainer.expected_value

    # SHAP values should add up to predicted score
    baseline = np.tile(expected_value.reshape(-1, 1), (1, n_samples))
    shap_sum = np.sum(out, axis=2) + baseline
    pred = np.transpose(pred, (1, 0))
    np.testing.assert_almost_equal(shap_sum, pred, decimal=4)
def _train(self): iteration = getattr(self, "iteration", 0) # print(self._dataset) if compute == "GPU": # split data X_train, X_test, y_train, y_test = train_test_split( self._dataset.loc[:, self._dataset.columns != self._y_label], self._dataset[self._y_label], train_size=0.8, shuffle=True, random_state=42, ) self.rf_model = curfc( n_estimators=self._model_params["n_estimators"], max_depth=self._model_params["max_depth"], n_bins=self._model_params["n_bins"], max_features=self._model_params["max_features"], ) X_train = X_train.astype('float32') X_test = X_test.astype('float32') # train model with PerfTimer() as train_timer: trained_model = self.rf_model.fit(X_train, y_train.astype("float32")) training_time = train_timer.duration # evaluate perf with PerfTimer() as inference_timer: test_accuracy = r2_score(y_test.astype("float32"), trained_model.predict(X_test)) infer_time = inference_timer.duration # update best model [ assumes maximization of perf metric ] if test_accuracy > self._global_best_test_accuracy: self._global_best_test_accuracy = test_accuracy self._global_best_model = trained_model return { "test_accuracy": test_accuracy, "train_time": round(training_time, 4), "infer_time": round(infer_time, 4), "is_bad": not math.isfinite(test_accuracy), }
def test_rf_nbins_small(small_clf):
    """Expect a warning when fitting on fewer samples than ``n_bins``.

    A default cuML forest is fitted on three training samples; the last
    recorded warning must contain the message about ``n_bins`` being
    reduced to the number of training samples.
    """
    X, y = small_clf
    X, y = X.astype(np.float32), y.astype(np.int32)
    X_train, _, y_train, _ = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    forest = curfc()

    # display warning when nbins less than samples
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        forest.fit(X_train[0:3, :], y_train[0:3])

    expected_message = ("The number of bins, `n_bins` is greater than "
                        "the number of samples used for training. "
                        "Changing `n_bins` to number of training samples.")
    assert expected_message in str(caught[-1].message)
def test_rf_classification(datatype, split_algo, n_info, nrows, ncols,
                           max_depth, rows_sample):
    """Train a cuML forest at a given depth and compare with scikit-learn.

    The combination ``split_algo == 1`` with a negative ``max_depth`` is
    marked as an expected failure. For fewer than 500000 rows and a
    ``max_depth`` above 1, cuML accuracy must be within 0.07 of the
    scikit-learn accuracy.
    """
    use_handle = True
    if split_algo == 1 and max_depth < 0:
        pytest.xfail("Unlimited depth not supported with quantile")

    split_at = np.int32(nrows*0.8)
    X, y = make_classification(
        n_samples=nrows,
        n_features=ncols,
        n_clusters_per_class=1,
        n_informative=n_info,
        random_state=123,
        n_classes=5,
    )

    # Leading rows are used for training, the tail for evaluation.
    X_train = np.asarray(X[:split_at]).astype(datatype)
    y_train = np.asarray(y[:split_at]).astype(np.int32)
    X_test = np.asarray(X[split_at:]).astype(datatype)
    y_test = np.asarray(y[split_at:]).astype(np.int32)

    # Create a handle for the cuml model
    handle, _ = get_handle(use_handle)

    forest = curfc(
        max_features=1.0,
        rows_sample=rows_sample,
        n_bins=8,
        split_algo=split_algo,
        split_criterion=0,
        min_rows_per_node=2,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=max_depth,
    )
    forest.fit(X_train, y_train)
    cu_acc = accuracy_score(y_test, forest.predict(X_test))

    if nrows < 500000:
        # scikit-learn uses None, not a non-positive number, for
        # unlimited depth.
        sk_depth = max_depth if max_depth > 0 else None
        reference = skrfc(
            n_estimators=40,
            max_depth=sk_depth,
            min_samples_split=2,
            max_features=1.0,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        sk_acc = accuracy_score(y_test, reference.predict(X_test))

        # compare the accuracy of the two models
        if max_depth > 1:
            assert cu_acc >= (sk_acc - 0.07)