def test_single_input(client, model_type, ignore_empty_partitions): X, y = make_classification(n_samples=1) X = X.astype(np.float32) if model_type == 'classification': y = y.astype(np.int32) else: y = y.astype(np.float32) X, y = _prep_training_data(client, X, y, partitions_per_worker=2) if model_type == 'classification': cu_rf_mg = cuRFC_mg(n_bins=1, ignore_empty_partitions=ignore_empty_partitions) else: cu_rf_mg = cuRFR_mg(n_bins=1, ignore_empty_partitions=ignore_empty_partitions) if ignore_empty_partitions or \ len(client.scheduler_info()['workers'].keys()) == 1: cu_rf_mg.fit(X, y) cuml_mod_predict = cu_rf_mg.predict(X) cuml_mod_predict = cp.asnumpy(cp.array(cuml_mod_predict.compute())) y = cp.asnumpy(cp.array(y.compute())) acc_score = accuracy_score(cuml_mod_predict, y) assert acc_score == 1.0 else: with pytest.raises(ValueError): cu_rf_mg.fit(X, y)
def test_rf_classification_dask_cudf(partitions_per_worker, cluster): # Use CUDA_VISIBLE_DEVICES to control the number of workers c = Client(cluster) try: X, y = make_classification(n_samples=10000, n_features=20, n_clusters_per_class=1, n_informative=10, random_state=123, n_classes=5) X = X.astype(np.float32) y = y.astype(np.int32) X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=1000) cu_rf_params = { 'n_estimators': 40, 'max_depth': 16, 'n_bins': 16, } X_train_df, y_train_df = _prep_training_data(c, X_train, y_train, partitions_per_worker) cuml_mod = cuRFC_mg(**cu_rf_params) cuml_mod.fit(X_train_df, y_train_df) cuml_mod_predict = cuml_mod.predict(X_test) acc_score = accuracy_score(cuml_mod_predict, y_test, normalize=True) assert acc_score > 0.8 finally: c.close()
def test_rf_classification_dask_array(partitions_per_worker, client, output_class): X, y = make_classification(n_samples=10000, n_features=30, n_clusters_per_class=1, n_informative=20, random_state=123, n_classes=2) X = X.astype(np.float32) y = y.astype(np.int32) X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=1000) cu_rf_params = { 'n_estimators': 25, 'max_depth': 13, 'n_bins': 15, } X_train_df, y_train_df = _prep_training_data(client, X_train, y_train, partitions_per_worker) X_test_dask_array = from_array(X_test) cuml_mod = cuRFC_mg(**cu_rf_params) cuml_mod.fit(X_train_df, y_train_df) cuml_mod_predict = cuml_mod.predict(X_test_dask_array, output_class).compute() if not output_class: cuml_mod_predict = np.round(cuml_mod_predict) acc_score = accuracy_score(cuml_mod_predict, y_test, normalize=True) assert acc_score > 0.8
def test_rf_concatenation_dask(client, model_type): from cuml.fil.fil import TreeliteModel X, y = make_classification(n_samples=1000, n_features=30, random_state=123, n_classes=2) X = X.astype(np.float32) if model_type == 'classification': y = y.astype(np.int32) else: y = y.astype(np.float32) n_estimators = 40 cu_rf_params = {'n_estimators': n_estimators} X_df, y_df = _prep_training_data(client, X, y, partitions_per_worker=2) if model_type == 'classification': cu_rf_mg = cuRFC_mg(**cu_rf_params) else: cu_rf_mg = cuRFR_mg(**cu_rf_params) cu_rf_mg.fit(X_df, y_df) res1 = cu_rf_mg.predict(X_df) res1.compute() local_tl = TreeliteModel.from_treelite_model_handle( cu_rf_mg.internal_model._obtain_treelite_handle(), take_handle_ownership=False) assert local_tl.num_trees == n_estimators
def test_rf_broadcast(model_type, fit_broadcast, transform_broadcast, client): # Use CUDA_VISIBLE_DEVICES to control the number of workers workers = list(client.scheduler_info()['workers'].keys()) n_workers = len(workers) if model_type == 'classification': X, y = make_classification(n_samples=n_workers * 1000, n_features=20, n_informative=15, n_classes=4, n_clusters_per_class=1, random_state=123) y = y.astype(np.int32) else: X, y = make_regression(n_samples=n_workers * 1000, n_features=20, n_informative=5, random_state=123) y = y.astype(np.float32) X = X.astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_workers * 100, random_state=123) X_train_df, y_train_df = _prep_training_data(client, X_train, y_train, 1) X_test_dask_array = from_array(X_test) if model_type == 'classification': cuml_mod = cuRFC_mg(n_estimators=10, max_depth=8, n_bins=16, ignore_empty_partitions=True) cuml_mod.fit(X_train_df, y_train_df, broadcast_data=fit_broadcast) cuml_mod_predict = cuml_mod.predict(X_test_dask_array, broadcast_data=transform_broadcast) cuml_mod_predict = cuml_mod_predict.compute() cuml_mod_predict = cp.asnumpy(cuml_mod_predict) acc_score = accuracy_score(cuml_mod_predict, y_test, normalize=True) assert acc_score >= 0.72 else: cuml_mod = cuRFR_mg(n_estimators=10, max_depth=8, n_bins=16, ignore_empty_partitions=True) cuml_mod.fit(X_train_df, y_train_df, broadcast_data=fit_broadcast) cuml_mod_predict = cuml_mod.predict(X_test_dask_array, broadcast_data=transform_broadcast) cuml_mod_predict = cuml_mod_predict.compute() cuml_mod_predict = cp.asnumpy(cuml_mod_predict) acc_score = r2_score(cuml_mod_predict, y_test) assert acc_score >= 0.72 if transform_broadcast: assert cuml_mod.internal_model is None
def test_rf_get_text(client, n_estimators, detailed_text): X, y = make_classification(n_samples=500, n_features=10, n_clusters_per_class=1, n_informative=5, random_state=94929, n_classes=2) X = X.astype(np.float32) y = y.astype(np.int32) X, y = _prep_training_data(client, X, y, partitions_per_worker=2) cu_rf_mg = cuRFC_mg(n_estimators=n_estimators, ignore_empty_partitions=True) cu_rf_mg.fit(X, y) if detailed_text: text_output = cu_rf_mg.get_detailed_text() else: text_output = cu_rf_mg.get_summary_text() # Test 1. Output is non-zero assert '' != text_output # Count the number of trees printed tree_count = 0 for line in text_output.split('\n'): if line.strip().startswith('Tree #'): tree_count += 1 # Test 2. Correct number of trees are printed assert n_estimators == tree_count
def test_rf_get_combined_model_right_aftter_fit(client, estimator_type): max_depth = 3 n_estimators = 5 X, y = make_classification() X = X.astype(np.float32) if estimator_type == 'classification': cu_rf_mg = cuRFC_mg(max_features=1.0, max_samples=1.0, n_bins=16, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.int32) elif estimator_type == 'regression': cu_rf_mg = cuRFR_mg(max_features=1.0, max_samples=1.0, n_bins=16, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.float32) else: assert False X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2) cu_rf_mg.fit(X_dask, y_dask) single_gpu_model = cu_rf_mg.get_combined_model() if estimator_type == 'classification': assert isinstance(single_gpu_model, cuRFC_sg) elif estimator_type == 'regression': assert isinstance(single_gpu_model, cuRFR_sg) else: assert False
def test_rf_instance_count(client, max_depth, n_estimators): n_workers = len(client.scheduler_info()['workers']) if n_estimators < n_workers: err_msg = "n_estimators cannot be lower than number of dask workers" pytest.xfail(err_msg) n_samples_per_worker = 350 X, y = make_classification(n_samples=n_samples_per_worker * n_workers, n_features=20, n_clusters_per_class=1, n_informative=10, random_state=123, n_classes=2) X = X.astype(np.float32) cu_rf_mg = cuRFC_mg(max_features=1.0, max_samples=1.0, n_bins=16, split_algo=1, split_criterion=0, min_samples_leaf=2, seed=23707, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.int32) X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2) cu_rf_mg.fit(X_dask, y_dask) json_out = cu_rf_mg.get_json() json_obj = json.loads(json_out) # The instance count of each node must be equal to the sum of # the instance counts of its children def check_instance_count_for_non_leaf(tree): assert 'instance_count' in tree if 'children' not in tree: return assert 'instance_count' in tree['children'][0] assert 'instance_count' in tree['children'][1] assert ( tree['instance_count'] == tree['children'][0]['instance_count'] + tree['children'][1]['instance_count']) check_instance_count_for_non_leaf(tree['children'][0]) check_instance_count_for_non_leaf(tree['children'][1]) for tree in json_obj: check_instance_count_for_non_leaf(tree) # The root's count should be equal to the number of rows in the data assert tree['instance_count'] == n_samples_per_worker
def test_rf_classification_dask_fil_predict_proba(partitions_per_worker, cluster): c = Client(cluster) try: X, y = make_classification(n_samples=1000, n_features=30, n_clusters_per_class=1, n_informative=20, random_state=123, n_classes=2) X = X.astype(np.float32) y = y.astype(np.int32) X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=100, random_state=123) cu_rf_params = { 'n_bins': 16, 'n_streams': 1, 'n_estimators': 40, 'max_depth': 16 } X_train_df, y_train_df = _prep_training_data(c, X_train, y_train, partitions_per_worker) X_test_df, _ = _prep_training_data(c, X_test, y_test, partitions_per_worker) cu_rf_mg = cuRFC_mg(**cu_rf_params) cu_rf_mg.fit(X_train_df, y_train_df) fil_preds_proba = cu_rf_mg.predict_proba(X_test_df).compute() fil_preds_proba = cp.asnumpy(fil_preds_proba.to_gpu_matrix()) y_proba = np.zeros(np.shape(fil_preds_proba)) y_proba[:, 1] = y_test y_proba[:, 0] = 1.0 - y_test fil_mse = mean_squared_error(y_proba, fil_preds_proba) sk_model = skrfc(n_estimators=40, max_depth=16, random_state=10) sk_model.fit(X_train, y_train) sk_preds_proba = sk_model.predict_proba(X_test) sk_mse = mean_squared_error(y_proba, sk_preds_proba) # The threshold is required as the test would intermitently # fail with a max difference of 0.022 between the two mse values assert fil_mse <= sk_mse + 0.022 finally: c.close()
def test_rf_classification_dask_fil_predict_proba(partitions_per_worker, client): n_workers = len(client.scheduler_info()['workers']) X, y = make_classification(n_samples=n_workers * 1500, n_features=30, n_clusters_per_class=1, n_informative=20, random_state=123, n_classes=2) X = X.astype(np.float32) y = y.astype(np.int32) X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=n_workers * 150, random_state=123) cu_rf_params = { 'n_bins': 16, 'n_streams': 1, 'n_estimators': 40, 'max_depth': 16 } X_train_df, y_train_df = _prep_training_data(client, X_train, y_train, partitions_per_worker) X_test_df, _ = _prep_training_data(client, X_test, y_test, partitions_per_worker) cu_rf_mg = cuRFC_mg(**cu_rf_params) cu_rf_mg.fit(X_train_df, y_train_df) fil_preds = cu_rf_mg.predict(X_test_df).compute() fil_preds = fil_preds.to_numpy() fil_preds_proba = cu_rf_mg.predict_proba(X_test_df).compute() fil_preds_proba = fil_preds_proba.to_numpy() np.testing.assert_equal(fil_preds, np.argmax(fil_preds_proba, axis=1)) y_proba = np.zeros(np.shape(fil_preds_proba)) y_proba[:, 1] = y_test y_proba[:, 0] = 1.0 - y_test fil_mse = mean_squared_error(y_proba, fil_preds_proba) sk_model = skrfc(n_estimators=40, max_depth=16, random_state=10) sk_model.fit(X_train, y_train) sk_preds_proba = sk_model.predict_proba(X_test) sk_mse = mean_squared_error(y_proba, sk_preds_proba) # The threshold is required as the test would intermitently # fail with a max difference of 0.029 between the two mse values assert fil_mse <= sk_mse + 0.029
def test_rf_classification_multi_class(partitions_per_worker, cluster): # Use CUDA_VISIBLE_DEVICES to control the number of workers c = Client(cluster) n_workers = len(c.scheduler_info()['workers']) try: X, y = make_classification(n_samples=n_workers * 5000, n_features=20, n_clusters_per_class=1, n_informative=10, random_state=123, n_classes=15) X = X.astype(np.float32) y = y.astype(np.int32) X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=n_workers * 300, random_state=123) cu_rf_params = { 'n_estimators': 25, 'max_depth': 16, 'n_bins': 256, 'random_state': 10, } X_train_df, y_train_df = _prep_training_data(c, X_train, y_train, partitions_per_worker) cuml_mod = cuRFC_mg(**cu_rf_params, ignore_empty_partitions=True) cuml_mod.fit(X_train_df, y_train_df) X_test_dask_array = from_array(X_test) cuml_preds_gpu = cuml_mod.predict(X_test_dask_array, predict_model="GPU").compute() acc_score_gpu = accuracy_score(cuml_preds_gpu, y_test) # the sklearn model when ran with the same parameters gives an # accuracy of 0.69. There is a difference of 0.0632 (6.32%) between # the two when the code runs on a single GPU (seen in the CI) # Refer to issue : https://github.com/rapidsai/cuml/issues/2806 for # more information on the threshold value. assert acc_score_gpu >= 0.55 finally: c.close()
def test_rf_classification(n_workers, partitions_per_worker): if dask_cuda.utils.get_n_gpus() < n_workers: pytest.skip("too few GPUs") cluster = LocalCUDACluster(threads_per_worker=1, n_workers=n_workers) c = Client(cluster) X, y = make_classification(n_samples=10000, n_features=20, n_clusters_per_class=1, n_informative=10, random_state=123, n_classes=5) y = y.astype(np.int32) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1000) cu_rf_params = { 'n_estimators': 25, 'max_depth': 13, 'n_bins': 15, } workers = c.has_what().keys() n_partitions = partitions_per_worker * len(workers) X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(X_train)) X_train_df = dask_cudf.from_cudf(X_cudf, npartitions=n_partitions) y_cudf = np.array(pd.DataFrame(y_train).values) y_cudf = y_cudf[:, 0] y_cudf = cudf.Series(y_cudf) y_train_df = \ dask_cudf.from_cudf(y_cudf, npartitions=n_partitions) X_train_df, y_train_df = dask_utils.persist_across_workers( c, [X_train_df, y_train_df], workers=workers) cu_rf_mg = cuRFC_mg(**cu_rf_params) cu_rf_mg.fit(X_train_df, y_train_df) cu_rf_mg_predict = cu_rf_mg.predict(X_test) acc_score = accuracy_score(cu_rf_mg_predict, y_test, normalize=True) assert acc_score > 0.8 c.close() cluster.close()
def test_rf_classification_dask_fil(partitions_per_worker, cluster, output_class): # Use CUDA_VISIBLE_DEVICES to control the number of workers c = Client(cluster) try: X, y = make_classification(n_samples=10000, n_features=30, n_clusters_per_class=1, n_informative=20, random_state=123, n_classes=2) X = X.astype(np.float32) y = y.astype(np.int32) X_train, X_test, y_train, y_test = \ train_test_split(X, y, test_size=1000) cu_rf_params = { 'n_estimators': 25, 'max_depth': 13, 'n_bins': 15, } X_train_df, y_train_df = _prep_training_data(c, X_train, y_train, partitions_per_worker) X_test_df, _ = _prep_training_data(c, X_test, y_test, partitions_per_worker) cu_rf_mg = cuRFC_mg(**cu_rf_params) cu_rf_mg.fit(X_train_df, y_train_df) cu_rf_mg_predict = cu_rf_mg.predict(X_test_df, output_class).compute() cu_rf_mg_predict = cp.asnumpy(cp.array(cu_rf_mg_predict)) if not output_class: cu_rf_mg_predict = np.round(cu_rf_mg_predict) acc_score = accuracy_score(cu_rf_mg_predict, y_test, normalize=True) assert acc_score > 0.8 finally: c.close()
def test_rf_get_combined_model_right_aftter_fit(client, estimator_type): max_depth = 3 n_estimators = 5 n_workers = len(client.scheduler_info()['workers']) if n_estimators < n_workers: err_msg = "n_estimators cannot be lower than number of dask workers" pytest.xfail(err_msg) X, y = make_classification() X = X.astype(np.float32) if estimator_type == 'classification': cu_rf_mg = cuRFC_mg(max_features=1.0, max_samples=1.0, n_bins=16, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.int32) elif estimator_type == 'regression': cu_rf_mg = cuRFR_mg(max_features=1.0, max_samples=1.0, n_bins=16, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.float32) else: assert False X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2) cu_rf_mg.fit(X_dask, y_dask) single_gpu_model = cu_rf_mg.get_combined_model() if estimator_type == 'classification': assert isinstance(single_gpu_model, cuRFC_sg) elif estimator_type == 'regression': assert isinstance(single_gpu_model, cuRFR_sg) else: assert False
def test_rf_get_json(client, estimator_type, max_depth, n_estimators): n_workers = len(client.scheduler_info()['workers']) if n_estimators < n_workers: err_msg = "n_estimators cannot be lower than number of dask workers" pytest.xfail(err_msg) X, y = make_classification(n_samples=350, n_features=20, n_clusters_per_class=1, n_informative=10, random_state=123, n_classes=2) X = X.astype(np.float32) if estimator_type == 'classification': cu_rf_mg = cuRFC_mg(max_features=1.0, max_samples=1.0, n_bins=16, split_algo=0, split_criterion=0, min_samples_leaf=2, seed=23707, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.int32) elif estimator_type == 'regression': cu_rf_mg = cuRFR_mg(max_features=1.0, max_samples=1.0, n_bins=16, split_algo=0, min_samples_leaf=2, seed=23707, n_streams=1, n_estimators=n_estimators, max_leaves=-1, max_depth=max_depth) y = y.astype(np.float32) else: assert False X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2) cu_rf_mg.fit(X_dask, y_dask) json_out = cu_rf_mg.get_json() json_obj = json.loads(json_out) # Test 1: Output is non-zero assert '' != json_out # Test 2: JSON object contains correct number of trees assert isinstance(json_obj, list) assert len(json_obj) == n_estimators # Test 3: Traverse JSON trees and get the same predictions as cuML RF def predict_with_json_tree(tree, x): if 'children' not in tree: assert 'leaf_value' in tree return tree['leaf_value'] assert 'split_feature' in tree assert 'split_threshold' in tree assert 'yes' in tree assert 'no' in tree if x[tree['split_feature']] <= tree['split_threshold']: return predict_with_json_tree(tree['children'][0], x) return predict_with_json_tree(tree['children'][1], x) def predict_with_json_rf_classifier(rf, x): # Returns the class with the highest vote. If there is a tie, return # the list of all classes with the highest vote. vote = [] for tree in rf: vote.append(predict_with_json_tree(tree, x)) vote = np.bincount(vote) max_vote = np.max(vote) majority_vote = np.nonzero(np.equal(vote, max_vote))[0] return majority_vote def predict_with_json_rf_regressor(rf, x): pred = 0. for tree in rf: pred += predict_with_json_tree(tree, x) return pred / len(rf) if estimator_type == 'classification': expected_pred = cu_rf_mg.predict(X_dask).astype(np.int32) expected_pred = expected_pred.compute().to_array() for idx, row in enumerate(X): majority_vote = predict_with_json_rf_classifier(json_obj, row) assert expected_pred[idx] in majority_vote elif estimator_type == 'regression': expected_pred = cu_rf_mg.predict(X_dask).astype(np.float32) expected_pred = expected_pred.compute().to_array() pred = [] for idx, row in enumerate(X): pred.append(predict_with_json_rf_regressor(json_obj, row)) pred = np.array(pred, dtype=np.float32) np.testing.assert_almost_equal(pred, expected_pred, decimal=6)