def test_tsvd_fit_transform(datatype, name, use_handle):
    """Compare cuML TruncatedSVD ``fit_transform`` with scikit-learn's.

    The arguments are presumably supplied by pytest parametrization (the
    decorators are not visible in this chunk).

    datatype: not used in the body.
    name: dataset selector. ``'blobs'`` builds a 500000 x 1000 blob dataset
        and only runs the cuML path; ``'random'`` is skipped; any other
        value uses a small 500 x 5 random-normal matrix.
    use_handle: forwarded to ``get_handle``.
    """
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # The lines below are not reached while the skip above is in place.
        # np.prod replaces np.product, which no longer exists in NumPy 2.0,
        # so the branch still works if the skip is ever lifted.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    # The scikit-learn reference is only computed for the small datasets.
    if name != 'blobs':
        sk_tsvd = skTSVD(n_components=1)
        Xsktsvd = sk_tsvd.fit_transform(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    Xcutsvd = cutsvd.fit_transform(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        # 1e-3 is presumably the comparison tolerance of the project's
        # array_equal helper -- confirm against its definition.
        assert array_equal(Xcutsvd, Xsktsvd, 1e-3, with_sign=True)
def test_dbscan_propagation(datatype, use_handle, out_dtype, n_samples):
    """Check cuML DBSCAN labels and core points against scikit-learn.

    A single widely spread blob (``cluster_std=8.0``) is clustered with
    ``eps=0.5`` and ``min_samples=5`` by both implementations. The core
    sample indices must match, and the labels are compared with the
    project's ``assert_dbscan_equal`` helper.

    The arguments are presumably supplied by pytest parametrization (the
    decorators are not visible in this chunk).
    """
    eps = 0.5

    points, _ = make_blobs(
        n_samples,
        centers=1,
        cluster_std=8.0,
        center_box=(-100.0, 100.0),
        random_state=8,
    )
    points = points.astype(datatype)

    handle, _stream = get_handle(use_handle)

    gpu_model = cuDBSCAN(
        handle=handle,
        eps=eps,
        min_samples=5,
        output_type='numpy',
    )
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    cpu_model = skDBSCAN(eps=eps, min_samples=5)
    cpu_labels = cpu_model.fit_predict(points)

    # Both implementations must agree on which samples are core points.
    assert array_equal(gpu_model.core_sample_indices_,
                       cpu_model.core_sample_indices_)

    # The labels must be consistent with the reference clustering.
    assert_dbscan_equal(
        cpu_labels,
        gpu_labels,
        points,
        gpu_model.core_sample_indices_,
        eps,
    )
def test_rf_regression(special_reg, datatype, max_features, max_samples,
                       n_bins):
    """Check the R2 of a cuML random forest regressor.

    The model is trained on 80% of ``special_reg`` and predicts the rest
    with both ``predict_model="GPU"`` (FIL, per the original comment) and
    ``predict_model="CPU"``. The GPU R2 may be at most 0.02 below the CPU
    R2. For inputs with fewer than 1000 rows it may also be at most 0.07
    below a scikit-learn forest's R2.

    ``special_reg`` unpacks to ``(X, y)``. The arguments are presumably
    pytest fixtures/parametrizations (decorators not visible here).
    """
    use_handle = True

    features, target = special_reg
    features = features.astype(datatype)
    target = target.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle, n_streams=1)

    # cuML random forest regression model
    forest = curfr(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=n_bins,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=50,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        accuracy_metric="mse",
    )
    forest.fit(X_train, y_train)

    # GPU (FIL) predictions first, then CPU ones; the GPU output is reshaped
    # to the CPU output's shape before scoring.
    gpu_preds = forest.predict(X_test, predict_model="GPU")
    cpu_preds = forest.predict(X_test, predict_model="CPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))

    cpu_r2 = r2_score(y_test, cpu_preds, convert_dtype=datatype)
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    # scikit-learn reference, only for small inputs (mode != "stress")
    if features.shape[0] < 1000:
        reference = skrfr(
            n_estimators=50,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_preds = reference.predict(X_test)
        reference_r2 = r2_score(y_test, reference_preds,
                                convert_dtype=datatype)
        assert gpu_r2 >= (reference_r2 - 0.07)

    assert gpu_r2 >= (cpu_r2 - 0.02)
def test_rf_classification(small_clf, datatype, max_samples, max_features):
    """Check the accuracy of a cuML random forest classifier.

    The model is trained on 80% of ``small_clf`` and predicts the rest with
    both ``predict_model="GPU"`` and ``predict_model="CPU"``. The GPU
    accuracy may be at most 0.07 below the CPU accuracy. For inputs with
    fewer than 500000 rows it may also be at most 0.07 below a scikit-learn
    forest's accuracy.

    ``small_clf`` unpacks to ``(X, y)``. The arguments are presumably
    pytest fixtures/parametrizations (decorators not visible here).
    """
    use_handle = True

    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle, n_streams=1)

    # cuML random forest classification model
    forest = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(X_train, y_train)

    gpu_preds = forest.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cpu_preds = forest.predict(X_test, predict_model="CPU")
    gpu_preds = np.reshape(gpu_preds, np.shape(cpu_preds))

    cpu_acc = accuracy_score(y_test, cpu_preds)
    gpu_acc = accuracy_score(y_test, gpu_preds)

    # scikit-learn reference, only below the stress-test size
    if features.shape[0] < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_preds = reference.predict(X_test)
        reference_acc = accuracy_score(y_test, reference_preds)
        assert gpu_acc >= (reference_acc - 0.07)

    # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
    assert gpu_acc >= (cpu_acc - 0.07)
def test_rf_get_text(n_estimators, detailed_text):
    """Check the text dump of a cuML random forest classifier.

    A classifier with ``n_estimators`` trees is trained on a 500 x 10
    synthetic two-class dataset. The output of ``get_detailed_text()``
    (when ``detailed_text`` is truthy) or ``get_summary_text()`` must be
    non-empty and contain exactly ``n_estimators`` lines that start with
    ``"Tree #"`` after stripping.

    The arguments are presumably supplied by pytest parametrization (the
    decorators are not visible in this chunk).
    """
    features, labels = make_classification(
        n_samples=500,
        n_features=10,
        n_clusters_per_class=1,
        n_informative=5,
        random_state=94929,
        n_classes=2,
    )
    features = features.astype(np.float32)
    labels = labels.astype(np.int32)

    # Create a handle for the cuml model
    handle, _stream = get_handle(True, n_streams=1)

    forest = curfc(
        handle=handle,
        max_features=1.0,
        max_samples=1.0,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=23707,
        n_streams=1,
        n_estimators=n_estimators,
        max_leaves=-1,
        max_depth=16,
    )
    forest.fit(features, labels)

    text_output = (
        forest.get_detailed_text()
        if detailed_text
        else forest.get_summary_text()
    )

    # Test 1: Output is non-zero
    assert "" != text_output

    # Test 2: Correct number of trees are printed
    tree_count = sum(
        1
        for line in text_output.split("\n")
        if line.strip().startswith("Tree #")
    )
    assert n_estimators == tree_count
def test_dbscan(datatype, use_handle, nrows, ncols, max_mbytes_per_batch,
                out_dtype):
    """Check cuML DBSCAN against scikit-learn on tight blobs.

    For the 500000-row case on a GPU reporting less than 32 (the unit of
    ``pytest.max_gpu_memory`` is not visible here) the row count is scaled
    down when ``pytest.adapt_stress_test`` is set; otherwise the test is
    skipped. Below 500000 rows the cuML labels and core points are compared
    with a brute-force scikit-learn DBSCAN. The dtype of the cuML labels is
    checked whenever ``out_dtype`` names int32 or int64.

    ``datatype`` is not used in the body. The arguments are presumably
    supplied by pytest parametrization (decorators not visible here).
    """
    needs_adaptation = nrows == 500000 and pytest.max_gpu_memory < 32
    if needs_adaptation:
        if not pytest.adapt_stress_test:
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")
        nrows = nrows * pytest.max_gpu_memory // 32

    points, _ = make_blobs(
        n_samples=nrows,
        cluster_std=0.01,
        n_features=ncols,
        random_state=0,
    )

    handle, _stream = get_handle(use_handle)

    eps = 1
    gpu_model = cuDBSCAN(
        handle=handle,
        eps=eps,
        min_samples=2,
        max_mbytes_per_batch=max_mbytes_per_batch,
        output_type='numpy',
    )
    gpu_labels = gpu_model.fit_predict(points, out_dtype=out_dtype)

    if nrows < 500000:
        cpu_model = skDBSCAN(eps=1, min_samples=2, algorithm="brute")
        cpu_labels = cpu_model.fit_predict(points)

        # Check the core points are equal
        assert array_equal(gpu_model.core_sample_indices_,
                           cpu_model.core_sample_indices_)

        # Check the labels are correct
        assert_dbscan_equal(cpu_labels, gpu_labels, points,
                            gpu_model.core_sample_indices_, eps)

    # The requested output dtype must be honoured for the integer options.
    if out_dtype == "int32" or out_dtype == np.int32:
        assert cu_labels_dtype_is(gpu_labels, np.int32)
    elif out_dtype == "int64" or out_dtype == np.int64:
        assert cu_labels_dtype_is(gpu_labels, np.int64)


def cu_labels_dtype_is(labels, expected):
    """Return whether ``labels.dtype`` equals ``expected``."""
    return labels.dtype == expected
def test_rf_memory_leakage(small_clf, datatype, fil_sparse_format, n_iter):
    """Check that repeated cuML random forest fit/predict leaks no memory.

    After a warm-up fit, element 0 of
    ``cuda.current_context().get_memory_info()`` is recorded as the
    baseline (the original code names it ``free_mem``). A fresh classifier
    is then fitted and used for two GPU predictions, ``n_iter`` times over,
    and after every step the same reading must equal the baseline exactly.

    ``small_clf`` unpacks to ``(X, y)``. The arguments are presumably
    pytest fixtures/parametrizations (decorators not visible here).
    """
    use_handle = True

    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle, n_streams=1)

    # Warmup. Some modules that are used in RF allocate space on the device
    # and consume memory. This is to make sure that the allocation is done
    # before the first call to get_memory_info.
    warmup_model = curfc(handle=handle)
    warmup_model.fit(X_train, y_train)
    handle.sync()  # just to be sure
    baseline_free = cuda.current_context().get_memory_info()[0]

    def _assert_back_at_baseline():
        # Difference between the baseline reading and the current one.
        delta_mem = (
            baseline_free - cuda.current_context().get_memory_info()[0]
        )
        assert delta_mem == 0

    def _fit_then_predict_twice():
        model = curfc(handle=handle)
        model.fit(X_train, y_train)
        handle.sync()  # just to be sure
        # Memory reading after fitting the cuML model
        _assert_back_at_baseline()

        for _ in range(2):
            model.predict(
                X_test,
                predict_model="GPU",
                fil_sparse_format=fil_sparse_format,
            )
            handle.sync()  # just to be sure
            # Memory reading after predicting with the cuML model
            _assert_back_at_baseline()

    for _ in range(n_iter):
        _fit_then_predict_twice()
def test_pca_inverse_transform(datatype, input_type, name, use_handle,
                               nrows):
    """Check that cuML PCA ``inverse_transform`` recovers the input.

    The ``'blobs'`` dataset is skipped. Otherwise an ``nrows`` x 3
    random-normal matrix is built with a near-degenerate middle column and
    a large mean, projected to two components and mapped back. The result
    is compared with the input via the project's ``array_equal`` helper
    (``5e-5`` is presumably its tolerance).

    ``datatype`` and ``input_type`` are not used in the body. The arguments
    are presumably supplied by pytest parametrization (decorators not
    visible here).
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Not reached while the skip above is in place.
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    else:
        n_rows, n_cols = nrows, 3
        generator = np.random.RandomState(0)
        X = generator.randn(n_rows, n_cols)  # spherical data
        X[:, 1] *= .00001  # make middle component relatively small
        X += [3, 4, 2]  # make a large mean

    handle, _stream = get_handle(use_handle)
    pca = cuPCA(n_components=2, handle=handle)

    projected = pca.fit_transform(X)
    recovered = pca.inverse_transform(projected)
    pca.handle.sync()

    assert array_equal(recovered, X, 5e-5, with_sign=True)
def test_pca_fit_then_transform(datatype, input_type, name, use_handle):
    """Compare cuML PCA ``fit`` followed by ``transform`` with scikit-learn.

    datatype, input_type: not used in the body.
    name: dataset selector. ``'blobs'`` builds a large blob dataset and only
        runs the cuML path; ``'iris'`` loads the iris data; any other value
        uses a 500-sample multilabel classification dataset.
    use_handle: forwarded to ``get_handle``.

    For ``'blobs'`` on a GPU reporting less than 32 (the unit of
    ``pytest.max_gpu_memory`` is not visible here) the sample count is
    scaled down when ``pytest.adapt_stress_test`` is set; otherwise the
    test is skipped. The arguments are presumably supplied by pytest
    parametrization (decorators not visible here).
    """
    blobs_n_samples = 500000
    if name == 'blobs' and pytest.max_gpu_memory < 32:
        if pytest.adapt_stress_test:
            blobs_n_samples = int(blobs_n_samples * pytest.max_gpu_memory
                                  / 32)
        else:
            # The first literal now ends with a space so the two sentences
            # do not run together in the skip message (this matches the
            # wording used by test_dbscan).
            pytest.skip("Insufficient GPU memory for this test. "
                        "Re-run with 'CUML_ADAPT_STRESS_TESTS=True'")

    if name == 'blobs':
        X, y = make_blobs(n_samples=blobs_n_samples,
                          n_features=1000, random_state=0)

    elif name == 'iris':
        iris = datasets.load_iris()
        X = iris.data

    else:
        X, Y = make_multilabel_classification(n_samples=500,
                                              n_classes=2,
                                              n_labels=1,
                                              allow_unlabeled=False,
                                              random_state=1)

    # The scikit-learn reference is only computed for the small datasets.
    if name != 'blobs':
        skpca = skPCA(n_components=2)
        skpca.fit(X)
        Xskpca = skpca.transform(X)

    handle, stream = get_handle(use_handle)
    cupca = cuPCA(n_components=2, handle=handle)
    cupca.fit(X)
    X_cupca = cupca.transform(X)
    cupca.handle.sync()

    if name != 'blobs':
        # with_sign=False: presumably compares magnitudes only -- confirm
        # against the project's array_equal helper.
        assert array_equal(X_cupca, Xskpca, 1e-3, with_sign=False)
        assert Xskpca.shape[0] == X_cupca.shape[0]
        assert Xskpca.shape[1] == X_cupca.shape[1]
def test_tsvd_fit(datatype, name, use_handle):
    """Compare fitted cuML TruncatedSVD attributes with scikit-learn's.

    datatype: not used in the body.
    name: dataset selector. ``'blobs'`` builds a 500000 x 1000 blob dataset
        and only runs the cuML path; ``'random'`` is skipped; any other
        value uses a small 500 x 5 random-normal matrix.
    use_handle: forwarded to ``get_handle``.

    For the small dataset, ``singular_values_``, ``components_`` and
    ``explained_variance_ratio_`` are compared with the project's
    ``array_equal`` helper (``0.4`` is presumably its tolerance). The
    arguments are presumably supplied by pytest parametrization (decorators
    not visible here).
    """
    if name == 'blobs':
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)

    elif name == 'random':
        pytest.skip('fails when using random dataset '
                    'used by sklearn for testing')
        # The lines below are not reached while the skip above is in place.
        # np.prod replaces np.product, which no longer exists in NumPy 2.0,
        # so the branch still works if the skip is ever lifted.
        shape = 5000, 100
        rng = check_random_state(42)
        X = rng.randint(-100, 20, np.prod(shape)).reshape(shape)

    else:
        n, p = 500, 5
        rng = np.random.RandomState(0)
        X = rng.randn(n, p) * .1 + np.array([3, 4, 2, 3, 5])

    # The scikit-learn reference is only fitted for the small datasets.
    if name != 'blobs':
        sktsvd = skTSVD(n_components=1)
        sktsvd.fit(X)

    handle, stream = get_handle(use_handle)
    cutsvd = cuTSVD(n_components=1, handle=handle)

    cutsvd.fit(X)
    cutsvd.handle.sync()

    if name != 'blobs':
        for attr in ['singular_values_', 'components_',
                     'explained_variance_ratio_']:
            # components_ is the only attribute compared with
            # with_sign=False.
            with_sign = attr != 'components_'
            assert array_equal(getattr(cutsvd, attr),
                               getattr(sktsvd, attr),
                               0.4, with_sign=with_sign)
def test_pca_fit(datatype, input_type, name, use_handle):
    """Compare fitted cuML PCA attributes with scikit-learn's.

    The ``'blobs'`` dataset is skipped; ``'digits'`` loads the digits data;
    any other name uses a 500-sample multilabel classification dataset.
    Both PCA implementations are fitted with two components, and
    ``singular_values_``, ``components_``, ``explained_variance_`` and
    ``explained_variance_ratio_`` are printed and compared with the
    project's ``array_equal`` helper (``1e-3`` is presumably its
    tolerance).

    ``datatype`` and ``input_type`` are not used in the body. The arguments
    are presumably supplied by pytest parametrization (decorators not
    visible here).
    """
    if name == 'blobs':
        pytest.skip('fails when using blobs dataset')
        # Not reached while the skip above is in place.
        X, y = make_blobs(n_samples=500000, n_features=1000, random_state=0)
    elif name == 'digits':
        X, _ = datasets.load_digits(return_X_y=True)
    else:
        X, Y = make_multilabel_classification(
            n_samples=500,
            n_classes=2,
            n_labels=1,
            allow_unlabeled=False,
            random_state=1,
        )

    reference = skPCA(n_components=2)
    reference.fit(X)

    handle, _stream = get_handle(use_handle)
    candidate = cuPCA(n_components=2, handle=handle)
    candidate.fit(X)
    candidate.handle.sync()

    compared_attributes = (
        'singular_values_',
        'components_',
        'explained_variance_',
        'explained_variance_ratio_',
    )
    for attr in compared_attributes:
        # components_ is the only attribute compared with with_sign=False.
        if attr in ['components_']:
            with_sign = False
        else:
            with_sign = True

        # Diagnostic output, shown by pytest when the assertion fails.
        print(attr)
        print(getattr(candidate, attr))
        print(getattr(reference, attr))

        cuml_res = getattr(candidate, attr)
        skl_res = getattr(reference, attr)
        assert array_equal(cuml_res, skl_res, 1e-3, with_sign=with_sign)
def test_rf_regression_sparse(special_reg, datatype, fil_sparse_format,
                              algo):
    """Check GPU prediction of a cuML forest regressor across FIL options.

    When ``fil_sparse_format`` is falsy or ``"not_supported"``, or ``algo``
    is ``"tree_reorg"`` or ``"batch_tree_reorg"``, the GPU prediction must
    raise ``ValueError`` and nothing else is checked. Otherwise the test:

    * requires the GPU R2 to equal the R2 of the model returned by
      ``convert_to_fil_model()``;
    * requires the treelite model to report the expected number of trees
      and features;
    * for inputs with fewer than 1000 rows, requires the GPU R2 to be at
      most 0.07 below a scikit-learn forest's R2.

    ``special_reg`` unpacks to ``(X, y)``. The arguments are presumably
    pytest fixtures/parametrizations (decorators not visible here).
    """
    use_handle = True
    n_trees = 50

    features, target = special_reg
    features = features.astype(datatype)
    target = target.astype(datatype)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle, n_streams=1)

    # cuML random forest regression model
    forest = curfr(
        n_bins=16,
        split_criterion=2,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=n_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
        accuracy_metric="mse",
    )
    forest.fit(X_train, y_train)

    predict_kwargs = dict(
        predict_model="GPU",
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )

    expect_rejection = (
        not fil_sparse_format
        or algo == "tree_reorg"
        or algo == "batch_tree_reorg"
        or fil_sparse_format == "not_supported"
    )
    if expect_rejection:
        with pytest.raises(ValueError):
            forest.predict(X_test, **predict_kwargs)
        return

    # predict using FIL
    gpu_preds = forest.predict(X_test, **predict_kwargs)
    gpu_preds = np.reshape(gpu_preds, np.shape(y_test))
    gpu_r2 = r2_score(y_test, gpu_preds, convert_dtype=datatype)

    fil_model = forest.convert_to_fil_model()

    with cuml.using_output_type("numpy"):
        fil_model_preds = fil_model.predict(X_test)
        fil_model_preds = np.reshape(fil_model_preds, np.shape(y_test))
        fil_model_r2 = r2_score(y_test, fil_model_preds,
                                convert_dtype=datatype)
        assert gpu_r2 == fil_model_r2

    tl_model = forest.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert features.shape[1] == tl_model.num_features

    # scikit-learn reference, only for small inputs (mode != "stress")
    if features.shape[0] < 1000:
        reference = skrfr(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_preds = reference.predict(X_test)
        reference_r2 = r2_score(y_test, reference_preds,
                                convert_dtype=datatype)
        assert gpu_r2 >= (reference_r2 - 0.07)
def test_rf_classification_sparse(small_clf, datatype, fil_sparse_format,
                                  algo):
    """Check GPU prediction of a cuML forest classifier across FIL options.

    When ``fil_sparse_format`` is falsy or ``"not_supported"``, or ``algo``
    is ``"tree_reorg"`` or ``"batch_tree_reorg"``, the GPU prediction must
    raise ``ValueError`` and nothing else is checked. Otherwise the test:

    * requires the GPU accuracy to match ``cuml_model.score`` (almost
      equal) and to equal the accuracy of the model returned by
      ``convert_to_fil_model()``;
    * requires the treelite model to report the expected number of trees
      and features;
    * for inputs with fewer than 500000 rows, requires the GPU accuracy to
      be at most 0.07 below a scikit-learn forest's accuracy.

    ``small_clf`` unpacks to ``(X, y)``. The arguments are presumably
    pytest fixtures/parametrizations (decorators not visible here).
    """
    use_handle = True
    n_trees = 50

    features, labels = small_clf
    features = features.astype(datatype)
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )

    # Create a handle for the cuml model
    handle, _stream = get_handle(use_handle, n_streams=1)

    # cuML random forest classification model
    forest = curfc(
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=n_trees,
        handle=handle,
        max_leaves=-1,
        max_depth=40,
    )
    forest.fit(X_train, y_train)

    predict_kwargs = dict(
        predict_model="GPU",
        threshold=0.5,
        fil_sparse_format=fil_sparse_format,
        algo=algo,
    )

    expect_rejection = (
        not fil_sparse_format
        or algo == "tree_reorg"
        or algo == "batch_tree_reorg"
        or fil_sparse_format == "not_supported"
    )
    if expect_rejection:
        with pytest.raises(ValueError):
            forest.predict(X_test, **predict_kwargs)
        return

    gpu_preds = forest.predict(X_test, **predict_kwargs)
    gpu_preds = np.reshape(gpu_preds, np.shape(y_test))
    gpu_acc = accuracy_score(y_test, gpu_preds)
    np.testing.assert_almost_equal(gpu_acc, forest.score(X_test, y_test))

    fil_model = forest.convert_to_fil_model()

    with cuml.using_output_type("numpy"):
        fil_model_preds = fil_model.predict(X_test)
        fil_model_acc = accuracy_score(y_test, fil_model_preds)
        assert gpu_acc == fil_model_acc

    tl_model = forest.convert_to_treelite_model()
    assert n_trees == tl_model.num_trees
    assert features.shape[1] == tl_model.num_features

    # scikit-learn reference, only below the stress-test size
    if features.shape[0] < 500000:
        reference = skrfc(
            n_estimators=50,
            max_depth=40,
            min_samples_split=2,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_preds = reference.predict(X_test)
        reference_acc = accuracy_score(y_test, reference_preds)
        assert gpu_acc >= (reference_acc - 0.07)
def rf_classification(datatype, array_type, max_features, max_samples,
                      fixture):
    """Shared body for the cuML random forest classification checks.

    datatype: indexable pair; element 0 is the dtype used for all of ``X``
        before the split, element 1 is the dtype ``X_test`` is cast to.
    array_type: ``"dataframe"`` routes the data through ``cudf`` objects
        and converts the results back with ``to_numpy()``; any other value
        passes the arrays to the model directly.
    max_features, max_samples: forwarded to the cuML classifier
        (``max_features`` also goes to the scikit-learn one).
    fixture: unpacks to ``(X, y)``.

    Checks that the GPU predictions equal the argmax of the GPU
    probabilities and that the CPU and GPU accuracies agree within
    ``pytest.approx(abs=0.01, rel=0.1)``. When ``y`` has fewer than 500000
    elements, both accuracies may be at most 0.07 below scikit-learn's and
    the probabilities are compared via ``check_predict_proba``.
    """
    features, labels = fixture
    features = features.astype(datatype[0])
    labels = labels.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=0
    )
    X_test = X_test.astype(datatype[1])

    n_streams = 1
    handle, _stream = get_handle(True, n_streams=n_streams)

    # cuML random forest classification model
    forest = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
        n_streams=n_streams,
    )

    if array_type == "dataframe":
        train_frame = cudf.DataFrame(X_train)
        train_labels = cudf.Series(y_train)
        test_frame = cudf.DataFrame(X_test)

        forest.fit(train_frame, train_labels)
        gpu_proba = forest.predict_proba(test_frame).to_numpy()
        cpu_preds = forest.predict(
            test_frame, predict_model="CPU"
        ).to_numpy()
        gpu_preds = forest.predict(
            test_frame, predict_model="GPU"
        ).to_numpy()
    else:
        forest.fit(X_train, y_train)
        gpu_proba = forest.predict_proba(X_test)
        cpu_preds = forest.predict(X_test, predict_model="CPU")
        gpu_preds = forest.predict(X_test, predict_model="GPU")

    # The predicted class must be the most probable one.
    np.testing.assert_array_equal(gpu_preds, np.argmax(gpu_proba, axis=1))

    cpu_acc = accuracy_score(y_test, cpu_preds)
    gpu_acc = accuracy_score(y_test, gpu_preds)
    assert cpu_acc == pytest.approx(gpu_acc, abs=0.01, rel=0.1)

    # sklearn random forest classification model
    # initialization, fit and predict
    if labels.size < 500000:
        reference = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        reference.fit(X_train, y_train)
        reference_preds = reference.predict(X_test)
        reference_acc = accuracy_score(y_test, reference_preds)
        reference_proba = reference.predict_proba(X_test)

        assert cpu_acc >= reference_acc - 0.07
        assert gpu_acc >= reference_acc - 0.07

        # 0.06 is the highest relative error observed on CI, within
        # 0.0061 absolute error boundaries seen previously
        check_predict_proba(gpu_proba, reference_proba, y_test, 0.1)