def test_gaussian_parameters(priors, var_smoothing, nlp_20news):
    """GaussianNB with explicit priors / var_smoothing must match sklearn.

    Fits both the cuml and the scikit-learn estimator on a dense slice of
    the 20newsgroups data and checks ``epsilon_`` (to rtol) and that the
    predicted labels agree exactly.
    """
    n_samples = 150
    X_raw, y_raw = nlp_20news
    X = sparse_scipy_to_cp(X_raw[:n_samples], cp.float32).todense()
    y = y_raw.astype(cp.int32)[:n_samples]

    # Translate the parametrized prior name into a concrete device array.
    if priors == 'balanced':
        priors = cp.array([1 / 20] * 20)
    elif priors == 'unbalanced':
        priors = cp.linspace(0.01, 0.09, 20)

    model = GaussianNB(priors=priors, var_smoothing=var_smoothing)
    sk_priors = None if priors is None else priors.get()
    model_sk = skGNB(priors=sk_priors, var_smoothing=var_smoothing)

    model.fit(X, y)
    model_sk.fit(X.get(), y.get())

    y_hat = cp.asnumpy(model.predict(X))
    y_hat_sk = model_sk.predict(X.get())
    y = cp.asnumpy(y)

    assert_allclose(model.epsilon_.get(), model_sk.epsilon_, rtol=1e-4)
    assert_array_equal(y_hat, y_hat_sk)
def test_gaussian_fit_predict(x_dtype, y_dtype, is_sparse, nlp_20news):
    """Fit/predict smoke test on 20newsgroups for sparse and dense inputs.

    The sparse path trains on a cupy CSR matrix directly; the dense path
    trains on a contiguous host (numpy) copy. Either way the model must
    essentially memorize the training slice (accuracy >= 0.99).
    """
    X, y = nlp_20news
    n_rows, n_cols = 500, int(2e5)

    model = GaussianNB()
    X = sparse_scipy_to_cp(X, x_dtype).tocsr()[:n_rows, :n_cols]

    if is_sparse:
        y = y.astype(y_dtype)[:n_rows]
        model.fit(X, y)
    else:
        X = X.todense()
        y = y[:n_rows].astype(y_dtype)
        # Train on a contiguous host-side copy to exercise numpy input.
        host_X = np.ascontiguousarray(cp.asnumpy(X).astype(x_dtype))
        model.fit(host_X, y)

    y_hat = cp.asnumpy(model.predict(X))
    y = cp.asnumpy(y)
    assert accuracy_score(y, y_hat) >= 0.99
def test_gaussian_basic():
    """Six separable points in 3-D: cuml must reproduce sklearn's fit.

    Checks the learned per-class means/variances (theta_, sigma_) and the
    predict / predict_proba / predict_log_proba outputs against skGNB.
    """
    X = cp.array([[-2, -1, -1], [-1, -1, -1], [-1, -2, -1],
                  [1, 1, 1], [1, 2, 1], [2, 1, 1]], dtype=cp.float32)
    y = cp.array([1, 1, 1, 2, 2, 2])

    reference = skGNB()
    reference.fit(X.get(), y.get())

    clf = GaussianNB()
    clf.fit(X, y)

    assert_array_almost_equal(clf.theta_.get(), reference.theta_, 6)
    assert_array_almost_equal(clf.sigma_.get(), reference.sigma_, 6)

    # Perfectly separable data => predictions recover the labels exactly.
    assert_array_equal(clf.predict(X).get(), y.get())
    assert_array_almost_equal(clf.predict_proba(X).get(),
                              reference.predict_proba(X.get()), 8)
    assert_allclose(clf.predict_log_proba(X).get(),
                    reference.predict_log_proba(X.get()),
                    atol=1e-2, rtol=1e-2)
def test_gaussian_partial_fit(nlp_20news):
    """Incremental training via partial_fit must match full-fit accuracy.

    Streams the training slice in fixed-size chunks, then verifies the
    resulting model memorizes the data (accuracy >= 0.99). Also checks
    the two ValueError contracts of partial_fit: labels outside
    ``classes``, and a first call without ``classes``.
    """
    chunk_size = 250
    n_rows = 1500
    x_dtype, y_dtype = cp.float32, cp.int32

    X, y = nlp_20news
    X = sparse_scipy_to_cp(X, x_dtype).tocsr()[:n_rows]
    y = y.astype(y_dtype)[:n_rows]

    model = GaussianNB()
    classes = np.unique(y)

    # Stream fixed-size chunks; min() clips the final chunk, so no
    # special-case end-of-data bookkeeping is needed.
    for start in range(0, X.shape[0], chunk_size):
        end = min(start + chunk_size, X.shape[0])
        model.partial_fit(X[start:end], y[start:end], classes=classes)

    y_hat = cp.asnumpy(model.predict(X))
    y = cp.asnumpy(y)
    assert accuracy_score(y, y_hat) >= 0.99

    # A label in y that is missing from `classes` must raise.
    assert_raises(ValueError, GaussianNB().partial_fit, X, y,
                  classes=cp.array([0, 1]))
    # `classes` is mandatory on the first call to partial_fit.
    assert_raises(ValueError, GaussianNB().partial_fit, X, y)