def test_logistic_regression(strategy, use_wrapper, nrows, num_classes,
                             column_info, dtype=np.float32):
    """Fit a multiclass meta-classifier around cuLog and check its score.

    With ``use_wrapper`` the generic ``MulticlassClassifier`` is built with
    the requested ``strategy``; otherwise the dedicated one-vs-one or
    one-vs-rest class is instantiated directly. The fitted classifier must
    score above 0.7 on the held-out split.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype,
        nrows=nrows,
        ncols=ncols,
        n_info=n_info,
        num_classes=num_classes,
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    base_estimator = cuLog()
    if use_wrapper:
        classifier = cu_multiclass.MulticlassClassifier(
            base_estimator, strategy=strategy)
    elif strategy == 'ovo':
        classifier = cu_multiclass.OneVsOneClassifier(base_estimator)
    else:
        classifier = cu_multiclass.OneVsRestClassifier(base_estimator)

    classifier.fit(X_train, y_train)
    assert classifier.score(X_test, y_test) > 0.7
def test_logistic_regression_decision_function(
    dtype, nrows, column_info, num_classes, fit_intercept
):
    """Compare cuLog.decision_function against skLog on identical weights.

    cuLog is fitted on synthetic data; its coefficients (and intercept, when
    one is fitted) are then copied onto an unfitted skLog instance so that
    both estimators evaluate the same linear model on ``X_test``.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype,
        nrows=nrows,
        ncols=ncols,
        n_info=n_info,
        num_classes=num_classes,
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    # Populate the reference estimator by hand instead of fitting it.
    sklog = skLog(fit_intercept=fit_intercept)
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # Set the attribute on the instance. Assigning to the skLog class
        # would leak a class-level ``intercept_`` into every other skLog
        # object created later in the test session.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_dec_func = culog.decision_function(X_test)
    if num_classes > 2:
        # The multiclass cuLog output is transposed before comparison.
        cu_dec_func = cu_dec_func.T
    sk_dec_func = sklog.decision_function(X_test)

    assert array_equal(cu_dec_func, sk_dec_func)
def test_logistic_regression_predict_proba(dtype, nrows, column_info,
                                           num_classes, fit_intercept):
    """Compare cuLog probability outputs against skLog on identical weights.

    cuLog is fitted on synthetic data; its coefficients (and intercept, when
    one is fitted) are copied to host and set on an unfitted skLog instance.
    Both ``predict_proba`` and ``predict_log_proba`` must then agree.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype,
        nrows=nrows,
        ncols=ncols,
        n_info=n_info,
        num_classes=num_classes,
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept)
    culog.fit(X_train, y_train)

    if num_classes > 2:
        sklog = skLog(fit_intercept=fit_intercept, solver="lbfgs",
                      multi_class="multinomial")
    else:
        sklog = skLog(fit_intercept=fit_intercept)

    # Populate the reference estimator by hand instead of fitting it.
    sklog.coef_ = culog.coef_.copy_to_host().T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_.copy_to_host()
    else:
        # Set the attribute on the instance. Assigning to the skLog class
        # would leak a class-level ``intercept_`` into every other skLog
        # object created later in the test session.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test).get()
    sk_proba = sklog.predict_proba(X_test)

    cu_log_proba = culog.predict_log_proba(X_test).get()
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
def test_logistic_predict_convert_dtype(train_dtype, test_dtype):
    """Smoke test: predict on data cast to a dtype other than the training one.

    The test passes as long as fitting on ``train_dtype`` features and
    predicting on ``test_dtype`` features raises no exception.
    """
    features, labels = make_classification(
        n_samples=50, n_features=10, random_state=0)
    features = features.astype(train_dtype)

    X_train, X_test, y_train, _ = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    model = cuLog()
    model.fit(X_train, y_train)
    model.predict(X_test.astype(test_dtype))
def test_logistic_regression_attributes():
    """Check that a fitted cuLog exposes the expected set of attributes."""
    X, y = make_blobs()
    fitted = cuLog().fit(X, y, convert_dtype=True)

    expected_attributes = [
        "dtype",
        "solver_model",
        "coef_",
        "intercept_",
        "l1_ratio",
        "n_cols",
        "C",
        "penalty",
        "fit_intercept",
        "solver",
    ]
    # Checked one by one so the first missing attribute fails the test.
    for name in expected_attributes:
        assert hasattr(fitted, name)
def test_logistic_regression(num_classes, dtype, penalty, l1_ratio,
                             fit_intercept, nrows, column_info, C, tol):
    """Check that cuLog scores within 0.06 of skLog on synthetic data.

    Both estimators are fitted on the same split produced by
    ``make_classification_dataset`` and compared on the held-out score.
    The 'l1' and 'elasticnet' penalties are xfailed up front, so the
    saga branches below are only reached once that xfail is removed.
    """
    if penalty in ['l1', 'elasticnet']:
        pytest.xfail("OWL numerical stability is being improved")

    ncols, n_info = column_info

    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = (LooseVersion(str(sklearn.__version__))
                >= LooseVersion("0.21.0"))
    if not sk_check and penalty == 'elasticnet':
        # The adjacent literals previously joined as "withelastic net.",
        # and the message said "> 0.21" although the check is ">=".
        pytest.skip("Need sklearn >= 0.21 for testing logistic with "
                    "elastic net.")

    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype,
        nrows=nrows,
        ncols=ncols,
        n_info=n_info,
        num_classes=num_classes,
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(penalty=penalty, l1_ratio=l1_ratio, C=C,
                  fit_intercept=fit_intercept, tol=tol, verbose=0)
    culog.fit(X_train, y_train)

    # Only solver=saga supports elasticnet in scikit
    if penalty in ['elasticnet', 'l1']:
        if sk_check:
            sklog = skLog(penalty=penalty, l1_ratio=l1_ratio, solver='saga',
                          C=C, fit_intercept=fit_intercept,
                          multi_class='auto')
        else:
            # l1_ratio is not passed on the older sklearn path.
            sklog = skLog(penalty=penalty, solver='saga', C=C,
                          fit_intercept=fit_intercept, multi_class='auto')
    else:
        sklog = skLog(penalty=penalty, solver='lbfgs', C=C,
                      fit_intercept=fit_intercept, multi_class='auto')
    sklog.fit(X_train, y_train)

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    assert culog.score(X_test, y_test) >= sklog.score(X_test, y_test) - 0.06
def test_logistic_regression_model_default(dtype):
    """Default-constructed cuLog must score within 0.022 of skLog.

    Both models are fitted on the split returned by
    ``small_classification_dataset`` and compared on the held-out part.
    """
    X_train, X_test, y_train, y_test = small_classification_dataset(dtype)
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    cu_model = cuLog()
    cu_model.fit(X_train, y_train)

    sk_model = skLog(multi_class="auto")
    sk_model.fit(X_train, y_train)

    cu_score = cu_model.score(X_test, y_test)
    sk_score = sk_model.score(X_test, y_test)
    assert cu_score >= sk_score - 0.022
def test_logistic_regression_input_type_consistency(constructor, dtype):
    """Predictions must come back in the same container type as the input.

    For inputs built with ``cudf.DataFrame`` the outputs are only required
    to be instances of cudf's ``Frame``, not ``DataFrame`` itself.
    """
    from cudf.core.frame import Frame

    X = constructor([[5, 10], [3, 1], [7, 8]]).astype(dtype)
    y = constructor([0, 1, 1]).astype(dtype)
    model = cuLog().fit(X, y, convert_dtype=True)

    expected_type = Frame if constructor == cudf.DataFrame else type(X)

    assert isinstance(model.predict_proba(X), expected_type)
    assert isinstance(model.predict(X), expected_type)
def test_logistic_regression_weighting(regression_dataset, option,
                                       test_status):
    """Compare cuLog and skLog under sample/class weighting options.

    ``option`` selects random per-sample weights, random per-class weights
    for classes 0 and 1, or the 'balanced' class weighting; any other value
    leaves both weightings unset. Row-normalised coefficients and the
    predictions on the training data must agree within tolerances that
    depend on the regression type.
    """
    regression_type, data, coef, output = regression_dataset[test_status]

    sample_weight = None
    class_weight = None
    if option == 'sample_weight':
        sample_weight = np.abs(np.random.rand(data.shape[0]))
    elif option == 'class_weight':
        weight_0, weight_1 = np.random.rand(2)
        class_weight = {0: weight_0, 1: weight_1}
    elif option == 'balanced':
        class_weight = 'balanced'

    culog = cuLog(fit_intercept=False, class_weight=class_weight)
    culog.fit(data, output, sample_weight=sample_weight)

    sklog = skLog(fit_intercept=False, class_weight=class_weight)
    sklog.fit(data, output, sample_weight=sample_weight)

    skcoef = np.squeeze(sklog.coef_)
    cucoef = np.squeeze(culog.coef_)

    # NOTE(review): the divisions below are in place; np.squeeze can return
    # a view, so they may write through to the fitted coef_ arrays used by
    # the predict() calls further down. Confirm this is intended.
    if regression_type == 'binary':
        unit_tol, total_tol = 0.04, 0.08
        skcoef /= np.linalg.norm(skcoef)
        cucoef /= np.linalg.norm(cucoef)
    elif regression_type.startswith('multiclass'):
        unit_tol, total_tol = 0.2, 0.3
        skcoef = skcoef.T
        skcoef /= np.linalg.norm(skcoef, axis=1)[:, np.newaxis]
        cucoef /= np.linalg.norm(cucoef, axis=1)[:, np.newaxis]

    equality = array_equal(skcoef, cucoef,
                           unit_tol=unit_tol, total_tol=total_tol)
    if not equality:
        # Dump the reference and cuLog coefficients to help debugging.
        print('\ncoef.shape: ', coef.shape)
        print('coef:\n', coef)
        print('cucoef.shape: ', cucoef.shape)
        print('cucoef:\n', cucoef)
    assert equality

    cuOut = culog.predict(data)
    skOut = sklog.predict(data)
    assert array_equal(skOut, cuOut,
                       unit_tol=unit_tol, total_tol=total_tol)
def test_logistic_regression_unscaled(dtype, penalty, l1_ratio):
    """Fit cuLog on the unscaled breast cancer dataset.

    We do not scale the dataset, which could lead to numerical problems
    (fixed in PR #2543). Both the train and the test score must reach 0.94.
    """
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X.astype(dtype), y.astype(dtype), random_state=42)

    culog = cuLog(
        penalty=penalty,
        C=1,
        tol=1e-4,
        fit_intercept=True,
        max_iter=5000,
        l1_ratio=l1_ratio,
    )
    culog.fit(X_train, y_train)

    min_train_score = 0.94
    min_test_score = 0.94

    assert culog.score(X_train, y_train) >= min_train_score
    assert culog.score(X_test, y_test) >= min_test_score
def test_logistic_regression_sparse_only(dtype, nlp_20news):
    """Fit cuLog on sparse input from the ``nlp_20news`` fixture.

    The held-out score may be at most 0.01 below the recorded sklearn
    score.
    """
    # sklearn score with max_iter = 10000
    sklearn_score = 0.878
    acceptable_score = sklearn_score - 0.01

    features, labels = nlp_20news
    features = csr_matrix(features.astype(dtype))
    labels = labels.get().astype(dtype)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=42)

    culog = cuLog()
    culog.fit(X_train, y_train)

    assert culog.score(X_test, y_test) >= acceptable_score
def test_logistic_regression_predict_proba(
    dtype, nrows, column_info, num_classes, fit_intercept, sparse_input
):
    """Compare cuLog probability outputs against skLog on identical weights.

    cuLog is fitted on synthetic data (converted to CSR when
    ``sparse_input`` is set); its coefficients (and intercept, when one is
    fitted) are set on an unfitted skLog instance. Both ``predict_proba``
    and ``predict_log_proba`` must then agree.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype,
        nrows=nrows,
        ncols=ncols,
        n_info=n_info,
        num_classes=num_classes,
    )
    X_train = csr_matrix(X_train) if sparse_input else X_train
    X_test = csr_matrix(X_test) if sparse_input else X_test
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    if num_classes > 2:
        sklog = skLog(
            fit_intercept=fit_intercept,
            solver="lbfgs",
            multi_class="multinomial",
        )
    else:
        sklog = skLog(fit_intercept=fit_intercept)

    # Populate the reference estimator by hand instead of fitting it.
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # Set the attribute on the instance. Assigning to the skLog class
        # would leak a class-level ``intercept_`` into every other skLog
        # object created later in the test session.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test)
    sk_proba = sklog.predict_proba(X_test)

    cu_log_proba = culog.predict_log_proba(X_test)
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
def test_logistic_regression_model_digits(
        dtype, order, sparse_input, fit_intercept, penalty):
    """Fit cuLog on the digits dataset and require a score of at least 0.95.

    The features are cast to ``dtype`` with the requested memory ``order``
    and optionally converted to CSR before the train/test split.
    """
    # smallest sklearn score with max_iter = 10000
    # put it as a constant here, because sklearn 0.23.1 needs a lot of iters
    # to converge and has a bug returning an unrelated error if not converged.
    acceptable_score = 0.95

    digits = load_digits()

    # Apply the layout during the cast. The previous
    # ``X_dense.reshape(X_dense.shape, order=order)`` discarded its result,
    # so the ``order`` parameter had no effect on the data.
    # NOTE(review): train_test_split may not preserve this layout in the
    # arrays it returns - confirm if the layout must reach cuLog.fit.
    X_dense = digits.data.astype(dtype, order=order)
    X = csr_matrix(X_dense) if sparse_input else X_dense
    y = digits.target.astype(dtype)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)

    culog = cuLog(fit_intercept=fit_intercept, penalty=penalty)
    culog.fit(X_train, y_train)
    score = culog.score(X_test, y_test)

    assert score >= acceptable_score
def test_logistic_regression(
    num_classes, dtype, penalty, l1_ratio, fit_intercept, nrows,
    column_info, C, tol
):
    """Check cuLog train and test scores against skLog on synthetic data.

    Both estimators are fitted on the same split. cuLog may trail skLog by
    at most ``tol_train`` / ``tol_test`` (ten times looser for 10 classes
    with an 'l1' or 'elasticnet' penalty), and its predictions must contain
    as many distinct labels as ``y_test``.
    """
    ncols, n_info = column_info

    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = (LooseVersion(str(sklearn.__version__))
                >= LooseVersion("0.21.0"))
    if not sk_check and penalty == "elasticnet":
        # The adjacent literals previously joined as "withelastic net.",
        # and the message said "> 0.21" although the check is ">=".
        pytest.skip(
            "Need sklearn >= 0.21 for testing logistic with elastic net."
        )

    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype,
        nrows=nrows,
        ncols=ncols,
        n_info=n_info,
        num_classes=num_classes,
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(
        penalty=penalty,
        l1_ratio=l1_ratio,
        C=C,
        fit_intercept=fit_intercept,
        tol=tol,
    )
    culog.fit(X_train, y_train)

    # Only solver=saga supports elasticnet in scikit
    if penalty in ["elasticnet", "l1"]:
        if sk_check:
            sklog = skLog(
                penalty=penalty,
                l1_ratio=l1_ratio,
                solver="saga",
                C=C,
                fit_intercept=fit_intercept,
                multi_class="auto",
            )
        else:
            # l1_ratio is not passed on the older sklearn path.
            sklog = skLog(
                penalty=penalty,
                solver="saga",
                C=C,
                fit_intercept=fit_intercept,
                multi_class="auto",
            )
    else:
        sklog = skLog(
            penalty=penalty,
            solver="lbfgs",
            C=C,
            fit_intercept=fit_intercept,
            multi_class="auto",
        )
    sklog.fit(X_train, y_train)

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    cu_preds = culog.predict(X_test)
    tol_test = 0.012
    tol_train = 0.006
    if num_classes == 10 and penalty in ["elasticnet", "l1"]:
        tol_test *= 10
        tol_train *= 10

    assert culog.score(X_train, y_train) >= \
        sklog.score(X_train, y_train) - tol_train
    assert culog.score(X_test, y_test) >= \
        sklog.score(X_test, y_test) - tol_test
    assert len(np.unique(cu_preds)) == len(np.unique(y_test))
def test_logistic_regression(num_classes, dtype, penalty, l1_ratio,
                             fit_intercept):
    """Compare cuLog and skLog predictions on a 100000-row synthetic set.

    The first 80% of the rows are used for training and the rest for
    testing. The fraction of test rows where the two models disagree must
    stay below 1e-1 for the 'elasticnet', 'l1' and 'l2' penalties and below
    1e-3 otherwise.
    """
    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = (LooseVersion(str(sklearn.__version__))
                >= LooseVersion("0.21.0"))
    if not sk_check and penalty == 'elasticnet':
        # The adjacent literals previously joined as "withelastic net.",
        # and the message said "> 0.21" although the check is ">=".
        pytest.skip("Need sklearn >= 0.21 for testing logistic with "
                    "elastic net.")

    nrows = 100000
    train_rows = np.int32(nrows * 0.8)
    # NOTE(review): num_classes is passed as n_features and n_classes is
    # left at its default - confirm this is intended.
    X, y = make_classification(n_samples=nrows, n_features=num_classes,
                               n_redundant=0, n_informative=2)

    X_test = np.asarray(X[train_rows:, 0:]).astype(dtype)
    X_train = np.asarray(X[0:train_rows, :]).astype(dtype)
    y_train = np.asarray(y[0:train_rows, ]).astype(dtype)

    culog = cuLog(penalty=penalty, l1_ratio=l1_ratio, C=5.0,
                  fit_intercept=fit_intercept, tol=1e-8)
    culog.fit(X_train, y_train)

    # Only solver=saga supports elasticnet in scikit
    if penalty in ['elasticnet', 'l1']:
        if sk_check:
            sklog = skLog(penalty=penalty, l1_ratio=l1_ratio, solver='saga',
                          C=5.0, fit_intercept=fit_intercept)
        else:
            sklog = skLog(penalty=penalty, solver='saga', C=5.0,
                          fit_intercept=fit_intercept)
    elif penalty == 'l2':
        sklog = skLog(penalty=penalty, solver='lbfgs', C=5.0,
                      fit_intercept=fit_intercept)
    else:
        if sk_check:
            sklog = skLog(penalty=penalty, solver='lbfgs', C=5.0,
                          fit_intercept=fit_intercept)
        else:
            # Older sklearn path: an l2 penalty with a very large C is
            # used in place of the requested penalty.
            sklog = skLog(penalty='l2', solver='lbfgs', C=1e9,
                          fit_intercept=fit_intercept)
    sklog.fit(X_train, y_train)

    preds = culog.predict(X_test)
    skpreds = sklog.predict(X_test)

    # Derive the denominator from the test split instead of hard-coding
    # 20000, so it stays correct if nrows or the split ratio changes.
    n_test = X_test.shape[0]
    mismatch_rate = np.sum(preds.to_array() != skpreds) / n_test

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    if penalty in ['elasticnet', 'l1', 'l2']:
        assert mismatch_rate < 1e-1
    else:
        # This is the only case where cuml and sklearn actually do a similar
        # lbfgs, other cases cuml does owl or sklearn does saga
        assert mismatch_rate < 1e-3