Beispiel #1
0
def test_logistic_regression(strategy,
                             use_wrapper,
                             nrows,
                             num_classes,
                             column_info,
                             dtype=np.float32):
    """Fit a multiclass meta-classifier around cuLog and check its accuracy.

    ``column_info`` is unpacked into ``(ncols, n_info)``. When
    ``use_wrapper`` is true the generic ``MulticlassClassifier`` is built
    with the given ``strategy``; otherwise ``OneVsOneClassifier`` is used for
    ``strategy == 'ovo'`` and ``OneVsRestClassifier`` for anything else.
    The test passes when the score on the held-out split exceeds 0.7.
    """
    ncols, n_info = column_info

    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols, n_info=n_info,
        num_classes=num_classes)
    y_train, y_test = y_train.astype(dtype), y_test.astype(dtype)

    base_estimator = cuLog()

    # Pick the meta-estimator: generic wrapper, or the dedicated classes.
    if use_wrapper:
        classifier = cu_multiclass.MulticlassClassifier(base_estimator,
                                                        strategy=strategy)
    elif strategy == 'ovo':
        classifier = cu_multiclass.OneVsOneClassifier(base_estimator)
    else:
        classifier = cu_multiclass.OneVsRestClassifier(base_estimator)

    classifier.fit(X_train, y_train)
    assert classifier.score(X_test, y_test) > 0.7
Beispiel #2
0
def test_logistic_regression_decision_function(
    dtype, nrows, column_info, num_classes, fit_intercept
):
    """Compare cuLog's ``decision_function`` with scikit-learn's.

    A cuLog model is fitted, then its coefficients (transposed) and
    intercept are copied onto an *unfitted* sklearn model so that both
    evaluate the same linear function on ``X_test``. ``column_info`` is
    unpacked into ``(ncols, n_info)``.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols,
        n_info=n_info, num_classes=num_classes
    )

    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    # Hand-populate the sklearn estimator with cuML's fitted parameters.
    sklog = skLog(fit_intercept=fit_intercept)
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # Set on the instance, not the skLog class: a class-level attribute
        # would leak into every sklearn model created afterwards.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_dec_func = culog.decision_function(X_test)
    if num_classes > 2:
        # The multiclass cuML output is transposed before comparison.
        cu_dec_func = cu_dec_func.T
    sk_dec_func = sklog.decision_function(X_test)

    assert array_equal(cu_dec_func, sk_dec_func)
Beispiel #3
0
def test_logistic_regression_predict_proba(dtype, nrows, column_info,
                                           num_classes, fit_intercept):
    """Compare cuLog's probability outputs with scikit-learn's.

    A cuLog model is fitted and its parameters are copied to the host and
    assigned to an *unfitted* sklearn model, so both ``predict_proba`` and
    ``predict_log_proba`` are evaluated from identical coefficients.
    ``column_info`` is unpacked into ``(ncols, n_info)``.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = \
        make_classification_dataset(datatype=dtype, nrows=nrows,
                                    ncols=ncols, n_info=n_info,
                                    num_classes=num_classes)

    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept)
    culog.fit(X_train, y_train)

    # More than two classes needs the multinomial formulation on the
    # sklearn side; the binary case uses the default configuration.
    if num_classes > 2:
        sklog = skLog(fit_intercept=fit_intercept, solver="lbfgs",
                      multi_class="multinomial")
    else:
        sklog = skLog(fit_intercept=fit_intercept)
    sklog.coef_ = culog.coef_.copy_to_host().T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_.copy_to_host()
    else:
        # Set on the instance, not the skLog class: a class-level attribute
        # would leak into every sklearn model created afterwards.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test).get()
    sk_proba = sklog.predict_proba(X_test)

    cu_log_proba = culog.predict_log_proba(X_test).get()
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
Beispiel #4
0
def test_logistic_predict_convert_dtype(train_dtype, test_dtype):
    """Smoke test: predict with a dtype that may differ from training.

    The model is fitted on features cast to ``train_dtype`` and then asked
    to predict on held-out features cast to ``test_dtype``. There is no
    assertion; the test passes if ``predict`` does not raise.
    """
    features, labels = make_classification(n_samples=50, n_features=10,
                                           random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(
        features.astype(train_dtype), labels,
        train_size=0.8, random_state=0)

    model = cuLog()
    model.fit(X_train, y_train)

    model.predict(X_test.astype(test_dtype))
Beispiel #5
0
def test_logistic_regression_attributes():
    """Check that a fitted cuLog exposes the expected attribute names."""
    X, y = make_blobs()
    fitted = cuLog().fit(X, y, convert_dtype=True)

    expected_names = (
        "dtype", "solver_model", "coef_", "intercept_",
        "l1_ratio", "n_cols", "C", "penalty",
        "fit_intercept", "solver",
    )

    # Only presence is checked, not the attribute values.
    for name in expected_names:
        assert hasattr(fitted, name)
Beispiel #6
0
def test_logistic_regression(num_classes, dtype, penalty, l1_ratio,
                             fit_intercept, nrows, column_info, C, tol):
    """Check cuLog's test accuracy against scikit-learn's on synthetic data.

    Both models are fitted on the same training split; cuLog may trail
    sklearn's test score by at most 0.06. ``column_info`` is unpacked into
    ``(ncols, n_info)``.

    The 'l1' and 'elasticnet' penalties are marked xfail up front, which
    stops the test there, so the 'saga' configuration below is currently
    not exercised.
    """
    if penalty in ['l1', 'elasticnet']:
        pytest.xfail("OWL numerical stability is being improved")

    ncols, n_info = column_info
    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = LooseVersion(str(sklearn.__version__)) >= LooseVersion("0.21.0")
    if not sk_check and penalty == 'elasticnet':
        pytest.skip("Need sklearn >= 0.21 for testing logistic with "
                    "elastic net.")

    X_train, X_test, y_train, y_test = \
        make_classification_dataset(datatype=dtype, nrows=nrows,
                                    ncols=ncols, n_info=n_info,
                                    num_classes=num_classes)
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)
    culog = cuLog(penalty=penalty,
                  l1_ratio=l1_ratio,
                  C=C,
                  fit_intercept=fit_intercept,
                  tol=tol,
                  verbose=0)
    culog.fit(X_train, y_train)

    # Shared sklearn configuration; lbfgs is the solver unless overridden.
    sk_params = dict(penalty=penalty,
                     solver='lbfgs',
                     C=C,
                     fit_intercept=fit_intercept,
                     multi_class='auto')
    # Only solver=saga supports elasticnet in scikit
    if penalty in ['elasticnet', 'l1']:
        sk_params['solver'] = 'saga'
        if sk_check:
            # l1_ratio is only passed when sklearn is new enough (>= 0.21).
            sk_params['l1_ratio'] = l1_ratio
    sklog = skLog(**sk_params)

    sklog.fit(X_train, y_train)

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible

    assert culog.score(X_test, y_test) >= sklog.score(X_test, y_test) - 0.06
Beispiel #7
0
def test_logistic_regression_model_default(dtype):
    """Compare default-constructed cuLog against sklearn on a small dataset.

    cuLog's test score may be at most 0.022 below sklearn's.
    """
    X_train, X_test, y_train, y_test = small_classification_dataset(dtype)
    y_train, y_test = y_train.astype(dtype), y_test.astype(dtype)

    gpu_model = cuLog()
    gpu_model.fit(X_train, y_train)

    cpu_model = skLog(multi_class="auto")
    cpu_model.fit(X_train, y_train)

    gpu_score = gpu_model.score(X_test, y_test)
    cpu_score = cpu_model.score(X_test, y_test)
    assert gpu_score >= cpu_score - 0.022
Beispiel #8
0
def test_logistic_regression_input_type_consistency(constructor, dtype):
    """Check that predictions come back in the same container as the input.

    ``constructor`` builds both the features and the labels. When it is
    ``cudf.DataFrame`` the outputs are checked against ``Frame`` instead of
    the exact input type.
    """
    from cudf.core.frame import Frame

    X = constructor([[5, 10], [3, 1], [7, 8]]).astype(dtype)
    y = constructor([0, 1, 1]).astype(dtype)
    model = cuLog().fit(X, y, convert_dtype=True)

    expected_type = Frame if constructor == cudf.DataFrame else type(X)

    assert isinstance(model.predict_proba(X), expected_type)
    assert isinstance(model.predict(X), expected_type)
Beispiel #9
0
def test_logistic_regression_weighting(regression_dataset, option,
                                       test_status):
    """Compare cuLog and sklearn under sample or class weighting.

    ``regression_dataset[test_status]`` yields
    ``(regression_type, data, coef, output)``. ``option`` selects the
    weighting scheme: 'sample_weight' (random per-sample weights),
    'class_weight' (random weights for classes 0 and 1), 'balanced', or
    anything else for no weighting. Both models are fitted without an
    intercept; their row-normalised coefficients and their predictions on
    the training data must agree within the tolerances chosen per
    ``regression_type``.
    """
    regression_type, data, coef, output = regression_dataset[test_status]

    class_weight = None
    sample_weight = None
    if option == 'sample_weight':
        n_samples = data.shape[0]
        sample_weight = np.abs(np.random.rand(n_samples))
    elif option == 'class_weight':
        class_weight = np.random.rand(2)
        class_weight = {0: class_weight[0], 1: class_weight[1]}
    elif option == 'balanced':
        class_weight = 'balanced'

    culog = cuLog(fit_intercept=False, class_weight=class_weight)
    culog.fit(data, output, sample_weight=sample_weight)

    sklog = skLog(fit_intercept=False, class_weight=class_weight)
    sklog.fit(data, output, sample_weight=sample_weight)

    skcoef = np.squeeze(sklog.coef_)
    cucoef = np.squeeze(culog.coef_)
    # Normalise out-of-place: np.squeeze returns a view, so an in-place
    # division would rewrite the fitted sklearn coefficients that
    # sklog.predict() relies on further down.
    if regression_type == 'binary':
        skcoef = skcoef / np.linalg.norm(skcoef)
        cucoef = cucoef / np.linalg.norm(cucoef)
        unit_tol = 0.04
        total_tol = 0.08
    elif regression_type.startswith('multiclass'):
        # sklearn's coefficients are transposed to line up with cuML's.
        skcoef = skcoef.T
        skcoef = skcoef / np.linalg.norm(skcoef, axis=1)[:, None]
        cucoef = cucoef / np.linalg.norm(cucoef, axis=1)[:, None]
        unit_tol = 0.2
        total_tol = 0.3
    else:
        # Without this the tolerances would be undefined and the comparison
        # below would die with an unrelated NameError.
        raise ValueError(
            "unexpected regression_type: {!r}".format(regression_type))

    equality = array_equal(skcoef,
                           cucoef,
                           unit_tol=unit_tol,
                           total_tol=total_tol)
    if not equality:
        # Dump the reference and both fitted coefficient sets for debugging.
        print('\ncoef.shape: ', coef.shape)
        print('coef:\n', coef)
        print('skcoef.shape: ', skcoef.shape)
        print('skcoef:\n', skcoef)
        print('cucoef.shape: ', cucoef.shape)
        print('cucoef:\n', cucoef)
    assert equality

    cuOut = culog.predict(data)
    skOut = sklog.predict(data)
    assert array_equal(skOut, cuOut, unit_tol=unit_tol, total_tol=total_tol)
Beispiel #10
0
def test_logistic_regression_unscaled(dtype, penalty, l1_ratio):
    """Fit cuLog on the raw breast cancer dataset and check accuracy.

    The features are deliberately left unscaled, which could lead to
    numerical problems (fixed in PR #2543). Both the train and the test
    score must reach 0.94.
    """
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X.astype(dtype), y.astype(dtype), random_state=42)

    model = cuLog(penalty=penalty,
                  C=1,
                  tol=1e-4,
                  fit_intercept=True,
                  max_iter=5000,
                  l1_ratio=l1_ratio)
    model.fit(X_train, y_train)

    min_train_score = 0.94
    min_test_score = 0.94

    assert model.score(X_train, y_train) >= min_train_score
    assert model.score(X_test, y_test) >= min_test_score
Beispiel #11
0
def test_logistic_regression_sparse_only(dtype, nlp_20news):
    """Fit cuLog on sparse (CSR) features from the ``nlp_20news`` fixture.

    The test score must come within 0.01 of a recorded sklearn reference.
    """
    # sklearn score with max_iter = 10000
    sklearn_score = 0.878
    acceptable_score = sklearn_score - 0.01

    features, labels = nlp_20news
    features = csr_matrix(features.astype(dtype))
    # NOTE(review): .get() suggests the labels arrive as a device array;
    # confirm against the fixture.
    labels = labels.get().astype(dtype)

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=42)

    model = cuLog()
    model.fit(X_train, y_train)

    assert model.score(X_test, y_test) >= acceptable_score
Beispiel #12
0
def test_logistic_regression_predict_proba(
    dtype, nrows, column_info, num_classes, fit_intercept, sparse_input
):
    """Compare cuLog's probability outputs with scikit-learn's.

    A cuLog model is fitted (optionally on CSR input when ``sparse_input``
    is true) and its parameters are assigned to an *unfitted* sklearn
    model, so ``predict_proba`` and ``predict_log_proba`` are evaluated from
    identical coefficients. ``column_info`` is unpacked into
    ``(ncols, n_info)``.
    """
    ncols, n_info = column_info
    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols,
        n_info=n_info, num_classes=num_classes
    )
    X_train = csr_matrix(X_train) if sparse_input else X_train
    X_test = csr_matrix(X_test) if sparse_input else X_test

    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)

    culog = cuLog(fit_intercept=fit_intercept, output_type="numpy")
    culog.fit(X_train, y_train)

    # More than two classes needs the multinomial formulation on the
    # sklearn side; the binary case uses the default configuration.
    if num_classes > 2:
        sklog = skLog(
            fit_intercept=fit_intercept,
            solver="lbfgs",
            multi_class="multinomial"
        )
    else:
        sklog = skLog(fit_intercept=fit_intercept)
    sklog.coef_ = culog.coef_.T
    if fit_intercept:
        sklog.intercept_ = culog.intercept_
    else:
        # Set on the instance, not the skLog class: a class-level attribute
        # would leak into every sklearn model created afterwards.
        sklog.intercept_ = 0
    sklog.classes_ = np.arange(num_classes)

    cu_proba = culog.predict_proba(X_test)
    sk_proba = sklog.predict_proba(X_test)

    cu_log_proba = culog.predict_log_proba(X_test)
    sk_log_proba = sklog.predict_log_proba(X_test)

    assert array_equal(cu_proba, sk_proba)
    assert array_equal(cu_log_proba, sk_log_proba)
Beispiel #13
0
def test_logistic_regression_model_digits(
        dtype, order, sparse_input, fit_intercept, penalty):
    """Fit cuLog on the digits dataset and check the test accuracy.

    ``sparse_input`` switches the features to CSR. For dense input the
    train and test splits are copied into the memory layout named by
    ``order`` before fitting and scoring.
    """
    # smallest sklearn score with max_iter = 10000
    # put it as a constant here, because sklearn 0.23.1 needs a lot of iters
    # to converge and has a bug returning an unrelated error if not converged.
    acceptable_score = 0.95

    digits = load_digits()

    X_dense = digits.data.astype(dtype)
    X = csr_matrix(X_dense) if sparse_input else X_dense

    y = digits.target.astype(dtype)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    if not sparse_input:
        # Apply the requested layout to the arrays the model actually sees.
        # The previous `X_dense.reshape(..., order=order)` discarded its
        # result, so `order` had no effect on the test.
        # NOTE(review): assumes `order` is a NumPy order flag such as
        # 'C' or 'F' - confirm against the parametrization.
        X_train = X_train.astype(dtype, order=order)
        X_test = X_test.astype(dtype, order=order)

    culog = cuLog(fit_intercept=fit_intercept, penalty=penalty)
    culog.fit(X_train, y_train)
    score = culog.score(X_test, y_test)

    assert score >= acceptable_score
Beispiel #14
0
def test_logistic_regression(
    num_classes, dtype, penalty, l1_ratio,
    fit_intercept, nrows, column_info, C, tol
):
    """Check cuLog's train and test accuracy against scikit-learn's.

    Both models are fitted on the same synthetic training split. cuLog may
    trail sklearn by 0.006 on the train score and 0.012 on the test score
    (ten times that for 10 classes with an 'l1' or 'elasticnet' penalty).
    cuLog must also predict as many distinct labels as occur in
    ``y_test``. ``column_info`` is unpacked into ``(ncols, n_info)``.
    """
    ncols, n_info = column_info
    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = LooseVersion(str(sklearn.__version__)) >= LooseVersion("0.21.0")
    if not sk_check and penalty == "elasticnet":
        pytest.skip(
            "Need sklearn >= 0.21 for testing logistic with elastic net."
        )

    X_train, X_test, y_train, y_test = make_classification_dataset(
        datatype=dtype, nrows=nrows, ncols=ncols,
        n_info=n_info, num_classes=num_classes
    )
    y_train = y_train.astype(dtype)
    y_test = y_test.astype(dtype)
    culog = cuLog(
        penalty=penalty, l1_ratio=l1_ratio, C=C,
        fit_intercept=fit_intercept, tol=tol
    )
    culog.fit(X_train, y_train)

    # Shared sklearn configuration; lbfgs is the solver unless overridden.
    sk_params = dict(
        penalty=penalty,
        solver="lbfgs",
        C=C,
        fit_intercept=fit_intercept,
        multi_class="auto",
    )
    # Only solver=saga supports elasticnet in scikit
    if penalty in ["elasticnet", "l1"]:
        sk_params["solver"] = "saga"
        if sk_check:
            # l1_ratio is only passed when sklearn is new enough (>= 0.21).
            sk_params["l1_ratio"] = l1_ratio
    sklog = skLog(**sk_params)

    sklog.fit(X_train, y_train)

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    cu_preds = culog.predict(X_test)
    tol_test = 0.012
    tol_train = 0.006
    if num_classes == 10 and penalty in ["elasticnet", "l1"]:
        tol_test *= 10
        tol_train *= 10

    assert culog.score(X_train, y_train) >= sklog.score(X_train, y_train) - \
        tol_train
    assert culog.score(X_test, y_test) >= sklog.score(X_test, y_test) - \
        tol_test
    assert len(np.unique(cu_preds)) == len(np.unique(y_test))
Beispiel #15
0
def test_logistic_regression(num_classes, dtype, penalty, l1_ratio,
                             fit_intercept):
    """Compare cuLog predictions with scikit-learn's on synthetic data.

    100000 samples are generated and split 80/20 by position. Both models
    are fitted on the training part, and the fraction of test rows where
    their predictions differ must stay below a penalty-dependent bound.
    """
    # Checking sklearn >= 0.21 for testing elasticnet
    sk_check = LooseVersion(str(sklearn.__version__)) >= LooseVersion("0.21.0")
    if not sk_check and penalty == 'elasticnet':
        pytest.skip("Need sklearn >= 0.21 for testing logistic with "
                    "elastic net.")

    nrows = 100000
    train_rows = np.int32(nrows * 0.8)
    # NOTE(review): num_classes is passed as n_features here, and n_classes
    # is left at make_classification's default - confirm this is intended.
    X, y = make_classification(n_samples=nrows,
                               n_features=num_classes,
                               n_redundant=0,
                               n_informative=2)

    X_test = np.asarray(X[train_rows:, 0:]).astype(dtype)
    X_train = np.asarray(X[0:train_rows, :]).astype(dtype)
    y_train = np.asarray(y[0:train_rows, ]).astype(dtype)
    # Derived from the split rather than hard-coded (20000 for this setup).
    n_test = X_test.shape[0]

    culog = cuLog(penalty=penalty,
                  l1_ratio=l1_ratio,
                  C=5.0,
                  fit_intercept=fit_intercept,
                  tol=1e-8)
    culog.fit(X_train, y_train)

    # Shared sklearn configuration; lbfgs is the solver unless overridden.
    sk_params = dict(penalty=penalty,
                     solver='lbfgs',
                     C=5.0,
                     fit_intercept=fit_intercept)
    # Only solver=saga supports elasticnet in scikit
    if penalty in ['elasticnet', 'l1']:
        sk_params['solver'] = 'saga'
        if sk_check:
            # l1_ratio is only passed when sklearn is new enough (>= 0.21).
            sk_params['l1_ratio'] = l1_ratio
    elif penalty != 'l2' and not sk_check:
        # For any other penalty on sklearn < 0.21, fall back to l2 with a
        # very large C instead of passing the penalty through.
        sk_params.update(penalty='l2', C=1e9)
    sklog = skLog(**sk_params)

    sklog.fit(X_train, y_train)

    preds = culog.predict(X_test)
    skpreds = sklog.predict(X_test)

    mismatch_rate = np.sum(preds.to_array() != skpreds) / n_test

    # Setting tolerance to lowest possible per loss to detect regressions
    # as much as possible
    if penalty in ['elasticnet', 'l1', 'l2']:
        assert mismatch_rate < 1e-1
    else:
        # This is the only case where cuml and sklearn actually do a similar
        # lbfgs, other cases cuml does owl or sklearn does saga
        assert mismatch_rate < 1e-3