def test_logistic_regressioncv_class_weights():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)

    msg = ("In LogisticRegressionCV the liblinear solver cannot handle "
           "multiclass with class_weight of type dict. Use the lbfgs, "
           "newton-cg or sag solvers or set class_weight='balanced'")
    clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2},
                                   solver='liblinear')
    assert_raise_message(ValueError, msg, clf_lib.fit, X, y)
    y_ = y.copy()
    y_[y == 2] = 1
    clf_lib.fit(X, y_)
    assert_array_equal(clf_lib.classes_, [0, 1])

    # Test for class_weight=balanced
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               random_state=0)
    clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
                                   class_weight='balanced')
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False,
                                   class_weight='balanced')
    clf_lib.fit(X, y)
    clf_sag = LogisticRegressionCV(solver='sag', fit_intercept=False,
                                   class_weight='balanced', max_iter=2000)
    clf_sag.fit(X, y)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
    assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4)
    assert_array_almost_equal(clf_lib.coef_, clf_sag.coef_, decimal=4)
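For contrast, a minimal hedged sketch (reusing the imports of the test above): solvers other than liblinear, such as lbfgs, do accept a per-class weight dict on a multiclass problem, which is exactly what the error message recommends:

X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                           n_classes=3, random_state=0)
clf_lbf = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2, 2: 0.7},
                               solver='lbfgs')
clf_lbf.fit(X, y)
assert_array_equal(clf_lbf.classes_, [0, 1, 2])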
def test_logistic_regressioncv_class_weights():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)

    # Test that liblinear fails when a class_weight of type dict is
    # provided for a multiclass problem. It can, however, handle
    # binary problems.
    clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2},
                                   solver='liblinear')
    assert_raises(ValueError, clf_lib.fit, X, y)
    y_ = y.copy()
    y_[y == 2] = 1
    clf_lib.fit(X, y_)
    assert_array_equal(clf_lib.classes_, [0, 1])

    # Test for class_weight=auto
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               random_state=0)
    clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
                                   class_weight='auto')
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False,
                                   class_weight='auto')
    clf_lib.fit(X, y)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
def test_make_classification():
    weights = [0.1, 0.25]
    X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                               n_redundant=1, n_repeated=1, n_classes=3,
                               n_clusters_per_class=1, hypercube=False,
                               shift=None, scale=None, weights=weights,
                               random_state=0)

    assert_equal(weights, [0.1, 0.25])
    assert_equal(X.shape, (100, 20), "X shape mismatch")
    assert_equal(y.shape, (100,), "y shape mismatch")
    assert_equal(np.unique(y).shape, (3,), "Unexpected number of classes")
    assert_equal(sum(y == 0), 10, "Unexpected number of samples in class #0")
    assert_equal(sum(y == 1), 25, "Unexpected number of samples in class #1")
    assert_equal(sum(y == 2), 65, "Unexpected number of samples in class #2")

    # Test for n_features > 30
    X, y = make_classification(n_samples=2000, n_features=31, n_informative=31,
                               n_redundant=0, n_repeated=0, hypercube=True,
                               scale=0.5, random_state=0)

    assert_equal(X.shape, (2000, 31), "X shape mismatch")
    assert_equal(y.shape, (2000,), "y shape mismatch")
    assert_equal(np.unique(X.view([('', X.dtype)]*X.shape[1])).view(X.dtype)
                 .reshape(-1, X.shape[1]).shape[0], 2000,
                 "Unexpected number of unique rows")
    def setUp(self):
        np.random.seed(488881)
        # binomial
        x, y = make_classification(n_samples=300, random_state=6601)
        x_sparse = csr_matrix(x)

        x_wide, y_wide = make_classification(n_samples=100, n_features=150,
                                             random_state=8911)
        x_wide_sparse = csr_matrix(x_wide)
        self.binomial = [(x, y), (x_sparse, y), (x_wide, y_wide),
                         (x_wide_sparse, y_wide)]

        # multinomial
        x, y = make_classification(n_samples=400, n_classes=3, n_informative=15,
                                   n_features=25, random_state=10585)
        x_sparse = csr_matrix(x)

        x_wide, y_wide = make_classification(n_samples=400, n_classes=3,
                                             n_informative=15, n_features=500,
                                             random_state=15841)
        x_wide_sparse = csr_matrix(x_wide)
        self.multinomial = [(x, y), (x_sparse, y), (x_wide, y_wide),
                            (x_wide_sparse, y_wide)]

        self.alphas = [0., 0.25, 0.50, 0.75, 1.]
        self.n_splits = [-1, 0, 5]
        self.scoring = [
            "accuracy",
            "roc_auc",
            "average_precision",
            "log_loss",
            "precision_macro",
            "precision_micro",
            "precision_weighted",
            "f1_micro",
            "f1_macro",
            "f1_weighted",
        ]
        self.multinomial_scoring = [
            "accuracy",
            "log_loss",
            "precision_macro",
            "precision_micro",
            "precision_weighted",
            "f1_micro",
            "f1_macro",
            "f1_weighted"
        ]
def test_grid_search_precomputed_kernel_error_kernel_function():
    """Test that grid search returns an error when using a kernel_function"""
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)
    kernel_function = lambda x1, x2: np.dot(x1, x2.T)
    clf = SVC(kernel=kernel_function)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_, y_)
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    for name, Tree in CLF_TREES.items():
        clf = Tree(random_state=0)

        clf.fit(X, y)
        importances = clf.feature_importances_
        n_important = np.sum(importances > 0.1)

        assert_equal(importances.shape[0], 10, "Failed with {0}".format(name))
        assert_equal(n_important, 3, "Failed with {0}".format(name))

        X_new = clf.transform(X, threshold="mean")
        assert_less(0, X_new.shape[1], "Failed with {0}".format(name))
        assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name))

    # Check on iris that importances are the same for all builders
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(iris.data, iris.target)
    clf2 = DecisionTreeClassifier(random_state=0,
                                  max_leaf_nodes=len(iris.data))
    clf2.fit(iris.data, iris.target)

    assert_array_equal(clf.feature_importances_,
                       clf2.feature_importances_)
def test_deprecated_score_func():
    # test that old deprecated way of passing a score / loss function is still
    # supported
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)
    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X[:180], y[:180])
    y_pred = cv.predict(X[180:])
    C = cv.best_estimator_.C

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_func = cv.predict(X[180:])
    C_func = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred_func)
    assert_equal(C, C_func)

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, loss_func=f1_loss)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_loss = cv.predict(X[180:])
    C_loss = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred_loss)
    assert_equal(C, C_loss)
def test_grid_search_precomputed_kernel():
    """Test that grid search works when the input features are given in the
    form of a precomputed kernel matrix """
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
def test_partial_dependence_unknown_feature(estimator, features):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    err_msg = 'all features must be in'
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [features])
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    #np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                        cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)
    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def test_logreg_l1_sparse_data():
    # Because liblinear penalizes the intercept and saga does not, we do not
    # fit the intercept to make it possible to compare the coefficients of
    # the two models at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    X_noise = rng.normal(scale=0.1, size=(n_samples, 3))
    X_constant = np.zeros(shape=(n_samples, 2))
    X = np.concatenate((X, X_noise, X_constant), axis=1)
    X[X < 1] = 0
    X = sparse.csr_matrix(X)

    lr_liblinear = LogisticRegression(penalty="l1", C=1.0, solver='liblinear',
                                      fit_intercept=False,
                                      tol=1e-10)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                 fit_intercept=False,
                                 max_iter=1000, tol=1e-10)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)
    # Noise and constant features should be regularized to zero by the l1
    # penalty
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))

    # Check that solving on the sparse and dense data yield the same results
    lr_saga_dense = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                       fit_intercept=False,
                                       max_iter=1000, tol=1e-10)
    lr_saga_dense.fit(X.toarray(), y)
    assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_)
def test_linearsvc_parameters():
    # Test possible parameter combinations in LinearSVC
    # Generate list of possible parameter combinations
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties, duals = ['l1', 'l2', 'bar'], [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if ((loss, penalty) == ('hinge', 'l1') or
                (loss, penalty, dual) == ('hinge', 'l2', False) or
                (penalty, dual) == ('l1', True) or
                loss == 'foo' or penalty == 'bar'):

            assert_raises_regexp(ValueError,
                                 "Unsupported set of arguments.*penalty='%s.*"
                                 "loss='%s.*dual=%s"
                                 % (penalty, loss, dual),
                                 clf.fit, X, y)
        else:
            clf.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    assert_raises_regexp(ValueError, ".*loss='l3' is not supported.*",
                         svm.LinearSVC(loss="l3").fit, X, y)
def test_logistic_regression_solvers():
    X, y = make_classification(n_features=10, n_informative=5, random_state=0)

    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    lib = LogisticRegression(fit_intercept=False)
    sag = LogisticRegression(solver='sag', fit_intercept=False,
                             random_state=42)
    saga = LogisticRegression(solver='saga', fit_intercept=False,
                              random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    saga.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, sag.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, lib.coef_, decimal=3)
def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-7
    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False, tol=tol)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    lib = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                             max_iter=1000, random_state=42)
    saga = LogisticRegression(solver='saga', fit_intercept=False, tol=tol,
                              max_iter=10000, random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    saga.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, sag.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, lib.coef_, decimal=4)
def test_logistic_loss_and_grad():
    X_ref, y = make_classification(n_samples=20, random_state=0)
    n_features = X_ref.shape[1]

    X_sp = X_ref.copy()
    X_sp[X_sp < .1] = 0
    X_sp = sp.csr_matrix(X_sp)
    for X in (X_ref, X_sp):
        w = np.zeros(n_features)

        # First check that our derivation of the grad is correct
        loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1.)
        approx_grad = optimize.approx_fprime(
            w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3
        )
        assert_array_almost_equal(grad, approx_grad, decimal=2)

        # Second check that our intercept implementation is good
        w = np.zeros(n_features + 1)
        loss_interp, grad_interp = _logistic_loss_and_grad(
            w, X, y, alpha=1.
        )
        assert_array_almost_equal(loss, loss_interp)

        approx_grad = optimize.approx_fprime(
            w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3
        )
        assert_array_almost_equal(grad_interp, approx_grad, decimal=2)
def test_intercept_logistic_helper():
    n_samples, n_features = 10, 5
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               random_state=0)

    # Fit intercept case.
    alpha = 1.
    w = np.ones(n_features + 1)
    grad_interp, hess_interp = _logistic_grad_hess(w, X, y, alpha)
    loss_interp = _logistic_loss(w, X, y, alpha)

    # Do not fit intercept. This can be considered equivalent to adding
    # a feature vector of ones, i.e., a column of ones.
    X_ = np.hstack((X, np.ones(10)[:, np.newaxis]))
    grad, hess = _logistic_grad_hess(w, X_, y, alpha)
    loss = _logistic_loss(w, X_, y, alpha)

    # In the fit_intercept=False case, the feature vector of ones is
    # penalized. This should be taken care of.
    assert_almost_equal(loss_interp + 0.5 * (w[-1] ** 2), loss)

    # Check gradient.
    assert_array_almost_equal(grad_interp[:n_features], grad[:n_features])
    assert_almost_equal(grad_interp[-1] + alpha * w[-1], grad[-1])

    rng = np.random.RandomState(0)
    grad = rng.rand(n_features + 1)
    hess_interp = hess_interp(grad)
    hess = hess(grad)
    assert_array_almost_equal(hess_interp[:n_features], hess[:n_features])
    assert_almost_equal(hess_interp[-1] + alpha * grad[-1], hess[-1])
def main():
    (X, y) = skd.make_classification()
    N = X.shape[0]
    X = np.append(X, np.ones((N, 1)), axis=1)
    y = 2 * y - 1

    skf = StratifiedKFold(y, 5)
    for train, test in skf:
        X_train = X[train, :]
        y_train = y[train]

        X_test = X[test, :]
        y_test = y[test]

        C = 0.01

        # dual co-ordinate descent SVM
        clf = SVMCD(C)
        clf.fit(X_train, y_train, w_prior=np.ones(21))
        pred = clf.decision_function(X_test)
        score = clf.score(X_test, y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print(score, metrics.auc(fpr, tpr), "//", end=" ")
        w1 = clf.w

        # standard SVM
        clf = SVC(C=C, kernel='linear')
        clf.fit(X_train, y_train)
        pred = clf.decision_function(X_test)
        score = clf.score(X_test, y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print(score, metrics.auc(fpr, tpr))
        w2 = clf.coef_
        w2.shape = (21,)
def test_logistic_regression_convergence_warnings():
    """Test that warnings are raised if model does not converge"""

    X, y = make_classification(n_samples=20, n_features=20)
    clf_lib = LogisticRegression(solver='liblinear', max_iter=2, verbose=1)
    assert_warns(ConvergenceWarning, clf_lib.fit, X, y)
    assert_equal(clf_lib.n_iter_, 2)
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increase monotonically with n_iter_no_change
    # Set validation
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def test_liblinear_random_state():
    X, y = make_classification(n_samples=20)
    lr1 = LogisticRegression(random_state=0)
    lr1.fit(X, y)
    lr2 = LogisticRegression(random_state=0)
    lr2.fit(X, y)
    assert_array_almost_equal(lr1.coef_, lr2.coef_)
def test_grid_search_failing_classifier():
    # GridSearchCV with error_score != 'raise'
    # Ensures that a warning is raised and the score is reset where appropriate.

    X, y = make_classification(n_samples=20, n_features=10, random_state=0)

    clf = FailingClassifier()

    # refit=False because we only want to check that errors caused by fits
    # to individual folds will be caught and warnings raised instead. If
    # refit was done, then an exception would be raised on refit and not
    # caught by grid_search (expected behavior), and this would cause an
    # error in this test.
    gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
                      refit=False, error_score=0.0)

    assert_warns(FitFailedWarning, gs.fit, X, y)

    # Ensure that grid scores were set to zero as required for those fits
    # that are expected to fail.
    assert all(np.all(this_point.cv_validation_scores == 0.0)
               for this_point in gs.grid_scores_
               if this_point.parameters['parameter'] ==
               FailingClassifier.FAILING_PARAMETER)

    gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
                      refit=False, error_score=float('nan'))
    assert_warns(FitFailedWarning, gs.fit, X, y)
    assert all(np.all(np.isnan(this_point.cv_validation_scores))
               for this_point in gs.grid_scores_
               if this_point.parameters['parameter'] ==
               FailingClassifier.FAILING_PARAMETER)
def test_class_weight_auto_classifiers():
    """Test that class_weight="auto" improves f1-score"""

    # This test is broken; its success depends on:
    # * a rare fortuitous RNG seed for make_classification; and
    # * the use of binary F1 over a seemingly arbitrary positive class for two
    #   datasets, and weighted average F1 for the third.
    # Its expectations need to be clarified and reimplemented.
    raise SkipTest("This test requires redefinition")

    classifiers = all_estimators(type_filter="classifier")

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers if "class_weight" in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[0.8, 0.2], [0.8, 0.1, 0.1]]):
        # create unbalanced dataset
        X, y = make_classification(
            n_classes=n_classes, n_samples=200, n_features=10, weights=weights, random_state=0, n_informative=n_classes
        )
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
        for name, Classifier in classifiers:
            if (
                name != "NuSVC"
                # the sparse version has a parameter that doesn't do anything
                and not name.startswith("RidgeClassifier")
                # RidgeClassifier behaves unexpectedly
                # FIXME!
                and not name.endswith("NB")
            ):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                yield (check_class_weight_auto_classifiers, name, Classifier, X_train, y_train, X_test, y_test, weights)
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)

    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if early_stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36),
                                              (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
    def test_cvbestsearchrefit_select_k_best(self):
        list_C_value = range(2, 10, 1)
        #        print repr(list_C_value)
        for C_value in list_C_value:
            #            C_value = 2
            #            print C_value
            X, y = datasets.make_classification(n_samples=100, n_features=500, n_informative=5)
            n_folds_nested = 2
            # random_state = 0
            k_values = [2, 3, 4, 5, 6]
            key_y_pred = "y" + conf.SEP + conf.PREDICTION
            # With EPAC
            methods = Methods(*[Pipe(SelectKBest(k=k), SVC(C=C_value, kernel="linear")) for k in k_values])
            wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
            wf.run(X=X, y=y)
            r_epac = wf.reduce().values()[0]
            # - Without EPAC
            from sklearn.pipeline import Pipeline

            r_sklearn = dict()
            clf = Pipeline([("anova", SelectKBest(k=3)), ("svm", SVC(C=C_value, kernel="linear"))])
            parameters = {"anova__k": k_values}
            cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
            gscv.fit(X, y)
            r_sklearn[key_y_pred] = gscv.predict(X)
            r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
            r_sklearn[conf.BEST_PARAMS]["k"] = r_sklearn[conf.BEST_PARAMS]["anova__k"]
            # - Comparisons
            comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
            self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: prediction")
            for key_param in r_epac[conf.BEST_PARAMS][0]:
                if key_param in r_sklearn[conf.BEST_PARAMS]:
                    comp = r_sklearn[conf.BEST_PARAMS][key_param] == r_epac[conf.BEST_PARAMS][0][key_param]
                    self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: best parameters")
    def test_cv(self):
        X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
        n_folds = 2

        # = With EPAC
        wf = CV(SVC(kernel="linear"), n_folds=n_folds, reducer=ClassificationReport(keep=True))
        r_epac = wf.top_down(X=X, y=y)

        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
            # idx_train, idx_test  = cv.__iter__().next()
            X_train = X[idx_train, :]
            X_test = X[idx_test, :]
            y_train = y[idx_train]
            clf.fit(X_train, y_train)
            r_sklearn.append(clf.predict(X_test))

        # = Comparison
        key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
        for icv in range(n_folds):
            comp = np.all(np.asarray(r_epac[0][key2cmp]) == np.asarray(r_sklearn[0]))
            self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")

        # test reduce
        r_epac_reduce = wf.reduce().values()[0][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
        self.assertTrue(comp, u"Diff CV: EPAC reduce")
    def test_perm2(self):
        from epac.tests.wfexamples2test import WFExample2

        X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
        wf = WFExample2().get_workflow()
        wf.run(X=X, y=y)
        wf.reduce()
    def test_cvbestsearchrefit(self):
        X, y = datasets.make_classification(n_samples=12, n_features=10, n_informative=2)
        n_folds_nested = 2
        # random_state = 0
        C_values = [0.1, 0.5, 1, 2, 5]
        kernels = ["linear", "rbf"]
        key_y_pred = "y" + conf.SEP + conf.PREDICTION
        # With EPAC
        methods = Methods(*[SVC(C=C, kernel=kernel) for C in C_values for kernel in kernels])
        wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
        wf.run(X=X, y=y)
        r_epac = wf.reduce().values()[0]
        # - Without EPAC
        r_sklearn = dict()
        clf = SVC(kernel="linear")
        parameters = {"C": C_values, "kernel": kernels}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: prediction")
        for key_param in r_epac[conf.BEST_PARAMS][0]:
            if key_param in r_sklearn[conf.BEST_PARAMS]:
                comp = r_sklearn[conf.BEST_PARAMS][key_param] == r_epac[conf.BEST_PARAMS][0][key_param]
                self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: best parameters")
    def test_perm(self):
        X, y = datasets.make_classification(n_samples=20, n_features=5, n_informative=2)
        n_perms = 2
        rnd = 0
        # = With EPAC
        wf = Perms(SVC(kernel="linear"), n_perms=n_perms, permute="y", random_state=rnd, reducer=None)
        r_epac = wf.top_down(X=X, y=y)
        # = With SKLEARN
        clf = SVC(kernel="linear")
        r_sklearn = list()
        for perm in Permutations(n=y.shape[0], n_perms=n_perms, random_state=rnd):
            y_p = y[perm]
            clf.fit(X, y_p)
            r_sklearn.append(clf.predict(X))
        key2cmp = "y" + conf.SEP + conf.PREDICTION

        # = Comparison
        for iperm in range(n_perms):
            comp = np.all(np.asarray(r_epac[iperm][key2cmp]) == np.asarray(r_sklearn[iperm]))
            self.assertTrue(comp, u"Diff Perm: EPAC vs sklearn")
        # test reduce
        for iperm in range(n_perms):
            r_epac_reduce = wf.reduce().values()[iperm][key2cmp]
            comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn[iperm]))
            self.assertTrue(comp, u"Diff Perm: EPAC reduce")
    def test_engine_info(self):
        n_samples = 20
        n_features = 100
        n_proc = 2
        X, y = datasets.make_classification(n_samples=n_samples,
                                            n_features=n_features,
                                            n_informative=2,
                                            random_state=1)
        Xy = dict(X=X, y=y)
        cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                    SVC(kernel="rbf")]),
                          n_folds=3)
        swf_engine = SomaWorkflowEngine(cv_svm_local,
                                        num_processes=n_proc,
                                        resource_id="jl237561@gabriel",
                                        login="******",
                                        remove_finished_wf=False,
                                        remove_local_tree=False,
                                        queue="Global_long")
        swf_engine.run(**Xy)
        print("engine_info ================")
        for job_info in swf_engine.engine_info:
            print("  job_info=================")
            print("  mem_cost= ", job_info.mem_cost)
            print("  vmem_cost= ", job_info.vmem_cost)
            print("  time_cost= ", job_info.time_cost)
            self.assertTrue(job_info.time_cost > 0)
def test_grid_search_score_method():
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    clf = LinearSVC(random_state=0)
    grid = {'C': [.1]}

    search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y)
    search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y)
    search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid,
                                              scoring='roc_auc').fit(X, y)
    search_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y)

    # ChangedBehaviourWarning occurred previously (prior to #9005)
    score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y)
    score_accuracy = assert_no_warnings(search_accuracy.score, X, y)
    score_no_score_auc = assert_no_warnings(search_no_score_method_auc.score,
                                            X, y)
    score_auc = assert_no_warnings(search_auc.score, X, y)

    # ensure the test is sane
    assert_true(score_auc < 1.0)
    assert_true(score_accuracy < 1.0)
    assert_not_equal(score_auc, score_accuracy)

    assert_almost_equal(score_accuracy, score_no_scoring)
    assert_almost_equal(score_auc, score_no_score_auc)
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

X, y = make_classification(n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable
]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
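    # Hypothetical continuation (the loop body is truncated here): preprocess
    # each dataset, split it, and score every classifier, following the
    # scikit-learn classifier-comparison example this snippet resembles.
    # StandardScaler and train_test_split are assumed to be imported from
    # sklearn.preprocessing and sklearn.model_selection.
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.4, random_state=42)
    for clf in classifiers:
        clf.fit(X_train, y_train)
        print(type(clf).__name__, clf.score(X_test, y_test))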
    def test_classif_ten_classes(self, NeuralNet):
        X, y = make_classification(n_classes=10, n_informative=10)
        X = X.astype(floatX)
        y = y.astype(np.int32)
        self.classif(NeuralNet, X, y)
# Authors: Christos Aridas
#          Guillaume Lemaitre <*****@*****.**>
# License: MIT

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.over_sampling import ADASYN

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise it inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply ADASYN over-sampling
ada = ADASYN()
X_resampled, y_resampled = ada.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
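The snippet stops right after creating the axes; a hedged sketch of the remaining plotting, mirroring the pattern of the imblearn ADASYN example (an assumed continuation, not part of the original):

c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5)
ax2.set_title('ADASYN over-sampling')

f.legend((c0, c1), ('Class #0', 'Class #1'), loc='lower center', ncol=2)
plt.show()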
from sklearn.datasets import make_classification

features, target = make_classification(n_samples=100,
                                       n_features=3,
                                       n_informative=3,
                                       n_redundant=0,
                                       n_classes=2,
                                       weights=[.25, .75],
                                       random_state=1)

print('\n Features Matrix: ', features[:3])
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

X, y = make_classification(n_samples=16, n_features=2,
                           n_informative=2, n_redundant=0,
                           random_state=0)

model = LogisticRegression().fit(X, y)
y_hat = model.predict(X)
f_value = model.decision_function(X)

df = pd.DataFrame(np.vstack([f_value, y_hat, y]).T,
                  columns=["f", "y_hat", "y"])
df.sort_values("f", ascending=False).reset_index(drop=True)

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y, model.decision_function(X))
fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y, model.predict_proba(X)[:, 1])
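Because predict_proba is a monotone transform of decision_function for logistic regression, both calls yield the same (fpr, tpr) points, only with different threshold values. A minimal sketch (assuming matplotlib) to visualize the curve:

import matplotlib.pyplot as plt

plt.plot(fpr, tpr, marker='o')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()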
# Map each feature name to its importance
for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_):
    print('{0} : {1:.3f}'.format(name, value))

# Visualize the feature importances per column
sns.barplot(x=dt_clf.feature_importances_, y=iris_data.feature_names)

from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
%matplotlib inline

plt.title("3 Class values with 2 Features Sample data creation")

# Generate classification sample data with 2 features (for 2-D visualization)
# and 3 target classes.
X_features, y_labels = make_classification(n_features=2, n_redundant=0, n_informative=2,
                                           n_classes=3, n_clusters_per_class=1, random_state=0)

# Scatter-plot the 2 features as 2-D coordinates; each class value gets its own color.
plt.scatter(X_features[:, 0], X_features[:, 1], marker='o', c=y_labels, s=25, cmap='rainbow', edgecolor='k')

import numpy as np

# Helper that visualizes a classifier's decision boundary
def visualize_boundary(model, X, y):
    fig, ax = plt.subplots()
    
    # Show the training data as a scatter plot
    ax.scatter(X[:, 0], X[:, 1], c=y, s=25, cmap='rainbow', edgecolor='k',
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
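    # Hypothetical completion of the truncated helper: evaluate the model on
    # a dense grid spanning the current axes and draw filled contours, so the
    # predicted regions appear behind the scatter points (zorder=1).
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 200),
                         np.linspace(ylim[0], ylim[1], 200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='rainbow',
                levels=np.arange(Z.max() + 2) - 0.5, zorder=1)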
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

# https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#example-ensemble-plot-feature-transformation-py

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3,
                          n_estimators=n_estimator,
                          random_state=0)
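The example breaks off after building the embedding; a hedged sketch of the next step from the referenced scikit-learn example (an assumed continuation, not part of the original snippet): pipe the embedding into a logistic regression and evaluate it with a ROC curve.

rt_lm = LogisticRegression(max_iter=1000)
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)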
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from matplotlib import cm
from sklearn.utils.fixes import logsumexp

print(__doc__)

n_neighbors = 1
random_state = 0

# Create a tiny data set of 9 samples from 3 classes
X, y = make_classification(n_samples=9,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_classes=3,
                           n_clusters_per_class=1,
                           class_sep=1.0,
                           random_state=random_state)

# Plot the points in the original space
plt.figure()
ax = plt.gca()

# Draw the graph nodes
for i in range(X.shape[0]):
    ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center')
    ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4)


def p_i(X, i):
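    # Hypothetical body (the original is truncated): the probability that
    # sample i picks each other sample as its neighbor, i.e. a softmax over
    # negative squared distances; logsumexp keeps the normalization stable.
    diff = X[i] - X
    dist = np.einsum('ij,ij->i', diff, diff)
    dist[i] = np.inf  # a point never selects itself
    return np.exp(-dist - logsumexp(-dist))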
def test_random_search_cv_results(params):
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200,
                               n_features=100,
                               n_informative=3,
                               random_state=0)

    # scipy.stats dists now supports `seed` but we still support scipy 0.12
    # which doesn't support the seed. Hence the assertions in the test for
    # random_search alone should not depend on randomization.
    n_splits = 3
    n_search_iter = 30
    random_search = dcv.RandomizedSearchCV(
        SVC(),
        n_iter=n_search_iter,
        cv=n_splits,
        iid=False,
        param_distributions=params,
        return_train_score=True,
    )
    random_search.fit(X, y)
    random_search_iid = dcv.RandomizedSearchCV(
        SVC(),
        n_iter=n_search_iter,
        cv=n_splits,
        iid=True,
        param_distributions=params,
        return_train_score=True,
    )
    random_search_iid.fit(X, y)

    param_keys = ("param_C", "param_gamma")
    score_keys = (
        "mean_test_score",
        "mean_train_score",
        "rank_test_score",
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "std_test_score",
        "std_train_score",
        "mean_fit_time",
        "std_fit_time",
        "mean_score_time",
        "std_score_time",
    )
    n_cand = n_search_iter

    for search, iid in zip((random_search, random_search_iid), (False, True)):
        assert iid == search.iid
        cv_results = search.cv_results_
        # Check results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
        # For random_search, all the param array vals should be unmasked
        assert not (any(cv_results["param_C"].mask)
                    or any(cv_results["param_gamma"].mask))
    def test_classif_two_classes(self, NeuralNet):
        X, y = make_classification()
        X = X.astype(floatX)
        y = y.astype(np.int32)
        self.classif(NeuralNet, X, y)
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(brier_score_loss(y_test, prob_pos_clf),
                               brier_score_loss((y_test + 1) % 2,
                                                prob_pos_pc_clf_relabeled))

        # check that calibration can also deal with regressors that have
        # a decision_function
        clf_base_regressor = CalibratedClassifierCV(Ridge())
        clf_base_regressor.fit(X_train, y_train)
        clf_base_regressor.predict(X_test)

        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
import numpy as np
import torch
from sklearn import datasets


def cls_data():
    X, Y = datasets.make_classification(1000, 10, n_classes=2, random_state=11)
    X = X.astype(np.float64)
    Y = Y.astype(np.float64).reshape(-1, 1)
    Y[Y == 0] = -1
    return torch.from_numpy(X), torch.from_numpy(Y)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification

nb_samples = 100
nb_unlabeled = 75
tolerance = 0.01


def rbf(x1, x2, gamma=10.0):
    n = np.linalg.norm(x1 - x2, ord=1)
    return np.exp(-gamma * np.power(n, 2))


if __name__ == '__main__':
    # Create the dataset
    X, Y = make_classification(n_samples=nb_samples,
                               n_features=2,
                               n_informative=2,
                               n_redundant=0,
                               random_state=1000)
    Y[Y == 0] = -1
    Y[nb_samples - nb_unlabeled:nb_samples] = 0

    # Show the original dataset
    sns.set()
    fig, ax = plt.subplots(figsize=(12, 9))

    ax.scatter(X[Y == -1, 0],
               X[Y == -1, 1],
               color='r',
               marker='s',
               s=150,
               label="Class 0")
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split  # cross_validation was removed in sklearn 0.20
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report

N = 200

# Generate the training data
dat = make_classification(n_samples=N, n_features=2, n_informative=2, n_redundant=0, n_classes=2, n_clusters_per_class=1, weights=[0.7, 0.3], flip_y=0.01, shuffle=True, random_state=200)
X = dat[0]
y = dat[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

ppn = Perceptron(max_iter=1, eta0=0.1)  # n_iter was renamed to max_iter in newer sklearn
ppn.fit(X_train, y_train)

y_pred = ppn.predict(X_test)

print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
target_names = ['0', '1']
print(classification_report(y_test, y_pred, target_names=target_names))

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    #setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('r', 'b', 'g', 'y', 'm')
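    # Hypothetical completion of the truncated helper: classify a mesh over
    # the feature space and draw the decision regions behind the samples.
    cmap = ListedColormap(colors[:len(np.unique(y))])
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[y == cl, 0], X[y == cl, 1], alpha=0.8,
                    c=colors[idx], marker=markers[idx], label=str(cl))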
import pandas as pd
import numpy as np
# import sklearn
# print('sklearn: %s' % sklearn.__version__)

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

# create and configure model

X, y = make_classification(n_samples=16,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           random_state=0)

# model = LogisticRegression().fit(X, y)
model = LogisticRegression(solver='liblinear').fit(X, y)
# model = LogisticRegression(solver='lbfgs').fit(X, y)
y_hat = model.predict(X)
f_value = model.decision_function(X)

df = pd.DataFrame(np.vstack([f_value, y_hat, y]).T,
                  columns=["f", "y_hat", "y"])
df.sort_values("f", ascending=False).reset_index(drop=True)

print(df)
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_moons
from sklearn.datasets import make_gaussian_quantiles

data = make_classification(n_samples=3000,
                           n_features=200,
                           n_informative=40,
                           n_redundant=10,
                           n_repeated=5,
                           class_sep=1.0,
                           shuffle=False,
                           flip_y=.02,
                           random_state=147)

feat_gt = np.asarray([*([True] * 55), *([False] * 145)])
pd.to_pickle((data[0], data[1], feat_gt), 'hc_bl.pkl')

# MOON
moon_data = make_moons(n_samples=4000)

feat_gt = np.asarray([*([True] * 2), *([False] * 0)])

# pickled tuple: (X, y, feat_ground_truth)
pd.to_pickle((moon_data[0], moon_data[1], feat_gt), 'hcm_1.pkl')

moon_data = make_moons(n_samples=3000, noise=0.32)
moon_data = (np.hstack(
    (moon_data[0], moon_data[0] *
     [np.random.rand(), np.random.randint(1e9)],
     np.vstack(moon_data[0][:, 0] * np.random.rand()))), moon_data[1])
fm = 2 * (precision * recall / (precision + recall))
print(fm)


import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split


def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

data_X, class_label = make_classification(n_samples=1000, n_classes=2, weights=[1, 1], random_state=1)

trainX, testX, trainy, testy = train_test_split(data_X, class_label, test_size=0.3, random_state=1)

model = RandomForestClassifier()
model.fit(trainX, trainy)

probs = model.predict_proba(testX)

probs = probs[:, 1]

auc = roc_auc_score(testy, probs)
print('AUC: %.2f' % auc)

fpr, tpr, thresholds = roc_curve(testy, probs)
# for ind, i in enumerate(tree_params['criterion']):
#     plt.plot(tree_params['max_depth'], scores[ind], label=str(i))
# plt.legend(loc='best')
# plt.xlabel('max_depth')
# plt.ylabel('Accuracy')
# plt.show()


# clf = DecisionTree(max_depth=9, criterion='entropy')
# clf.fit(X_train, y_train)
# probs = clf.predict_proba(X_test)
# mean_probs = np.mean(probs, axis=0)
# print(mean_probs)


X, y = make_classification(n_features=2, n_redundant=0, n_samples=400,
                           random_state=RANDOM_STATE)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=RANDOM_STATE)

clf = DecisionTree(max_depth=4, criterion='gini')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
prob_pred = clf.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

if np.array_equal(np.argmax(prob_pred, axis=1), y_pred):
    print('predict_proba works!')
    assert_allclose(pca_full.explained_variance_,
                    pca_other.explained_variance_,
                    rtol=5e-2)
    assert_allclose(
        pca_full.explained_variance_ratio_,
        pca_other.explained_variance_ratio_,
        rtol=5e-2,
    )


@pytest.mark.parametrize(
    "X",
    [
        np.random.RandomState(0).randn(100, 80),
        datasets.make_classification(100, 80, n_informative=78,
                                     random_state=0)[0],
    ],
    ids=["random-data", "correlated-data"],
)
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_explained_variance_empirical(X, svd_solver):
    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
    X_pca = pca.fit_transform(X)
    assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0))

    expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
    expected_result = sorted(expected_result, reverse=True)[:2]
    assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)


@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 23:31:18 2020

@author: Jie.Hu
"""



# mean shift clustering
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.cluster import MeanShift

# define dataset
X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=4)
# define the model
model = MeanShift()
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = np.unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = np.where(yhat == cluster)
    # create scatter of these samples
    plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
"""
A recursive feature elimination example with automatic tuning of the
number of features selected with cross-validation.
"""
print(__doc__)

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=25,
                           n_informative=3,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=8,
                           n_clusters_per_class=1,
                           random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
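# The example is truncated here; its canonical continuation plots one CV
# score per candidate feature count (note: grid_scores_ was removed in
# newer scikit-learn in favour of cv_results_["mean_test_score"]):
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()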
Example #52
def test_metrics():
    X, y = make_classification(n_samples=500,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2)

    X = StandardScaler().fit_transform(X)

    model = SKLogisticRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1]
    y_true = y

    print('Confusion Matrix:',
          confusion_matrix(y_true,
                           y_pred) == metrics.confusion_matrix(y_true, y_pred),
          sep='\n')
    print('Delta Accuracy:',
          accuracy(y_true, y_pred) - metrics.accuracy_score(y_true, y_pred))

    print(
        'Delta Micro Recall:',
        recall(y_true, y_pred, kind='micro') -
        metrics.recall_score(y_true, y_pred, average='micro'))
    print(
        'Delta Micro Precision:',
        precision(y_true, y_pred, kind='micro') -
        metrics.precision_score(y_true, y_pred, average='micro'))
    print(
        'Delta Micro F1-score:',
        f1_score(y_true, y_pred, kind='micro') -
        metrics.f1_score(y_true, y_pred, average='micro'))

    print(
        'Delta Macro Recall:',
        recall(y_true, y_pred, kind='macro') -
        metrics.recall_score(y_true, y_pred, average='macro'))
    print(
        'Delta Macro Precision:',
        precision(y_true, y_pred, kind='macro') -
        metrics.precision_score(y_true, y_pred, average='macro'))
    print(
        'Delta Macro F1-score:',
        f1_score(y_true, y_pred, kind='macro') -
        metrics.f1_score(y_true, y_pred, average='macro'))

    print(
        'Delta All Recall:',
        recall(y_true, y_pred, kind='all') -
        metrics.recall_score(y_true, y_pred, average=None))
    print(
        'Delta All Precision:',
        precision(y_true, y_pred, kind='all') -
        metrics.precision_score(y_true, y_pred, average=None))
    print(
        'Delta All F1-score:',
        f1_score(y_true, y_pred, kind='all') -
        metrics.f1_score(y_true, y_pred, average=None))

    print('Delta log loss:',
          log_loss_score(y_true, y_proba) - metrics.log_loss(y_true, y_proba))
    print(
        'Delta zero one loss:',
        zero_one_loss(y_true, y_pred) - metrics.zero_one_loss(y_true, y_pred))

    print('*' * 80)
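    # A minimal sketch: the deltas printed above could be tightened into
    # real assertions rather than prints (the tolerance is an assumption):
    # assert np.isclose(accuracy(y_true, y_pred),
    #                   metrics.accuracy_score(y_true, y_pred), atol=1e-12)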

    X, y = make_regression(n_samples=500,
                           n_features=5,
                           n_informative=5,
                           n_targets=1)
    model = SKLinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    y_true = y

    print(
        'Delta mean_absolute_error:',
        mean_absolute_error(y_true, y_pred) -
        metrics.mean_absolute_error(y_true, y_pred))
    print(
        'Delta mean_squared_error:',
        mean_squared_error(y_true, y_pred) -
        metrics.mean_squared_error(y_true, y_pred))
    print('Delta r2_score:',
          r2_score(y_true, y_pred) - metrics.r2_score(y_true, y_pred))
def test_logistic_regression_sample_weights():
    X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
                               n_classes=2, random_state=0)
    sample_weight = y + 1

    for LR in [LogisticRegression, LogisticRegressionCV]:

        # Test that passing sample_weight as ones is the same as
        # not passing them at all (default None)
        for solver in ['lbfgs', 'liblinear']:
            clf_sw_none = LR(solver=solver, fit_intercept=False,
                             random_state=42)
            clf_sw_none.fit(X, y)
            clf_sw_ones = LR(solver=solver, fit_intercept=False,
                             random_state=42)
            clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
            assert_array_almost_equal(
                clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)

        # Test that sample weights give the same coefficients with the
        # lbfgs, newton-cg, sag and liblinear solvers
        clf_sw_lbfgs = LR(solver='lbfgs', fit_intercept=False, random_state=42)
        clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight)
        clf_sw_n = LR(solver='newton-cg', fit_intercept=False, random_state=42)
        clf_sw_n.fit(X, y, sample_weight=sample_weight)
        clf_sw_sag = LR(solver='sag', fit_intercept=False, tol=1e-10,
                        random_state=42)
        # ignore convergence warning due to small dataset
        with ignore_warnings():
            clf_sw_sag.fit(X, y, sample_weight=sample_weight)
        clf_sw_liblinear = LR(solver='liblinear', fit_intercept=False,
                              random_state=42)
        clf_sw_liblinear.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_liblinear.coef_, decimal=4)

        # Test that passing class_weight as {0: 1, 1: 2} is the same as
        # leaving class_weight at its default but adjusting sample weights
        # to be 2 for all instances of class 1
        for solver in ['lbfgs', 'liblinear']:
            clf_cw_12 = LR(solver=solver, fit_intercept=False,
                           class_weight={0: 1, 1: 2}, random_state=42)
            clf_cw_12.fit(X, y)
            clf_sw_12 = LR(solver=solver, fit_intercept=False, random_state=42)
            clf_sw_12.fit(X, y, sample_weight=sample_weight)
            assert_array_almost_equal(
                clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)

    # Test the above for l1 penalty and l2 penalty with dual=True,
    # since the patched liblinear code is different.
    clf_cw = LogisticRegression(
        solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
        penalty="l1", tol=1e-5, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(
        solver="liblinear", fit_intercept=False, penalty="l1", tol=1e-5,
        random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)

    clf_cw = LogisticRegression(
        solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
        penalty="l2", dual=True, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(
        solver="liblinear", fit_intercept=False, penalty="l2", dual=True,
        random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)
def test_saga_sparse():
    # Test LogisticRegressionCV with solver='saga' works for sparse matrices

    X, y = make_classification(n_samples=10, n_features=5, random_state=0)
    clf = LogisticRegressionCV(solver='saga')
    clf.fit(sparse.csr_matrix(X), y)
Example #55
            if train_predictions_1[i] == train_predictions_2[i]:
                newX_l.append(X_unlabeled[i])
                newy_l.append(train_predictions_1[i])
                dele.append(i)
        newX_l = np.array(newX_l)
        newy_l = np.array(newy_l)
        if newX_l.shape[0] == 0:
            break
        X_labeled = np.append(X_labeled, newX_l, axis=0)
        y_labeled = np.append(y_labeled, newy_l)
        X_unlabeled = np.delete(X_unlabeled, dele, axis=0)
    return X_labeled, y_labeled
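# Hypothetical usage sketch (the function's name and signature are
# truncated above; `co_training` and its argument order are assumptions):
# two diverse classifiers pseudo-label the unlabeled pool, and samples on
# which their predictions agree are promoted to the labeled set each pass.
#
# clf_1, clf_2 = GaussianNB(), LogisticRegression()
# X_lab, y_lab = co_training(clf_1, clf_2, X_labeled, y_labeled, X_unlabeled)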


# Read data
X, y = make_classification(n_samples=2000, n_features=2, n_redundant=0,
                           n_informative=2, n_clusters_per_class=2)
plt.figure()
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)


# Normalization
X_norm = preprocessing.scale(X)

num_labeled = 25
num_unlabeled = [0, 10, 20, 40, 80, 160, 320, 640, 1280]

err_lda = {}
err_clustering = {}
err_co = {}
log_lda = {}
log_clustering = {}
Example #56
# Imports reconstructed for this truncated snippet; `dat` is assumed to
# alias sklearn.datasets, matching the dat.make_classification call below.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets as dat

NFEAT = 8
NINFO = 5
NRED = 0
NREP = 0
NCLASS = 3
NCLUSTCLASS = 1

nUnrelated = NFEAT - NINFO - NRED - NREP

[X, Y] = dat.make_classification(n_samples=100,
                                 n_features=NFEAT,
                                 n_informative=NINFO,
                                 n_redundant=NRED,
                                 n_repeated=NREP,
                                 n_classes=NCLASS,
                                 n_clusters_per_class=NCLUSTCLASS,
                                 class_sep=5,
                                 shuffle=False)

nUseful = NINFO + NRED + NREP
print("Useful features: first {}".format(NINFO))

sns.set()
binwidth = 1

fig1, sub = plt.subplots(2, NFEAT // 2)
fig1.subplots_adjust(wspace=0.6, hspace=0.6)
colors = ['b', 'r', 'g', 'y', 'c']
classes = []
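# Truncated here; a plausible continuation (an assumption, not the original
# code) histograms each feature by class, showing that only the informative
# features separate the classes:
#
# for f, ax in enumerate(sub.flatten()):
#     for c in range(NCLASS):
#         ax.hist(X[Y == c, f], color=colors[c], alpha=0.5)
#     ax.set_title("feature {}".format(f))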
Example #57
def dataset():
    X, y = make_classification(100, 5, random_state=42)
    X = X.astype(np.float64)
    y = y.astype(np.float64)
    return X, y
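# The function above is presumably a pytest fixture whose @pytest.fixture
# decorator was cut off; a hedged usage sketch:
#
# def test_dataset_shapes(dataset):
#     X, y = dataset
#     assert X.shape == (100, 5)
#     assert y.shape == (100,)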
Example #58
        if alpha >= self.H:
            return self.H
        elif alpha <= self.L:
            return self.L
        else:
            return alpha

    def K(self, x_i, x_j):
        return np.dot(x_i, x_j)


if __name__ == '__main__':

    X, Y = make_classification(n_features=2,
                               n_informative=2,
                               n_redundant=0,
                               n_repeated=0,
                               n_classes=2,
                               n_clusters_per_class=1)
    #X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)
    Y = [pow(-1, num + 1) for num in Y]

    s = SMO(X, Y, 1, 100)
    w = s.w()
    b = s.b

    x1 = max(X, key=lambda x: x[0])
    x2 = min(X, key=lambda x: x[0])
    slope = -w[0] / w[1]
    intercept = -b / w[1]

    print(slope)
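    # Truncated here; a hedged completion sketch plotting the learned
    # separating line over the data (matplotlib import assumed):
    # print(intercept)
    # xs = np.array([x2[0], x1[0]])
    # plt.scatter(X[:, 0], X[:, 1], c=Y)
    # plt.plot(xs, slope * xs + intercept, 'k-')
    # plt.show()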
Example #59
def test_grid_search_cv_results():
    X, y = make_classification(n_samples=50, n_features=4, random_state=42)

    n_splits = 3
    n_grid_points = 6
    params = [
        dict(kernel=["rbf"], C=[1, 10], gamma=[0.1, 1]),
        dict(kernel=["poly"], degree=[1, 2]),
    ]
    grid_search = dcv.GridSearchCV(
        SVC(gamma="auto"),
        cv=n_splits,
        iid=False,
        param_grid=params,
        return_train_score=True,
    )
    grid_search.fit(X, y)
    grid_search_iid = dcv.GridSearchCV(
        SVC(gamma="auto"),
        cv=n_splits,
        iid=True,
        param_grid=params,
        return_train_score=True,
    )
    grid_search_iid.fit(X, y)

    param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel")
    score_keys = (
        "mean_test_score",
        "mean_train_score",
        "rank_test_score",
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "std_test_score",
        "std_train_score",
        "mean_fit_time",
        "std_fit_time",
        "mean_score_time",
        "std_score_time",
    )
    n_candidates = n_grid_points

    for search, iid in zip((grid_search, grid_search_iid), (False, True)):
        assert iid == search.iid
        cv_results = search.cv_results_
        # Check if score and timing are reasonable
        assert all(cv_results["rank_test_score"] >= 1)
        assert all(
            all(cv_results[k] >= 0) for k in score_keys
            if k != "rank_test_score")
        assert all(
            all(cv_results[k] <= 1) for k in score_keys
            if "time" not in k and k != "rank_test_score")
        # Check cv_results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates)
        # Check masking
        cv_results = search.cv_results_
        n_candidates = len(search.cv_results_["params"])
        assert all((
            cv_results["param_C"].mask[i] and cv_results["param_gamma"].mask[i]
            and not cv_results["param_degree"].mask[i])
                   for i in range(n_candidates)
                   if cv_results["param_kernel"][i] == "linear")
        assert all((not cv_results["param_C"].mask[i]
                    and not cv_results["param_gamma"].mask[i]
                    and cv_results["param_degree"].mask[i])
                   for i in range(n_candidates)
                   if cv_results["param_kernel"][i] == "rbf")
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, precision_score, recall_score,
                             f1_score)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split

# Create dataset of classification task with many redundant and few
# informative features
X, y = datasets.make_classification(n_samples=100000,
                                    n_features=20,
                                    n_informative=2,
                                    n_redundant=10,
                                    random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.99,
                                                    random_state=42)


def plot_calibration_curve(est, name, fig_index):
    """Plot calibration curve for est w/o and with calibration. """
    # Calibrated with isotonic calibration
    isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

    # Calibrated with sigmoid calibration
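    # Natural continuation of the truncated example, mirroring the isotonic
    # line above (this matches scikit-learn's calibration demo):
    sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')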