def test_logistic_regressioncv_class_weights():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)

    msg = ("In LogisticRegressionCV the liblinear solver cannot handle "
           "multiclass with class_weight of type dict. Use the lbfgs, "
           "newton-cg or sag solvers or set class_weight='balanced'")
    clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2},
                                   solver='liblinear')
    assert_raise_message(ValueError, msg, clf_lib.fit, X, y)
    y_ = y.copy()
    y_[y == 2] = 1
    clf_lib.fit(X, y_)
    assert_array_equal(clf_lib.classes_, [0, 1])

    # Test for class_weight=balanced
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               random_state=0)
    clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
                                   class_weight='balanced')
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False,
                                   class_weight='balanced')
    clf_lib.fit(X, y)
    clf_sag = LogisticRegressionCV(solver='sag', fit_intercept=False,
                                   class_weight='balanced', max_iter=2000)
    clf_sag.fit(X, y)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
    assert_array_almost_equal(clf_sag.coef_, clf_lbf.coef_, decimal=4)
    assert_array_almost_equal(clf_lib.coef_, clf_sag.coef_, decimal=4)
def test_logistic_regressioncv_class_weights():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)

    # Test that liblinear fails when a class_weight of type dict is
    # provided for a multiclass problem. It can, however, handle
    # binary problems.
    clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2},
                                   solver='liblinear')
    assert_raises(ValueError, clf_lib.fit, X, y)
    y_ = y.copy()
    y_[y == 2] = 1
    clf_lib.fit(X, y_)
    assert_array_equal(clf_lib.classes_, [0, 1])

    # Test for class_weight=auto
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               random_state=0)
    clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False,
                                   class_weight='auto')
    clf_lbf.fit(X, y)
    clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False,
                                   class_weight='auto')
    clf_lib.fit(X, y)
    assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
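# Companion sketch (not from the original test suite): in scikit-learn,
# class_weight='balanced' weights each sample by
# n_samples / (n_classes * np.bincount(y)), so fitting with explicit
# per-sample weights computed that way should recover the same coefficients.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, random_state=0)
balanced = LogisticRegression(class_weight='balanced').fit(X, y)
per_class = y.shape[0] / (2 * np.bincount(y))  # one weight per class
manual = LogisticRegression().fit(X, y, sample_weight=per_class[y])
np.testing.assert_allclose(balanced.coef_, manual.coef_, rtol=1e-3)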
def test_make_classification():
    weights = [0.1, 0.25]
    X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                               n_redundant=1, n_repeated=1, n_classes=3,
                               n_clusters_per_class=1, hypercube=False,
                               shift=None, scale=None, weights=weights,
                               random_state=0)

    assert_equal(weights, [0.1, 0.25])
    assert_equal(X.shape, (100, 20), "X shape mismatch")
    assert_equal(y.shape, (100,), "y shape mismatch")
    assert_equal(np.unique(y).shape, (3,), "Unexpected number of classes")
    assert_equal(sum(y == 0), 10, "Unexpected number of samples in class #0")
    assert_equal(sum(y == 1), 25, "Unexpected number of samples in class #1")
    assert_equal(sum(y == 2), 65, "Unexpected number of samples in class #2")

    # Test for n_features > 30
    X, y = make_classification(n_samples=2000, n_features=31,
                               n_informative=31, n_redundant=0, n_repeated=0,
                               hypercube=True, scale=0.5, random_state=0)
    assert_equal(X.shape, (2000, 31), "X shape mismatch")
    assert_equal(y.shape, (2000,), "y shape mismatch")
    assert_equal(np.unique(X.view([('', X.dtype)] * X.shape[1]))
                 .view(X.dtype).reshape(-1, X.shape[1]).shape[0], 2000,
                 "Unexpected number of unique rows")
def setUp(self):
    np.random.seed(488881)

    # binomial
    x, y = make_classification(n_samples=300, random_state=6601)
    x_sparse = csr_matrix(x)

    x_wide, y_wide = make_classification(n_samples=100, n_features=150,
                                         random_state=8911)
    x_wide_sparse = csr_matrix(x_wide)
    self.binomial = [(x, y), (x_sparse, y), (x_wide, y_wide),
                     (x_wide_sparse, y_wide)]

    # multinomial
    x, y = make_classification(n_samples=400, n_classes=3, n_informative=15,
                               n_features=25, random_state=10585)
    x_sparse = csr_matrix(x)

    x_wide, y_wide = make_classification(n_samples=400, n_classes=3,
                                         n_informative=15, n_features=500,
                                         random_state=15841)
    x_wide_sparse = csr_matrix(x_wide)
    self.multinomial = [(x, y), (x_sparse, y), (x_wide, y_wide),
                        (x_wide_sparse, y_wide)]

    self.alphas = [0., 0.25, 0.50, 0.75, 1.]
    self.n_splits = [-1, 0, 5]
    self.scoring = [
        "accuracy",
        "roc_auc",
        "average_precision",
        "log_loss",
        "precision_macro",
        "precision_micro",
        "precision_weighted",
        "f1_micro",
        "f1_macro",
        "f1_weighted",
    ]
    self.multinomial_scoring = [
        "accuracy",
        "log_loss",
        "precision_macro",
        "precision_micro",
        "precision_weighted",
        "f1_micro",
        "f1_macro",
        "f1_weighted",
    ]
def test_grid_search_precomputed_kernel_error_kernel_function():
    """Test that grid search returns an error when using a kernel_function"""
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 random_state=0)
    kernel_function = lambda x1, x2: np.dot(x1, x2.T)
    clf = SVC(kernel=kernel_function)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    assert_raises(ValueError, cv.fit, X_, y_)
def test_importances():
    """Check variable importances."""
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    for name, Tree in CLF_TREES.items():
        clf = Tree(random_state=0)
        clf.fit(X, y)
        importances = clf.feature_importances_
        n_important = np.sum(importances > 0.1)

        assert_equal(importances.shape[0], 10, "Failed with {0}".format(name))
        assert_equal(n_important, 3, "Failed with {0}".format(name))

        X_new = clf.transform(X, threshold="mean")
        assert_less(0, X_new.shape[1], "Failed with {0}".format(name))
        assert_less(X_new.shape[1], X.shape[1],
                    "Failed with {0}".format(name))

    # Check on iris that importances are the same for all builders
    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(iris.data, iris.target)
    clf2 = DecisionTreeClassifier(random_state=0,
                                  max_leaf_nodes=len(iris.data))
    clf2.fit(iris.data, iris.target)

    assert_array_equal(clf.feature_importances_, clf2.feature_importances_)
def test_deprecated_score_func():
    # test that the old deprecated way of passing a score / loss function is
    # still supported
    X, y = make_classification(n_samples=200, n_features=100, random_state=0)
    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X[:180], y[:180])
    y_pred = cv.predict(X[180:])
    C = cv.best_estimator_.C

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_func = cv.predict(X[180:])
    C_func = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred_func)
    assert_equal(C, C_func)

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    clf = LinearSVC(random_state=0)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, loss_func=f1_loss)
    with warnings.catch_warnings(record=True):
        # catch deprecation warning
        cv.fit(X[:180], y[:180])
    y_pred_loss = cv.predict(X[180:])
    C_loss = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred_loss)
    assert_equal(C, C_loss)
def test_grid_search_precomputed_kernel():
    """Test that grid search works when the input features are given in the
    form of a precomputed kernel matrix
    """
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 random_state=0)

    # compute the training kernel matrix corresponding to the linear kernel
    K_train = np.dot(X_[:180], X_[:180].T)
    y_train = y_[:180]

    clf = SVC(kernel='precomputed')
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(K_train, y_train)

    assert_true(cv.best_score_ >= 0)

    # compute the test kernel matrix
    K_test = np.dot(X_[180:], X_[:180].T)
    y_test = y_[180:]

    y_pred = cv.predict(K_test)

    assert_true(np.mean(y_pred == y_test) >= 0)

    # test error is raised when the precomputed kernel is not array-like
    # or sparse
    assert_raises(ValueError, cv.fit, K_train.tolist(), y_train)
def test_partial_dependence_unknown_feature(estimator, features):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    err_msg = 'all features must be in'
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [features])
def test_grid_search_sparse_scoring():
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1")
    cv.fit(X_[:180], y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert_array_equal(y_pred, y_pred2)
    assert_equal(C, C2)
    # Smoke test the score
    # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]),
    #                            cv.score(X_[:180], y[:180]))

    # test loss where greater is worse
    def f1_loss(y_true_, y_pred_):
        return -f1_score(y_true_, y_pred_)

    F1Loss = make_scorer(f1_loss, greater_is_better=False)
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss)
    cv.fit(X_[:180], y_[:180])
    y_pred3 = cv.predict(X_[180:])
    C3 = cv.best_estimator_.C

    assert_equal(C, C3)
    assert_array_equal(y_pred, y_pred3)
def test_logreg_l1_sparse_data():
    # Because liblinear penalizes the intercept and saga does not, we do not
    # fit the intercept to make it possible to compare the coefficients of
    # the two models at convergence.
    rng = np.random.RandomState(42)
    n_samples = 50
    X, y = make_classification(n_samples=n_samples, n_features=20,
                               random_state=0)
    X_noise = rng.normal(scale=0.1, size=(n_samples, 3))
    X_constant = np.zeros(shape=(n_samples, 2))
    X = np.concatenate((X, X_noise, X_constant), axis=1)
    X[X < 1] = 0
    X = sparse.csr_matrix(X)

    lr_liblinear = LogisticRegression(penalty="l1", C=1.0,
                                      solver='liblinear',
                                      fit_intercept=False, tol=1e-10)
    lr_liblinear.fit(X, y)

    lr_saga = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                 fit_intercept=False, max_iter=1000,
                                 tol=1e-10)
    lr_saga.fit(X, y)
    assert_array_almost_equal(lr_saga.coef_, lr_liblinear.coef_)

    # Noise and constant features should be regularized to zero by the l1
    # penalty
    assert_array_almost_equal(lr_liblinear.coef_[0, -5:], np.zeros(5))
    assert_array_almost_equal(lr_saga.coef_[0, -5:], np.zeros(5))

    # Check that solving on the sparse and dense data yield the same results
    lr_saga_dense = LogisticRegression(penalty="l1", C=1.0, solver='saga',
                                       fit_intercept=False, max_iter=1000,
                                       tol=1e-10)
    lr_saga_dense.fit(X.toarray(), y)
    assert_array_almost_equal(lr_saga.coef_, lr_saga_dense.coef_)
def test_linearsvc_parameters():
    # Test possible parameter combinations in LinearSVC
    # Generate the list of possible parameter combinations
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties, duals = ['l1', 'l2', 'bar'], [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if ((loss, penalty) == ('hinge', 'l1') or
                (loss, penalty, dual) == ('hinge', 'l2', False) or
                (penalty, dual) == ('l1', True) or
                loss == 'foo' or penalty == 'bar'):
            assert_raises_regexp(ValueError,
                                 "Unsupported set of arguments.*penalty='%s.*"
                                 "loss='%s.*dual=%s" % (penalty, loss, dual),
                                 clf.fit, X, y)
        else:
            clf.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    assert_raises_regexp(ValueError, ".*loss='l3' is not supported.*",
                         svm.LinearSVC(loss="l3").fit, X, y)
def test_logistic_regression_solvers():
    X, y = make_classification(n_features=10, n_informative=5,
                               random_state=0)

    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False)
    lib = LogisticRegression(fit_intercept=False)
    sag = LogisticRegression(solver='sag', fit_intercept=False,
                             random_state=42)
    saga = LogisticRegression(solver='saga', fit_intercept=False,
                              random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    saga.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, sag.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, lbf.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, ncg.coef_, decimal=3)
    assert_array_almost_equal(saga.coef_, lib.coef_, decimal=3)
def test_logistic_regression_solvers_multiclass():
    X, y = make_classification(n_samples=20, n_features=20, n_informative=10,
                               n_classes=3, random_state=0)
    tol = 1e-7
    ncg = LogisticRegression(solver='newton-cg', fit_intercept=False,
                             tol=tol)
    lbf = LogisticRegression(solver='lbfgs', fit_intercept=False, tol=tol)
    lib = LogisticRegression(fit_intercept=False, tol=tol)
    sag = LogisticRegression(solver='sag', fit_intercept=False, tol=tol,
                             max_iter=1000, random_state=42)
    saga = LogisticRegression(solver='saga', fit_intercept=False, tol=tol,
                              max_iter=10000, random_state=42)
    ncg.fit(X, y)
    lbf.fit(X, y)
    sag.fit(X, y)
    saga.fit(X, y)
    lib.fit(X, y)
    assert_array_almost_equal(ncg.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(lib.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(ncg.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lib.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(sag.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, sag.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, lbf.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, ncg.coef_, decimal=4)
    assert_array_almost_equal(saga.coef_, lib.coef_, decimal=4)
def test_logistic_loss_and_grad():
    X_ref, y = make_classification(n_samples=20, random_state=0)
    n_features = X_ref.shape[1]

    X_sp = X_ref.copy()
    X_sp[X_sp < .1] = 0
    X_sp = sp.csr_matrix(X_sp)
    for X in (X_ref, X_sp):
        w = np.zeros(n_features)

        # First check that our derivation of the grad is correct
        loss, grad = _logistic_loss_and_grad(w, X, y, alpha=1.)
        approx_grad = optimize.approx_fprime(
            w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3
        )
        assert_array_almost_equal(grad, approx_grad, decimal=2)

        # Second check that our intercept implementation is good
        w = np.zeros(n_features + 1)
        loss_interp, grad_interp = _logistic_loss_and_grad(
            w, X, y, alpha=1.
        )
        assert_array_almost_equal(loss, loss_interp)

        approx_grad = optimize.approx_fprime(
            w, lambda w: _logistic_loss_and_grad(w, X, y, alpha=1.)[0], 1e-3
        )
        assert_array_almost_equal(grad_interp, approx_grad, decimal=2)
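# Companion sketch (not from the original test suite): the same
# finite-difference check can be done with scipy.optimize.check_grad,
# which returns the 2-norm of the difference between an analytic gradient
# and its numerical approximation. The toy loss below is an assumption
# chosen only to illustrate the technique.
import numpy as np
from scipy import optimize


def toy_loss(w):
    return 0.5 * np.dot(w, w) + np.sin(w).sum()


def toy_grad(w):
    return w + np.cos(w)


w0 = np.random.RandomState(0).randn(5)
err = optimize.check_grad(toy_loss, toy_grad, w0)
assert err < 1e-5  # analytic gradient agrees with the numerical one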
def test_intercept_logistic_helper():
    n_samples, n_features = 10, 5
    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               random_state=0)

    # Fit intercept case.
    alpha = 1.
    w = np.ones(n_features + 1)
    grad_interp, hess_interp = _logistic_grad_hess(w, X, y, alpha)
    loss_interp = _logistic_loss(w, X, y, alpha)

    # Do not fit intercept. This can be considered equivalent to adding
    # a feature vector of ones, i.e. a column of ones.
    X_ = np.hstack((X, np.ones(10)[:, np.newaxis]))
    grad, hess = _logistic_grad_hess(w, X_, y, alpha)
    loss = _logistic_loss(w, X_, y, alpha)

    # In the fit_intercept=False case, the feature vector of ones is
    # penalized. This should be taken care of.
    assert_almost_equal(loss_interp + 0.5 * (w[-1] ** 2), loss)

    # Check gradient.
    assert_array_almost_equal(grad_interp[:n_features], grad[:n_features])
    assert_almost_equal(grad_interp[-1] + alpha * w[-1], grad[-1])

    rng = np.random.RandomState(0)
    grad = rng.rand(n_features + 1)
    hess_interp = hess_interp(grad)
    hess = hess(grad)
    assert_array_almost_equal(hess_interp[:n_features], hess[:n_features])
    assert_almost_equal(hess_interp[-1] + alpha * grad[-1], hess[-1])
def main():
    (X, y) = skd.make_classification()
    N = X.shape[0]
    X = np.append(X, np.ones((N, 1)), axis=1)
    y = 2 * y - 1
    skf = StratifiedKFold(y, 5)
    for train, test in skf:
        X_train = X[train, :]
        y_train = y[train]
        X_test = X[test, :]
        y_test = y[test]
        C = 0.01

        # dual co-ordinate descent SVM
        clf = SVMCD(C)
        clf.fit(X_train, y_train, w_prior=np.ones(21))
        pred = clf.decision_function(X_test)
        score = clf.score(X_test, y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print score, metrics.auc(fpr, tpr), "//",
        w1 = clf.w

        # standard svm
        clf = SVC(C=C, kernel='linear')
        clf.fit(X_train, y_train)
        pred = clf.decision_function(X_test)
        score = clf.score(X_test, y_test)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, pred)
        print score, metrics.auc(fpr, tpr)
        w2 = clf.coef_
        w2.shape = (21,)
def test_logistic_regression_convergence_warnings():
    """Test that warnings are raised if the model does not converge"""
    X, y = make_classification(n_samples=20, n_features=20)
    clf_lib = LogisticRegression(solver='liblinear', max_iter=2, verbose=1)
    assert_warns(ConvergenceWarning, clf_lib.fit, X, y)
    assert_equal(clf_lib.n_iter_, 2)
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increases monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def test_liblinear_random_state():
    X, y = make_classification(n_samples=20)
    lr1 = LogisticRegression(random_state=0)
    lr1.fit(X, y)
    lr2 = LogisticRegression(random_state=0)
    lr2.fit(X, y)
    assert_array_almost_equal(lr1.coef_, lr2.coef_)
def test_grid_search_failing_classifier():
    # GridSearchCV with error_score != 'raise'
    # Ensures that a warning is raised and the score reset where appropriate.

    X, y = make_classification(n_samples=20, n_features=10, random_state=0)

    clf = FailingClassifier()

    # refit=False because we only want to check that errors caused by fits
    # to individual folds will be caught and warnings raised instead. If
    # refit was done, then an exception would be raised on refit and not
    # caught by grid_search (expected behavior), and this would cause an
    # error in this test.
    gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
                      refit=False, error_score=0.0)

    assert_warns(FitFailedWarning, gs.fit, X, y)

    # Ensure that grid scores were set to zero as required for those fits
    # that are expected to fail.
    assert all(np.all(this_point.cv_validation_scores == 0.0)
               for this_point in gs.grid_scores_
               if this_point.parameters['parameter'] ==
               FailingClassifier.FAILING_PARAMETER)

    gs = GridSearchCV(clf, [{'parameter': [0, 1, 2]}], scoring='accuracy',
                      refit=False, error_score=float('nan'))
    assert_warns(FitFailedWarning, gs.fit, X, y)
    assert all(np.all(np.isnan(this_point.cv_validation_scores))
               for this_point in gs.grid_scores_
               if this_point.parameters['parameter'] ==
               FailingClassifier.FAILING_PARAMETER)
def test_class_weight_auto_classifiers():
    """Test that class_weight="auto" improves f1-score"""

    # This test is broken; its success depends on:
    # * a rare fortuitous RNG seed for make_classification; and
    # * the use of binary F1 over a seemingly arbitrary positive class for
    #   two datasets, and weighted average F1 for the third.
    # Its expectations need to be clarified and reimplemented.
    raise SkipTest("This test requires redefinition")

    classifiers = all_estimators(type_filter="classifier")

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if "class_weight" in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[0.8, 0.2], [0.8, 0.1, 0.1]]):
        # create unbalanced dataset
        X, y = make_classification(n_classes=n_classes, n_samples=200,
                                   n_features=10, weights=weights,
                                   random_state=0, n_informative=n_classes)
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.5,
                                                            random_state=0)
        for name, Classifier in classifiers:
            if (name != "NuSVC"
                    # the sparse version has a parameter that doesn't do
                    # anything
                    and not name.startswith("RidgeClassifier")
                    # RidgeClassifier behaves unexpectedly
                    # FIXME!
                    and not name.endswith("NB")):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                yield (check_class_weight_auto_classifiers, name, Classifier,
                       X_train, y_train, X_test, y_test, weights)
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbr = GradientBoostingRegressor(n_estimators=1000,
                                    n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if early_stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24),
                                              (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36),
                                              (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
def test_cvbestsearchrefit_select_k_best(self):
    list_C_value = range(2, 10, 1)
    # print repr(list_C_value)
    for C_value in list_C_value:
        # C_value = 2
        # print C_value
        X, y = datasets.make_classification(n_samples=100,
                                            n_features=500,
                                            n_informative=5)
        n_folds_nested = 2
        # random_state = 0
        k_values = [2, 3, 4, 5, 6]
        key_y_pred = "y" + conf.SEP + conf.PREDICTION

        # With EPAC
        methods = Methods(*[Pipe(SelectKBest(k=k),
                                 SVC(C=C_value, kernel="linear"))
                            for k in k_values])
        wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
        wf.run(X=X, y=y)
        r_epac = wf.reduce().values()[0]

        # - Without EPAC
        from sklearn.pipeline import Pipeline
        r_sklearn = dict()
        clf = Pipeline([("anova", SelectKBest(k=3)),
                        ("svm", SVC(C=C_value, kernel="linear"))])
        parameters = {"anova__k": k_values}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        r_sklearn[conf.BEST_PARAMS]["k"] = \
            r_sklearn[conf.BEST_PARAMS]["anova__k"]

        # - Comparisons
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: prediction")
        for key_param in r_epac[conf.BEST_PARAMS][0]:
            if key_param in r_sklearn[conf.BEST_PARAMS]:
                comp = r_sklearn[conf.BEST_PARAMS][key_param] == \
                    r_epac[conf.BEST_PARAMS][0][key_param]
                self.assertTrue(
                    comp,
                    u"Diff CVBestSearchRefitParallel: best parameters")
def test_cv(self):
    X, y = datasets.make_classification(n_samples=20,
                                        n_features=5,
                                        n_informative=2)
    n_folds = 2

    # = With EPAC
    wf = CV(SVC(kernel="linear"), n_folds=n_folds,
            reducer=ClassificationReport(keep=True))
    r_epac = wf.top_down(X=X, y=y)

    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
        # idx_train, idx_test = cv.__iter__().next()
        X_train = X[idx_train, :]
        X_test = X[idx_test, :]
        y_train = y[idx_train]
        clf.fit(X_train, y_train)
        r_sklearn.append(clf.predict(X_test))

    # = Comparison
    key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
    for icv in range(n_folds):
        comp = np.all(np.asarray(r_epac[0][key2cmp]) ==
                      np.asarray(r_sklearn[0]))
        self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")

    # test reduce
    r_epac_reduce = wf.reduce().values()[0][key2cmp]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u"Diff CV: EPAC reduce")
def test_perm2(self):
    from epac.tests.wfexamples2test import WFExample2

    X, y = datasets.make_classification(n_samples=20,
                                        n_features=5,
                                        n_informative=2)
    wf = WFExample2().get_workflow()
    wf.run(X=X, y=y)
    wf.reduce()
def test_cvbestsearchrefit(self):
    X, y = datasets.make_classification(n_samples=12,
                                        n_features=10,
                                        n_informative=2)
    n_folds_nested = 2
    # random_state = 0
    C_values = [0.1, 0.5, 1, 2, 5]
    kernels = ["linear", "rbf"]
    key_y_pred = "y" + conf.SEP + conf.PREDICTION

    # With EPAC
    methods = Methods(*[SVC(C=C, kernel=kernel)
                        for C in C_values for kernel in kernels])
    wf = CVBestSearchRefitParallel(methods, n_folds=n_folds_nested)
    wf.run(X=X, y=y)
    r_epac = wf.reduce().values()[0]

    # - Without EPAC
    r_sklearn = dict()
    clf = SVC(kernel="linear")
    parameters = {"C": C_values, "kernel": kernels}
    cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
    gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
    gscv.fit(X, y)
    r_sklearn[key_y_pred] = gscv.predict(X)
    r_sklearn[conf.BEST_PARAMS] = gscv.best_params_

    # - Comparisons
    comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
    self.assertTrue(comp, u"Diff CVBestSearchRefitParallel: prediction")
    for key_param in r_epac[conf.BEST_PARAMS][0]:
        if key_param in r_sklearn[conf.BEST_PARAMS]:
            comp = r_sklearn[conf.BEST_PARAMS][key_param] == \
                r_epac[conf.BEST_PARAMS][0][key_param]
            self.assertTrue(
                comp, u"Diff CVBestSearchRefitParallel: best parameters")
def test_perm(self):
    X, y = datasets.make_classification(n_samples=20,
                                        n_features=5,
                                        n_informative=2)
    n_perms = 2
    rnd = 0

    # = With EPAC
    wf = Perms(SVC(kernel="linear"), n_perms=n_perms, permute="y",
               random_state=rnd, reducer=None)
    r_epac = wf.top_down(X=X, y=y)

    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for perm in Permutations(n=y.shape[0], n_perms=n_perms,
                             random_state=rnd):
        y_p = y[perm]
        clf.fit(X, y_p)
        r_sklearn.append(clf.predict(X))

    key2cmp = "y" + conf.SEP + conf.PREDICTION

    # = Comparison
    for iperm in range(n_perms):
        comp = np.all(np.asarray(r_epac[iperm][key2cmp]) ==
                      np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC vs sklearn")

    # test reduce
    for iperm in range(n_perms):
        r_epac_reduce = wf.reduce().values()[iperm][key2cmp]
        comp = np.all(np.asarray(r_epac_reduce) ==
                      np.asarray(r_sklearn[iperm]))
        self.assertTrue(comp, u"Diff Perm: EPAC reduce")
def test_engine_info(self):
    n_samples = 20
    n_features = 100
    n_proc = 2
    X, y = datasets.make_classification(n_samples=n_samples,
                                        n_features=n_features,
                                        n_informative=2,
                                        random_state=1)
    Xy = dict(X=X, y=y)
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    swf_engine = SomaWorkflowEngine(cv_svm_local,
                                    num_processes=n_proc,
                                    resource_id="jl237561@gabriel",
                                    login="******",
                                    remove_finished_wf=False,
                                    remove_local_tree=False,
                                    queue="Global_long")
    swf_engine.run(**Xy)
    print "engine_info ================"
    for job_info in swf_engine.engine_info:
        print "  job_info================="
        print "  mem_cost= ", job_info.mem_cost
        print "  vmem_cost= ", job_info.vmem_cost
        print "  time_cost= ", job_info.time_cost
        self.assertTrue(job_info.time_cost > 0)
def test_grid_search_score_method():
    X, y = make_classification(n_samples=100, n_classes=2, flip_y=.2,
                               random_state=0)
    clf = LinearSVC(random_state=0)
    grid = {'C': [.1]}

    search_no_scoring = GridSearchCV(clf, grid, scoring=None).fit(X, y)
    search_accuracy = GridSearchCV(clf, grid, scoring='accuracy').fit(X, y)
    search_no_score_method_auc = GridSearchCV(LinearSVCNoScore(), grid,
                                              scoring='roc_auc').fit(X, y)
    search_auc = GridSearchCV(clf, grid, scoring='roc_auc').fit(X, y)

    # ChangedBehaviourWarning occurred previously (prior to #9005)
    score_no_scoring = assert_no_warnings(search_no_scoring.score, X, y)
    score_accuracy = assert_no_warnings(search_accuracy.score, X, y)
    score_no_score_auc = assert_no_warnings(
        search_no_score_method_auc.score, X, y)
    score_auc = assert_no_warnings(search_auc.score, X, y)

    # ensure the test is sane
    assert_true(score_auc < 1.0)
    assert_true(score_accuracy < 1.0)
    assert_not_equal(score_auc, score_accuracy)

    assert_almost_equal(score_accuracy, score_no_scoring)
    assert_almost_equal(score_auc, score_no_score_auc)
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable
]

figure = plt.figure(figsize=(27, 9))
i = 1
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
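    # (Assumed loop body sketch; the fragment above mirrors the scikit-learn
    # classifier-comparison example, and StandardScaler / train_test_split
    # are assumed to be imported elsewhere in the script.) Preprocess the
    # dataset and split it into training and test parts.
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.4, random_state=42)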
def test_classif_ten_classes(self, NeuralNet):
    X, y = make_classification(n_classes=10, n_informative=10)
    X = X.astype(floatX)
    y = y.astype(np.int32)
    self.classif(NeuralNet, X, y)
# Authors: Christos Aridas
#          Guillaume Lemaitre <*****@*****.**>
# License: MIT

import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

from imblearn.over_sampling import ADASYN

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply the ADASYN over-sampling
ada = ADASYN()
X_resampled, y_resampled = ada.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
from sklearn.datasets import make_classification

features, target = make_classification(n_samples=100,
                                       n_features=3,
                                       n_informative=3,
                                       n_redundant=0,
                                       n_classes=2,
                                       weights=[.25, .75],
                                       random_state=1)
print('\n Features Matrix: ', features[:3])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=16, n_features=2, n_informative=2,
                           n_redundant=0, random_state=0)
model = LogisticRegression().fit(X, y)
y_hat = model.predict(X)
f_value = model.decision_function(X)

df = pd.DataFrame(np.vstack([f_value, y_hat, y]).T,
                  columns=["f", "y_hat", "y"])
df.sort_values("f", ascending=False).reset_index(drop=True)

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y, model.decision_function(X))
fpr, tpr, thresholds

fpr, tpr, thresholds = roc_curve(y, model.predict_proba(X)[:, 1])
# Map each feature name to its importance
for name, value in zip(iris_data.feature_names, dt_clf.feature_importances_):
    print('{0} : {1:.3f}'.format(name, value))

# Visualize the feature importances as a bar plot, one bar per column
sns.barplot(x=dt_clf.feature_importances_, y=iris_data.feature_names)

from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
%matplotlib inline

plt.title("3 Class values with 2 Features Sample data creation")

# Generate classification sample data with 2 features (for 2-D
# visualization) and 3 target classes.
X_features, y_labels = make_classification(n_features=2, n_redundant=0,
                                           n_informative=2, n_classes=3,
                                           n_clusters_per_class=1,
                                           random_state=0)

# Scatter-plot the two features as 2-D coordinates; each class value is
# drawn in a different color.
plt.scatter(X_features[:, 0], X_features[:, 1], marker='o', c=y_labels,
            s=25, cmap='rainbow', edgecolor='k')

import numpy as np


# Function that visualizes a classifier's decision boundary
def visualize_boundary(model, X, y):
    fig, ax = plt.subplots()

    # Show the training data as a scatter plot
    ax.scatter(X[:, 0], X[:, 1], c=y, s=25, cmap='rainbow', edgecolor='k',
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
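    # (Assumed completion; the original snippet is cut off at this point.)
    # The standard way such a helper finishes: evaluate the classifier on a
    # dense mesh over the plot limits and shade the predicted class regions
    # with contourf.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 200),
                         np.linspace(ylim[0], ylim[1], 200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='rainbow', zorder=1)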
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

# https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#example-ensemble-plot-feature-transformation-py

n_estimator = 10
X, y = make_classification(n_samples=80000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

# It is important to train the ensemble of trees on a different subset
# of the training data than the linear regression model to avoid
# overfitting, in particular if the total number of leaves is
# similar to the number of training samples
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                            y_train,
                                                            test_size=0.5)

# Unsupervised transformation based on totally random trees
rt = RandomTreesEmbedding(max_depth=3, n_estimators=n_estimator,
                          random_state=0)
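# (Assumed continuation, following the scikit-learn example linked above:
# pipe the random-trees embedding into a logistic regression and compute
# its ROC curve on the held-out set.)
rt_lm = LogisticRegression()
pipeline = make_pipeline(rt, rt_lm)
pipeline.fit(X_train, y_train)
y_pred_rt = pipeline.predict_proba(X_test)[:, 1]
fpr_rt_lm, tpr_rt_lm, _ = roc_curve(y_test, y_pred_rt)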
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from matplotlib import cm
from sklearn.utils.fixes import logsumexp

print(__doc__)

n_neighbors = 1
random_state = 0

# Create a tiny data set of 9 samples from 3 classes
X, y = make_classification(n_samples=9, n_features=2, n_informative=2,
                           n_redundant=0, n_classes=3,
                           n_clusters_per_class=1, class_sep=1.0,
                           random_state=random_state)

# Plot the points in the original space
plt.figure()
ax = plt.gca()

# Draw the graph nodes
for i in range(X.shape[0]):
    ax.text(X[i, 0], X[i, 1], str(i), va='center', ha='center')
    ax.scatter(X[i, 0], X[i, 1], s=300, c=cm.Set1(y[[i]]), alpha=0.4)


def p_i(X, i):
def test_random_search_cv_results(params):
    # Make a dataset with a lot of noise to get various kinds of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100,
                               n_informative=3, random_state=0)

    # scipy.stats dists now support `seed`, but we still support scipy 0.12
    # which doesn't support the seed. Hence the assertions in the test for
    # random_search alone should not depend on randomization.
    n_splits = 3
    n_search_iter = 30
    random_search = dcv.RandomizedSearchCV(
        SVC(),
        n_iter=n_search_iter,
        cv=n_splits,
        iid=False,
        param_distributions=params,
        return_train_score=True,
    )
    random_search.fit(X, y)
    random_search_iid = dcv.RandomizedSearchCV(
        SVC(),
        n_iter=n_search_iter,
        cv=n_splits,
        iid=True,
        param_distributions=params,
        return_train_score=True,
    )
    random_search_iid.fit(X, y)

    param_keys = ("param_C", "param_gamma")
    score_keys = (
        "mean_test_score",
        "mean_train_score",
        "rank_test_score",
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "std_test_score",
        "std_train_score",
        "mean_fit_time",
        "std_fit_time",
        "mean_score_time",
        "std_score_time",
    )
    n_cand = n_search_iter

    for search, iid in zip((random_search, random_search_iid),
                           (False, True)):
        assert iid == search.iid
        cv_results = search.cv_results_
        # Check results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys, n_cand)
        # For random_search, all the param array vals should be unmasked
        assert not (any(cv_results["param_C"].mask) or
                    any(cv_results["param_gamma"].mask))
def test_classif_two_classes(self, NeuralNet):
    X, y = make_classification()
    X = X.astype(floatX)
    y = y.astype(np.int32)
    self.classif(NeuralNet, X, y)
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2,
                       sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(brier_score_loss(y_test, prob_pos_clf),
                               brier_score_loss((y_test + 1) % 2,
                                                prob_pos_pc_clf_relabeled))

    # check that calibration can also deal with regressors that have
    # a decision_function
    clf_base_regressor = CalibratedClassifierCV(Ridge())
    clf_base_regressor.fit(X_train, y_train)
    clf_base_regressor.predict(X_test)

    # Check failure cases:
    # only "isotonic" and "sigmoid" should be accepted as methods
    clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
    assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

    # base-estimators should provide either decision_function or
    # predict_proba (most regressors, for instance, should fail)
    clf_base_regressor = \
        CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
    assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
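# A minimal usage sketch of the calibration API exercised above (standard
# scikit-learn calls only): wrap a decision_function-based classifier in
# CalibratedClassifierCV to obtain probability estimates.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, random_state=42)
calibrated = CalibratedClassifierCV(LinearSVC(random_state=42),
                                    method='sigmoid', cv=3)
calibrated.fit(X, y)
proba = calibrated.predict_proba(X)  # each row sums to 1
assert proba.shape == (200, 2)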
import numpy as np
import torch
from sklearn import datasets


def cls_data():
    X, Y = datasets.make_classification(1000, 10, n_classes=2,
                                        random_state=11)
    X = X.astype(np.float64)
    Y = Y.astype(np.float64).reshape(-1, 1)
    Y[Y == 0] = -1  # map {0, 1} labels to {-1, +1}
    return torch.from_numpy(X), torch.from_numpy(Y)
nb_samples = 100
nb_unlabeled = 75
tolerance = 0.01


def rbf(x1, x2, gamma=10.0):
    n = np.linalg.norm(x1 - x2, ord=1)
    return np.exp(-gamma * np.power(n, 2))


if __name__ == '__main__':
    # Create the dataset
    X, Y = make_classification(n_samples=nb_samples, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=1000)
    Y[Y == 0] = -1
    Y[nb_samples - nb_unlabeled:nb_samples] = 0

    # Show the original dataset
    sns.set()
    fig, ax = plt.subplots(figsize=(12, 9))

    ax.scatter(X[Y == -1, 0], X[Y == -1, 1], color='r', marker='s', s=150,
               label="Class 0")
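    # (Assumed continuation, not from the original source): build the dense
    # RBF affinity matrix W over all samples with the rbf() defined above --
    # the usual starting point for graph-based label propagation.
    W = np.array([[rbf(X[i], X[j]) for j in range(nb_samples)]
                  for i in range(nb_samples)])
    assert W.shape == (nb_samples, nb_samples)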
import numpy as np
from sklearn.datasets import make_classification
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from matplotlib.colors import ListedColormap
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report

N = 200

# Generate training data
dat = make_classification(n_samples=N, n_features=2, n_informative=2,
                          n_redundant=0, n_classes=2,
                          n_clusters_per_class=1, weights=[0.7, 0.3],
                          flip_y=0.01, shuffle=True, random_state=200)
X = dat[0]
y = dat[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

ppn = Perceptron(n_iter=1, eta0=0.1)
ppn.fit(X_train, y_train)
y_pred = ppn.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

target_names = ['0', '1']
print(classification_report(y_test, y_pred, target_names=target_names))


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('r', 'b', 'g', 'y', 'm')
import pandas as pd
import numpy as np
# import sklearn
# print('sklearn: %s' % sklearn.__version__)
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# create and configure model
X, y = make_classification(n_samples=16, n_features=2, n_informative=2,
                           n_redundant=0, random_state=0)
# model = LogisticRegression().fit(X, y)
model = LogisticRegression(solver='liblinear').fit(X, y)
# model = LogisticRegression(solver='lbfgs').fit(X, y)
y_hat = model.predict(X)
f_value = model.decision_function(X)

df = pd.DataFrame(np.vstack([f_value, y_hat, y]).T,
                  columns=["f", "y_hat", "y"])
df.sort_values("f", ascending=False).reset_index(drop=True)
print(df)
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification, make_moons
from sklearn.datasets import make_gaussian_quantiles

data = make_classification(n_samples=3000, n_features=200, n_informative=40,
                           n_redundant=10, n_repeated=5, class_sep=1.0,
                           shuffle=False, flip_y=.02, random_state=147)
feat_gt = np.asarray([*([True] * 55), *([False] * 145)])
pd.to_pickle((data[0], data[1], feat_gt), 'hc_bl.pkl')

# MOON
moon_data = make_moons(n_samples=4000)
feat_gt = np.asarray([*([True] * 2), *([False] * 0)])
# pickled tuple: (X, y, feat_ground_truth)
pd.to_pickle((moon_data[0], moon_data[1], feat_gt), 'hcm_1.pkl')

moon_data = make_moons(n_samples=3000, noise=0.32)
moon_data = (np.hstack(
    (moon_data[0],
     moon_data[0] * [np.random.rand(), np.random.randint(1e9)],
     np.vstack(moon_data[0][:, 0] * np.random.rand()))),
    moon_data[1])
fm = 2 * (precision * recall / (precision + recall))
print(fm)


def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()


data_X, class_label = make_classification(n_samples=1000, n_classes=2,
                                          weights=[1, 1], random_state=1)
trainX, testX, trainy, testy = train_test_split(data_X, class_label,
                                                test_size=0.3,
                                                random_state=1)
model = RandomForestClassifier()
model.fit(trainX, trainy)
probs = model.predict_proba(testX)
probs = probs[:, 1]
auc = roc_auc_score(testy, probs)
print('AUC: %.2f' % auc)
fpr, tpr, thresholds = roc_curve(testy, probs)
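# (Assumed next step: draw the curve with the helper defined above.)
plot_roc_curve(fpr, tpr)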
# for ind, i in enumerate(tree_params['criterion']):
#     plt.plot(tree_params['max_depth'], scores[ind], label=str(i))
# plt.legend(loc='best')
# plt.xlabel('max_depth')
# plt.ylabel('Accuracy')
# plt.show()

# clf = DecisionTree(max_depth=9, criterion='entropy')
# clf.fit(X_train, y_train)
# probs = clf.predict_proba(X_test)
# mean_probs = np.mean(probs, axis=0)
# print(mean_probs)

X, y = make_classification(n_features=2, n_redundant=0, n_samples=400,
                           random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE)
clf = DecisionTree(max_depth=4, criterion='gini')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
prob_pred = clf.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
if sum(np.argmax(prob_pred, axis=1) - y_pred) == 0:
    print('predict_proba works!')
    assert_allclose(pca_full.explained_variance_,
                    pca_other.explained_variance_,
                    rtol=5e-2)
    assert_allclose(
        pca_full.explained_variance_ratio_,
        pca_other.explained_variance_ratio_,
        rtol=5e-2,
    )


@pytest.mark.parametrize(
    "X",
    [
        np.random.RandomState(0).randn(100, 80),
        datasets.make_classification(100, 80, n_informative=78,
                                     random_state=0)[0],
    ],
    ids=["random-data", "correlated-data"],
)
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_explained_variance_empirical(X, svd_solver):
    pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
    X_pca = pca.fit_transform(X)
    assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0))

    expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
    expected_result = sorted(expected_result, reverse=True)[:2]
    assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)


@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 12 23:31:18 2020

@author: Jie.Hu
"""

# mean shift clustering
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.cluster import MeanShift

# define dataset
X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1,
                           random_state=4)
# define the model
model = MeanShift()
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = np.unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = np.where(yhat == cluster)
    # create scatter of these samples
    plt.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
plt.show()
A recursive feature elimination example with automatic tuning of the
number of features selected with cross-validation.
"""
print(__doc__)

import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.
svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2),
              scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
def test_metrics():
    X, y = make_classification(n_samples=500, n_features=5, n_informative=5,
                               n_redundant=0, n_repeated=0, n_classes=2)
    X = StandardScaler().fit(X).transform(X)
    model = SKLogisticRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, 1]
    y_true = y
    print('Confusion Matrix:',
          confusion_matrix(y_true, y_pred) ==
          metrics.confusion_matrix(y_true, y_pred),
          sep='\n')
    print('Delta Accuracy:',
          accuracy(y_true, y_pred) - metrics.accuracy_score(y_true, y_pred))
    print('Delta Micro Recall:',
          recall(y_true, y_pred, kind='micro') -
          metrics.recall_score(y_true, y_pred, average='micro'))
    print('Delta Micro Precision:',
          precision(y_true, y_pred, kind='micro') -
          metrics.precision_score(y_true, y_pred, average='micro'))
    print('Delta Micro F1-score:',
          f1_score(y_true, y_pred, kind='micro') -
          metrics.f1_score(y_true, y_pred, average='micro'))
    print('Delta Macro Recall:',
          recall(y_true, y_pred, kind='macro') -
          metrics.recall_score(y_true, y_pred, average='macro'))
    print('Delta Macro Precision:',
          precision(y_true, y_pred, kind='macro') -
          metrics.precision_score(y_true, y_pred, average='macro'))
    print('Delta Macro F1-score:',
          f1_score(y_true, y_pred, kind='macro') -
          metrics.f1_score(y_true, y_pred, average='macro'))
    print('Delta All Recall:',
          recall(y_true, y_pred, kind='all') -
          metrics.recall_score(y_true, y_pred, average=None))
    print('Delta All Precision:',
          precision(y_true, y_pred, kind='all') -
          metrics.precision_score(y_true, y_pred, average=None))
    print('Delta All F1-score:',
          f1_score(y_true, y_pred, kind='all') -
          metrics.f1_score(y_true, y_pred, average=None))
    print('Delta log loss:',
          log_loss_score(y_true, y_proba) -
          metrics.log_loss(y_true, y_proba))
    print('Delta zero one loss:',
          zero_one_loss(y_true, y_pred) -
          metrics.zero_one_loss(y_true, y_pred))
    print('*' * 80)

    X, y = make_regression(n_samples=500, n_features=5, n_informative=5,
                           n_targets=1)
    model = SKLinearRegression()
    model.fit(X, y)
    y_pred = model.predict(X)
    y_true = y
    print('Delta mean_absolute_error:',
          mean_absolute_error(y_true, y_pred) -
          metrics.mean_absolute_error(y_true, y_pred))
    print('Delta mean_squared_error:',
          mean_squared_error(y_true, y_pred) -
          metrics.mean_squared_error(y_true, y_pred))
    print('Delta r2_score:',
          r2_score(y_true, y_pred) - metrics.r2_score(y_true, y_pred))
def test_logistic_regression_sample_weights():
    X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
                               n_classes=2, random_state=0)
    sample_weight = y + 1

    for LR in [LogisticRegression, LogisticRegressionCV]:

        # Test that passing sample_weight as ones is the same as
        # not passing them at all (default None)
        for solver in ['lbfgs', 'liblinear']:
            clf_sw_none = LR(solver=solver, fit_intercept=False,
                             random_state=42)
            clf_sw_none.fit(X, y)
            clf_sw_ones = LR(solver=solver, fit_intercept=False,
                             random_state=42)
            clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
            assert_array_almost_equal(
                clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)

        # Test that sample weights work the same with the lbfgs,
        # newton-cg, and 'sag' solvers
        clf_sw_lbfgs = LR(solver='lbfgs', fit_intercept=False,
                          random_state=42)
        clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight)
        clf_sw_n = LR(solver='newton-cg', fit_intercept=False,
                      random_state=42)
        clf_sw_n.fit(X, y, sample_weight=sample_weight)
        clf_sw_sag = LR(solver='sag', fit_intercept=False, tol=1e-10,
                        random_state=42)
        # ignore convergence warning due to small dataset
        with ignore_warnings():
            clf_sw_sag.fit(X, y, sample_weight=sample_weight)
        clf_sw_liblinear = LR(solver='liblinear', fit_intercept=False,
                              random_state=42)
        clf_sw_liblinear.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_liblinear.coef_, decimal=4)

        # Test that passing class_weight as [1,2] is the same as
        # passing class weight = [1,1] but adjusting sample weights
        # to be 2 for all instances of class 2
        for solver in ['lbfgs', 'liblinear']:
            clf_cw_12 = LR(solver=solver, fit_intercept=False,
                           class_weight={0: 1, 1: 2}, random_state=42)
            clf_cw_12.fit(X, y)
            clf_sw_12 = LR(solver=solver, fit_intercept=False,
                           random_state=42)
            clf_sw_12.fit(X, y, sample_weight=sample_weight)
            assert_array_almost_equal(
                clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)

    # Test the above for l1 penalty and l2 penalty with dual=True,
    # since the patched liblinear code is different.
    clf_cw = LogisticRegression(
        solver="liblinear", fit_intercept=False,
        class_weight={0: 1, 1: 2}, penalty="l1", tol=1e-5, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(
        solver="liblinear", fit_intercept=False, penalty="l1", tol=1e-5,
        random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)

    clf_cw = LogisticRegression(
        solver="liblinear", fit_intercept=False,
        class_weight={0: 1, 1: 2}, penalty="l2", dual=True, random_state=42)
    clf_cw.fit(X, y)
    clf_sw = LogisticRegression(
        solver="liblinear", fit_intercept=False, penalty="l2", dual=True,
        random_state=42)
    clf_sw.fit(X, y, sample_weight)
    assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)
def test_saga_sparse():
    # Test that LogRegCV with solver='saga' works for sparse matrices
    X, y = make_classification(n_samples=10, n_features=5, random_state=0)
    clf = LogisticRegressionCV(solver='saga')
    clf.fit(sparse.csr_matrix(X), y)
            if train_predictions_1[i] == train_predictions_2[i]:
                newX_l.append(X_unlabeled[i])
                newy_l.append(train_predictions_1[i])
                dele.append(i)
        newX_l = np.array(newX_l)
        newy_l = np.array(newy_l)
        if newX_l.shape[0] == 0:
            break
        X_labeled = np.append(X_labeled, newX_l, axis=0)
        y_labeled = np.append(y_labeled, newy_l)
        X_unlabeled = np.delete(X_unlabeled, dele, axis=0)
    return X_labeled, y_labeled


# Read data
X, y = make_classification(n_samples=2000, n_features=2, n_redundant=0,
                           n_informative=2, n_clusters_per_class=2)
plt.figure()
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)

# Normalization
X_norm = preprocessing.scale(X)

num_labeled = 25
num_unlabeled = [0, 10, 20, 40, 80, 160, 320, 640, 1280]
err_lda = {}
err_clustering = {}
err_co = {}
log_lda = {}
log_clustering = {}
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as dat

NFEAT = 8
NINFO = 5
NRED = 0
NREP = 0
NCLASS = 3
NCLUSTCLASS = 1
nUnrelated = NFEAT - NINFO - NRED - NREP

[X, Y] = dat.make_classification(n_samples=100,
                                 n_features=NFEAT,
                                 n_informative=NINFO,
                                 n_redundant=NRED,
                                 n_repeated=NREP,
                                 n_classes=NCLASS,
                                 n_clusters_per_class=NCLUSTCLASS,
                                 class_sep=5,
                                 shuffle=False)

nUseful = NINFO + NRED + NREP
print("Useful features: first {}".format(NINFO))

sns.set()
binwidth = 1
fig1, sub = plt.subplots(2, int(NFEAT / 2))
fig1.subplots_adjust(wspace=0.6, hspace=0.6)
colors = ['b', 'r', 'g', 'y', 'c']
classes = []
import numpy as np
from sklearn.datasets import make_classification


def dataset():
    X, y = make_classification(100, 5, random_state=42)
    X = X.astype(np.float64)
    y = y.astype(np.float64)
    return X, y
        if alpha >= self.H:
            return self.H
        elif alpha <= self.L:
            return self.L
        else:
            return alpha

    def K(self, x_i, x_j):
        return np.dot(x_i, x_j)


if __name__ == '__main__':
    X, Y = make_classification(n_features=2, n_informative=2,
                               n_redundant=0, n_repeated=0, n_classes=2,
                               n_clusters_per_class=1)
    # X, Y = make_blobs(n_samples=50, centers=2, random_state=0,
    #                   cluster_std=0.60)
    Y = [pow(-1, num + 1) for num in Y]
    s = SMO(X, Y, 1, 100)
    w = s.w()
    b = s.b
    x1 = max(X, key=lambda x: x[0])
    x2 = min(X, key=lambda x: x[0])
    slope = -w[0] / w[1]
    intercept = -b / w[1]
    print(slope)
def test_grid_search_cv_results():
    X, y = make_classification(n_samples=50, n_features=4, random_state=42)

    n_splits = 3
    n_grid_points = 6
    params = [
        dict(kernel=["rbf"], C=[1, 10], gamma=[0.1, 1]),
        dict(kernel=["poly"], degree=[1, 2]),
    ]
    grid_search = dcv.GridSearchCV(
        SVC(gamma="auto"),
        cv=n_splits,
        iid=False,
        param_grid=params,
        return_train_score=True,
    )
    grid_search.fit(X, y)
    grid_search_iid = dcv.GridSearchCV(
        SVC(gamma="auto"),
        cv=n_splits,
        iid=True,
        param_grid=params,
        return_train_score=True,
    )
    grid_search_iid.fit(X, y)

    param_keys = ("param_C", "param_degree", "param_gamma", "param_kernel")
    score_keys = (
        "mean_test_score",
        "mean_train_score",
        "rank_test_score",
        "split0_test_score",
        "split1_test_score",
        "split2_test_score",
        "split0_train_score",
        "split1_train_score",
        "split2_train_score",
        "std_test_score",
        "std_train_score",
        "mean_fit_time",
        "std_fit_time",
        "mean_score_time",
        "std_score_time",
    )
    n_candidates = n_grid_points

    for search, iid in zip((grid_search, grid_search_iid), (False, True)):
        assert iid == search.iid
        cv_results = search.cv_results_
        # Check if score and timing are reasonable
        assert all(cv_results["rank_test_score"] >= 1)
        assert all(
            all(cv_results[k] >= 0) for k in score_keys
            if k != "rank_test_score")
        assert all(
            all(cv_results[k] <= 1) for k in score_keys
            if "time" not in k and k != "rank_test_score")
        # Check cv_results structure
        check_cv_results_array_types(cv_results, param_keys, score_keys)
        check_cv_results_keys(cv_results, param_keys, score_keys,
                              n_candidates)

    # Check masking
    cv_results = grid_search.cv_results_
    n_candidates = len(grid_search.cv_results_["params"])
    assert all((cv_results["param_C"].mask[i] and
                cv_results["param_gamma"].mask[i] and
                not cv_results["param_degree"].mask[i])
               for i in range(n_candidates)
               if cv_results["param_kernel"][i] == "linear")
    assert all((not cv_results["param_C"].mask[i] and
                not cv_results["param_gamma"].mask[i] and
                cv_results["param_degree"].mask[i])
               for i in range(n_candidates)
               if cv_results["param_kernel"][i] == "rbf")
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, precision_score,
                             recall_score, f1_score)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.cross_validation import train_test_split


# Create dataset of classification task with many redundant and few
# informative features
X, y = datasets.make_classification(n_samples=100000, n_features=20,
                                    n_informative=2, n_redundant=10,
                                    random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99,
                                                    random_state=42)


def plot_calibration_curve(est, name, fig_index):
    """Plot calibration curve for est w/o and with calibration. """
    # Calibrated with isotonic calibration
    isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

    # Calibrated with sigmoid calibration
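    # (Assumed continuation; the original snippet is cut off here. This is
    # a simplified sketch in the spirit of the scikit-learn calibration
    # example the code above mirrors, restricted to estimators that expose
    # predict_proba.)
    sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')

    # Fit each variant and plot its reliability diagram with
    # calibration_curve, which returns the fraction of positives and the
    # mean predicted probability per bin.
    plt.figure(fig_index, figsize=(6, 6))
    for clf, label in [(est, name),
                       (isotonic, name + ' + Isotonic'),
                       (sigmoid, name + ' + Sigmoid')]:
        clf.fit(X_train, y_train)
        prob_pos = clf.predict_proba(X_test)[:, 1]
        frac_pos, mean_pred = calibration_curve(y_test, prob_pos, n_bins=10)
        plt.plot(mean_pred, frac_pos, 's-', label=label)
    plt.plot([0, 1], [0, 1], 'k:', label='Perfectly calibrated')
    plt.xlabel('Mean predicted probability')
    plt.ylabel('Fraction of positives')
    plt.legend(loc='lower right')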