def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset.

    Each base classifier, and a hard- and a soft-voting ensemble built from
    them with equal weights, must reproduce the training labels exactly.
    """
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])
    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    # Compare element-wise. ``all(pred) == all(expected)`` only compares two
    # truthiness reductions (True == True for any non-zero labels), so it
    # could never detect a wrong prediction.
    for clf in (clf1, clf2, clf3):
        np.testing.assert_array_equal(clf.fit(X, y).predict(X), expected)

    for voting in ('hard', 'soft'):
        eclf = VotingClassifier(estimators=[
            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
            voting=voting,
            weights=[1, 1, 1])
        np.testing.assert_array_equal(eclf.fit(X, y).predict(X), expected)
def test_set_params():
    """Check that set_params can swap and reconfigure sub-estimators."""
    logreg = LogisticRegression(random_state=123, C=1.0)
    forest = RandomForestClassifier(random_state=123, max_depth=None)
    bayes = GaussianNB()

    voter_a = VotingClassifier(
        [('lr', logreg), ('rf', forest)],
        voting='soft',
        weights=[1, 2],
    )
    # Before fitting: the estimator is reachable by key and by attribute.
    assert 'lr' in voter_a.named_estimators
    assert voter_a.named_estimators.lr is voter_a.estimators[0][1]
    assert voter_a.named_estimators.lr is voter_a.named_estimators['lr']

    voter_a.fit(X, y)
    # After fitting: same checks against the fitted counterparts.
    assert 'lr' in voter_a.named_estimators_
    assert voter_a.named_estimators_.lr is voter_a.estimators_[0]
    assert voter_a.named_estimators_.lr is voter_a.named_estimators_['lr']

    # Replace the 'nb' slot by the random forest; both ensembles should then
    # behave the same.
    voter_b = VotingClassifier(
        [('lr', logreg), ('nb', bayes)],
        voting='soft',
        weights=[1, 2],
    )
    voter_b.set_params(nb=forest).fit(X, y)
    assert not hasattr(voter_b, 'nb')

    assert_array_equal(voter_a.predict(X), voter_b.predict(X))
    assert_array_almost_equal(voter_a.predict_proba(X),
                              voter_b.predict_proba(X))
    assert voter_b.estimators[0][1].get_params() == logreg.get_params()
    assert voter_b.estimators[1][1].get_params() == forest.get_params()

    # Nested parameters are addressed as <estimator name>__<parameter>.
    voter_a.set_params(lr__C=10.0)
    voter_b.set_params(nb__max_depth=5)

    assert voter_a.estimators[0][1].get_params()['C'] == 10.0
    assert voter_b.estimators[1][1].get_params()['max_depth'] == 5
    nested_c = voter_a.get_params()["lr"].get_params()['C']
    assert voter_a.get_params()["lr__C"] == nested_c
def test_ovr_always_present():
    """Check OneVsRestClassifier on labels that are always present or absent."""
    # Note: this is the case where _ConstantPredictor is utilised
    X = np.ones((10, 2))
    X[:5, :] = 0

    # Indicator matrix whose last two columns are always on; only the first
    # column varies (off for the first five rows, on for the rest).
    y = np.ones((10, 3))
    y[:5, 0] = 0

    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)

    predicted = ovr.predict(X)
    assert_array_equal(np.array(predicted), np.array(y))

    decision = ovr.decision_function(X)
    assert np.unique(decision[:, -2:]) == 1

    proba = ovr.predict_proba(X)
    assert_array_equal(proba[:, -1], np.ones(X.shape[0]))

    # Second scenario: the last label is constantly absent.
    y = np.zeros((10, 2))
    y[5:, 0] = 1  # variable label

    ovr = OneVsRestClassifier(LogisticRegression())
    assert_warns(UserWarning, ovr.fit, X, y)
    proba = ovr.predict_proba(X)
    assert_array_equal(proba[:, -1], np.zeros(X.shape[0]))
def test_compute_class_weight_invariance():
    """Check that class_weight="balanced" is invariant to class imbalance.

    Three datasets with the same total number of samples are derived from
    one two-class blob dataset: one with the class-1 samples appended twice
    more, one with the class-0 samples appended twice more, and one with the
    whole dataset duplicated. The fitted coefficients must agree.
    """
    X, y = make_blobs(centers=2, random_state=0)

    is_one = y == 1
    is_zero = y == 0

    # class 1 over-represented
    X_1 = np.vstack([X, X[is_one], X[is_one]])
    y_1 = np.hstack([y, y[is_one], y[is_one]])
    # class 0 over-represented
    X_0 = np.vstack([X, X[is_zero], X[is_zero]])
    y_0 = np.hstack([y, y[is_zero], y[is_zero]])
    # everything duplicated
    X_ = np.vstack([X, X])
    y_ = np.hstack([y, y])

    # Fit order is kept: class-1-heavy, class-0-heavy, then duplicated set.
    fitted_1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    fitted_0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    fitted_dup = LogisticRegression(class_weight="balanced").fit(X_, y_)

    assert_array_almost_equal(fitted_1.coef_, fitted_0.coef_)
    assert_array_almost_equal(fitted_dup.coef_, fitted_0.coef_)
def test_classifier_results():
    """tests if classifier results match target"""
    alpha = .1
    n_features = 20
    n_samples = 10
    tol = .01
    max_iter = 200

    # Draw order matters for reproducibility: features first, then weights.
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    y = np.sign(np.dot(X, w))

    dense_clf = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                                   max_iter=max_iter, tol=tol,
                                   random_state=77)
    sparse_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    sparse_clf.fit(sp.csr_matrix(X), y)

    dense_pred = dense_clf.predict(X)
    sparse_pred = sparse_clf.predict(X)
    assert_almost_equal(dense_pred, y, decimal=12)
    assert_almost_equal(sparse_pred, y, decimal=12)
def test_binary_classifier_class_weight():
    """tests binary classifier with classweights for each class"""
    alpha = .1
    n_samples = 50
    n_iter = 20
    tol = .00001
    fit_intercept = True

    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=10,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)

    # Recode the targets to +1 for the second class and -1 otherwise.
    classes = np.unique(y)
    y = np.where(y != classes[1], -1.0, 1.0)

    class_weight = {1: .45, -1: .55}
    dense_clf = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                                   max_iter=n_iter, tol=tol, random_state=77,
                                   fit_intercept=fit_intercept,
                                   multi_class='ovr',
                                   class_weight=class_weight)
    sparse_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    sparse_clf.fit(sp.csr_matrix(X), y)

    # Expand the per-class weights into one weight per sample for the
    # sag_sparse helper.
    encoder = LabelEncoder()
    per_class = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = per_class[encoder.fit_transform(y)]

    shared = dict(n_iter=n_iter, dloss=log_dloss,
                  sample_weight=sample_weight, fit_intercept=fit_intercept)
    spweights, spintercept = sag_sparse(X, y, step_size, alpha, **shared)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          sparse=True, **shared)

    assert_array_almost_equal(dense_clf.coef_.ravel(),
                              spweights.ravel(),
                              decimal=2)
    assert_almost_equal(dense_clf.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(sparse_clf.coef_.ravel(),
                              spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(sparse_clf.intercept_, spintercept2, decimal=1)
def test_tie_situation():
    """Check voting classifier selects smaller class label in tie situation."""
    logreg = LogisticRegression(random_state=123, solver='liblinear')
    forest = RandomForestClassifier(random_state=123)
    ensemble = VotingClassifier(
        estimators=[('lr', logreg), ('rf', forest)],
        voting='hard',
    )

    # Sample 73 is one where the two base estimators disagree (2 vs 1);
    # the ensemble is expected to settle on label 1.
    tie_index = 73
    assert logreg.fit(X, y).predict(X)[tie_index] == 2
    assert forest.fit(X, y).predict(X)[tie_index] == 1
    assert ensemble.fit(X, y).predict(X)[tie_index] == 1
def test_scoring_is_not_metric():
    """Check that check_scoring rejects raw metric functions.

    For each estimator/metric pair, a ValueError whose message matches
    'make_scorer' is expected.
    """
    estimator_metric_pairs = [
        (LogisticRegression(), f1_score),
        (LogisticRegression(), roc_auc_score),
        (Ridge(), r2_score),
        (KMeans(), cluster_module.adjusted_rand_score),
    ]
    for estimator, metric in estimator_metric_pairs:
        assert_raises_regexp(ValueError, 'make_scorer', check_scoring,
                             estimator, metric)
def test_predictproba_hardvoting():
    """Check that predict_proba is unavailable when voting='hard'."""
    base_estimators = [('lr1', LogisticRegression()),
                       ('lr2', LogisticRegression())]
    eclf = VotingClassifier(estimators=base_estimators, voting='hard')

    expected_msg = "predict_proba is not available when voting='hard'"
    with pytest.raises(AttributeError, match=expected_msg):
        eclf.predict_proba

    # hasattr must report the attribute as missing both before and after fit.
    assert not hasattr(eclf, "predict_proba")
    eclf.fit(X, y)
    assert not hasattr(eclf, "predict_proba")
def test_sag_classifier_computed_correctly():
    """tests if the binary classifier is computed correctly"""
    alpha = .1
    n_samples = 50
    n_iter = 50
    tol = .00001
    fit_intercept = True

    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)

    # Recode the targets to +1 for the second class and -1 otherwise.
    classes = np.unique(y)
    y = np.where(y != classes[1], -1.0, 1.0)

    dense_clf = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                                   max_iter=n_iter, tol=tol, random_state=77,
                                   fit_intercept=fit_intercept,
                                   multi_class='ovr')
    sparse_clf = clone(dense_clf)

    dense_clf.fit(X, y)
    sparse_clf.fit(sp.csr_matrix(X), y)

    # Results from the sag_sparse helper, without and with sparse=True.
    shared = dict(n_iter=n_iter, dloss=log_dloss,
                  fit_intercept=fit_intercept)
    spweights, spintercept = sag_sparse(X, y, step_size, alpha, **shared)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          sparse=True, **shared)

    assert_array_almost_equal(dense_clf.coef_.ravel(),
                              spweights.ravel(),
                              decimal=2)
    assert_almost_equal(dense_clf.intercept_, spintercept, decimal=1)

    assert_array_almost_equal(sparse_clf.coef_.ravel(),
                              spweights2.ravel(),
                              decimal=2)
    assert_almost_equal(sparse_clf.intercept_, spintercept2, decimal=1)
def test_classifier_matching():
    """Compare LogisticRegression (sag/saga) with the sag_sparse and sag helpers.

    Coefficients and intercepts must agree to 9 decimals for both solvers.
    """
    n_samples = 20
    X, y = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                      cluster_std=0.1)
    y[y == 0] = -1
    alpha = 1.1
    fit_intercept = True
    step_size = get_step_size(X, alpha, fit_intercept)

    for solver in ['sag', 'saga']:
        # SAGA variance w.r.t. stream order is higher, so it gets more
        # iterations.
        n_iter = 80 if solver == 'sag' else 300
        use_saga = solver == 'saga'

        clf = LogisticRegression(solver=solver, fit_intercept=fit_intercept,
                                 tol=1e-11, C=1. / alpha / n_samples,
                                 max_iter=n_iter, random_state=10,
                                 multi_class='ovr')
        clf.fit(X, y)

        helper_kwargs = dict(n_iter=n_iter, dloss=log_dloss,
                             fit_intercept=fit_intercept, saga=use_saga)
        weights, intercept = sag_sparse(X, y, step_size, alpha,
                                        **helper_kwargs)
        weights2, intercept2 = sag(X, y, step_size, alpha, **helper_kwargs)

        # Bring the helper outputs to the shapes used by coef_/intercept_.
        weights = np.atleast_2d(weights)
        intercept = np.atleast_1d(intercept)
        weights2 = np.atleast_2d(weights2)
        intercept2 = np.atleast_1d(intercept2)

        assert_array_almost_equal(weights, clf.coef_, decimal=9)
        assert_array_almost_equal(intercept, clf.intercept_, decimal=9)
        assert_array_almost_equal(weights2, clf.coef_, decimal=9)
        assert_array_almost_equal(intercept2, clf.intercept_, decimal=9)
def test_classifier_chain_fit_and_predict_with_sparse_data():
    """Check ClassifierChain predictions agree for CSR and dense input."""
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    # Fit and predict on the sparse matrix first ...
    sparse_chain = ClassifierChain(LogisticRegression())
    sparse_chain.fit(X_sparse, Y)
    Y_pred_sparse = sparse_chain.predict(X_sparse)

    # ... then on the dense data with a fresh chain.
    dense_chain = ClassifierChain(LogisticRegression())
    dense_chain.fit(X, Y)
    Y_pred_dense = dense_chain.predict(X)

    assert_array_equal(Y_pred_sparse, Y_pred_dense)
def test_multiclass_multioutput_estimator_predict_proba():
    """Check MultiOutputClassifier.predict_proba against hard-coded values.

    Two string-labelled outputs (2 and 3 classes) are fitted with a seeded
    liblinear LogisticRegression; one probability array per output is
    expected.
    """
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(
        LogisticRegression(solver='liblinear', random_state=seed))

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_expected = [
        np.array([[0.23481764, 0.76518236],
                  [0.67196072, 0.32803928],
                  [0.54681448, 0.45318552],
                  [0.34883923, 0.65116077],
                  [0.73687069, 0.26312931]]),
        np.array([[0.5171785, 0.23878628, 0.24403522],
                  [0.22141451, 0.64102704, 0.13755846],
                  [0.16751315, 0.18256843, 0.64991843],
                  [0.27357372, 0.55201592, 0.17441036],
                  [0.65745193, 0.26062899, 0.08191907]])
    ]

    # The index-based loop only looked at the first len(y_expected) results,
    # so extra outputs went unnoticed; check the count explicitly.
    assert len(y_result) == len(y_expected)
    for probabilities, expected in zip(y_result, y_expected):
        assert_almost_equal(probabilities, expected)
def test_permutation_importance_mixed_types_pandas():
    """Check permutation_importance on a DataFrame mixing numeric and string columns."""
    pd = pytest.importorskip("pandas")
    rng = np.random.RandomState(42)
    n_repeats = 5

    # Last column is correlated with y
    frame = pd.DataFrame({
        'col1': [1.0, 2.0, 3.0, np.nan],
        'col2': ['a', 'b', 'a', 'b'],
    })
    target = np.array([0, 1, 0, 1])

    numeric_steps = make_pipeline(SimpleImputer(), StandardScaler())
    column_steps = ColumnTransformer([
        ('num', numeric_steps, ['col1']),
        ('cat', OneHotEncoder(), ['col2']),
    ])
    model = make_pipeline(column_steps, LogisticRegression(solver='lbfgs'))
    model.fit(frame, target)

    result = permutation_importance(model, frame, target,
                                    n_repeats=n_repeats, random_state=rng)

    assert result.importances.shape == (frame.shape[1], n_repeats)
    # the feature correlated with y is the last column and should
    # have the highest importance
    last_importance = result.importances_mean[-1]
    other_importances = result.importances_mean[:-1]
    assert np.all(last_importance > other_importances)
def test_estimator_init():
    """Check the errors raised by VotingClassifier.fit for invalid settings."""
    clf = LogisticRegression(random_state=1)

    # (constructor keyword arguments, expected exception, expected message)
    invalid_setups = [
        (dict(estimators=[]),
         AttributeError,
         'Invalid `estimators` attribute, `estimators` should be'
         ' a list of (string, estimator) tuples'),
        (dict(estimators=[('lr', clf)], voting='error'),
         ValueError,
         "Voting must be 'soft' or 'hard'; got (voting='error')"),
        (dict(estimators=[('lr', clf)], weights=[1, 2]),
         ValueError,
         'Number of `estimators` and weights must be equal'
         '; got 2 weights, 1 estimators'),
        (dict(estimators=[('lr', clf), ('lr', clf)], weights=[1, 2]),
         ValueError,
         "Names provided are not unique: ['lr', 'lr']"),
        (dict(estimators=[('lr__', clf)]),
         ValueError,
         "Estimator names must not contain __: got ['lr__']"),
        (dict(estimators=[('estimators', clf)]),
         ValueError,
         "Estimator names conflict with constructor arguments: "
         "['estimators']"),
    ]

    for init_kwargs, error_type, msg in invalid_setups:
        eclf = VotingClassifier(**init_kwargs)
        assert_raise_message(error_type, msg, eclf.fit, X, y)
def test_auto_weight():
    """Test class weights for imbalanced data."""
    from mrex.linear_model import LogisticRegression
    from mrex.utils import compute_class_weight

    # The dataset is the two-dimensional projection of iris, so that it is
    # not separable. One is added to the targets as a non-regression test:
    # class_weight="balanced" used to work only when the labels were a
    # range [0..K).
    X, y = iris.data[:, :2], iris.target + 1

    # Drop every second sample whose shifted label is greater than 2.
    dropped = np.where(y > 2)[0][::2]
    unbalanced = np.delete(np.arange(y.size), dropped)
    X_unbalanced = X[unbalanced]
    y_unbalanced = y[unbalanced]

    classes = np.unique(y_unbalanced)
    class_weights = compute_class_weight('balanced', classes, y_unbalanced)
    assert np.argmax(class_weights) == 2

    candidates = (svm.SVC(kernel='linear'),
                  svm.LinearSVC(random_state=0),
                  LogisticRegression())
    for clf in candidates:
        # check that score is better when class_weight='balanced' is set.
        y_pred = clf.fit(X_unbalanced, y_unbalanced).predict(X)
        clf.set_params(class_weight='balanced')
        y_pred_balanced = clf.fit(X_unbalanced, y_unbalanced).predict(X)

        plain_f1 = metrics.f1_score(y, y_pred, average='macro')
        balanced_f1 = metrics.f1_score(y, y_pred_balanced, average='macro')
        assert plain_f1 <= balanced_f1
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    def fitted_soft_voter(**extra):
        # Soft-voting ensemble over the three shared base estimators.
        voter = VotingClassifier(estimators=[
            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
            voting='soft',
            **extra)
        return voter.fit(X, y)

    default_voter = fitted_soft_voter()
    flat_voter = fitted_soft_voter(flatten_transform=True)
    stacked_voter = fitted_soft_voter(flatten_transform=False)

    assert_array_equal(default_voter.transform(X).shape, (4, 6))
    assert_array_equal(flat_voter.transform(X).shape, (4, 6))
    assert_array_equal(stacked_voter.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(default_voter.transform(X),
                              flat_voter.transform(X))

    # Reordering the stacked (3, 4, 2) output must give the flat layout.
    restacked = stacked_voter.transform(X).swapaxes(0, 1).reshape((4, 6))
    assert_array_almost_equal(restacked, flat_voter.transform(X))
def test_set_params_nested_pipeline():
    """Check set_params calls that replace a nested step and set its parameter.

    Each call both swaps something in the inner pipeline and sets a
    parameter on the swapped-in step; the test passes if neither raises.
    """
    inner = Pipeline([('b', DummyRegressor())])
    estimator = Pipeline([('a', inner)])

    # Replace step 'b' and set a parameter on it in the same call.
    estimator.set_params(a__b__alpha=0.001, a__b=Lasso())
    # Replace the inner step list and set a parameter on its new step.
    new_steps = [('b', LogisticRegression())]
    estimator.set_params(a__steps=new_steps, a__b__C=5)
def test_notfitted():
    """Check that unfitted voting estimators raise NotFittedError."""
    eclf = VotingClassifier(
        estimators=[('lr1', LogisticRegression()),
                    ('lr2', LogisticRegression())],
        voting='soft',
    )
    ereg = VotingRegressor([('dr', DummyRegressor())])
    msg = ("This %s instance is not fitted yet. Call \'fit\'"
           " with appropriate arguments before using this method.")

    # Methods are looked up one at a time, right before each check.
    for method_name in ('predict', 'predict_proba', 'transform'):
        assert_raise_message(NotFittedError, msg % 'VotingClassifier',
                             getattr(eclf, method_name), X)
    for method_name in ('predict', 'transform'):
        assert_raise_message(NotFittedError, msg % 'VotingRegressor',
                             getattr(ereg, method_name), X_r)
def test_classifier_single_class():
    """tests if ValueError is thrown with only one class"""
    features = [[1, 2], [3, 4]]
    single_class_labels = [1, 1]
    expected_msg = ("This solver needs samples of at least 2 classes "
                    "in the data")

    sag_clf = LogisticRegression(solver='sag')
    assert_raise_message(ValueError, expected_msg,
                         sag_clf.fit, features, single_class_labels)
def test_majority_label_iris():
    """Check classification by majority label on dataset iris."""
    base_estimators = [
        ('lr', LogisticRegression(solver='liblinear', random_state=123)),
        ('rf', RandomForestClassifier(n_estimators=10, random_state=123)),
        ('gnb', GaussianNB()),
    ]
    ensemble = VotingClassifier(estimators=base_estimators, voting='hard')

    cv_accuracy = cross_val_score(ensemble, X, y, scoring='accuracy')
    assert_almost_equal(cv_accuracy.mean(), 0.95, decimal=2)
def plot_calibration_curve(est, name, fig_index):
    """Plot calibration curve for est w/o and with calibration.

    Four models are fitted on the module-level ``X_train``/``y_train`` and
    evaluated on ``X_test``/``y_test``: a plain LogisticRegression baseline,
    ``est`` itself, and ``est`` wrapped in isotonic and sigmoid
    CalibratedClassifierCV. For each one the Brier score, precision, recall
    and F1 are printed, its calibration curve is drawn on the upper axes and
    a histogram of its predicted values on the lower axes.

    Parameters
    ----------
    est : estimator
        Classifier to evaluate; it is fitted in place.
    name : str
        Label for ``est`` in the printed report and the legends.
    fig_index : int
        Figure number handed to ``plt.figure``.
    """
    # Calibrated with isotonic calibration
    isotonic = CalibratedClassifierCV(est, cv=2, method='isotonic')

    # Calibrated with sigmoid calibration
    sigmoid = CalibratedClassifierCV(est, cv=2, method='sigmoid')

    # Logistic regression with no calibration as baseline
    lr = LogisticRegression(C=1.)

    # The figure is made current here; its handle is not needed afterwards.
    plt.figure(fig_index, figsize=(10, 10))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    # A separate loop variable is used so the ``name`` parameter is not
    # shadowed.
    models = [(lr, 'Logistic'),
              (est, name),
              (isotonic, name + ' + Isotonic'),
              (sigmoid, name + ' + Sigmoid')]
    for clf, label in models:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        if hasattr(clf, "predict_proba"):
            prob_pos = clf.predict_proba(X_test)[:, 1]
        else:  # use decision function
            prob_pos = clf.decision_function(X_test)
            # min-max scale the decision values into [0, 1]
            prob_pos = \
                (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())

        # NOTE(review): pos_label comes from the module-level ``y``, not
        # from ``y_test`` -- confirm this is intended.
        clf_score = brier_score_loss(y_test, prob_pos, pos_label=y.max())
        print("%s:" % label)
        print("\tBrier: %1.3f" % (clf_score))
        print("\tPrecision: %1.3f" % precision_score(y_test, y_pred))
        print("\tRecall: %1.3f" % recall_score(y_test, y_pred))
        print("\tF1: %1.3f\n" % f1_score(y_test, y_pred))

        fraction_of_positives, mean_predicted_value = \
            calibration_curve(y_test, prob_pos, n_bins=10)

        ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
                 label="%s (%1.3f)" % (label, clf_score))

        ax2.hist(prob_pos, range=(0, 1), bins=10, label=label,
                 histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title('Calibration plots  (reliability curve)')

    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="upper center", ncol=2)

    plt.tight_layout()
def test_score_samples_on_pipeline_without_score_samples():
    """Check score_samples is missing when the final step does not define it."""
    features = np.array([[1], [2]])
    labels = np.array([1, 2])

    # The only (and therefore final) step is LogisticRegression; calling
    # score_samples on the pipeline must raise the error named below.
    pipe = make_pipeline(LogisticRegression())
    pipe.fit(features, labels)

    expected_msg = ("'LogisticRegression' object has no attribute "
                    "'score_samples'")
    with pytest.raises(AttributeError, match=expected_msg):
        pipe.score_samples(features)
def test_classifier_chain_vs_independent_models():
    """Check a classifier chain beats independent per-label models.

    The sample-averaged Jaccard score of a ClassifierChain on the held-out
    rows must be strictly higher than that of a OneVsRestClassifier.
    """
    X, Y = generate_multilabel_dataset_with_correlations()

    # First 600 rows for training, the remainder for evaluation.
    n_train = 600
    X_train, X_test = X[:n_train, :], X[n_train:, :]
    Y_train, Y_test = Y[:n_train, :], Y[n_train:, :]

    independent = OneVsRestClassifier(LogisticRegression())
    independent.fit(X_train, Y_train)
    Y_pred_ovr = independent.predict(X_test)

    chained = ClassifierChain(LogisticRegression())
    chained.fit(X_train, Y_train)
    Y_pred_chain = chained.predict(X_test)

    chain_score = jaccard_score(Y_test, Y_pred_chain, average='samples')
    ovr_score = jaccard_score(Y_test, Y_pred_ovr, average='samples')
    assert chain_score > ovr_score
def test_weights_iris():
    """Check classification by average probabilities on dataset iris."""
    base_estimators = [
        ('lr', LogisticRegression(random_state=123)),
        ('rf', RandomForestClassifier(random_state=123)),
        ('gnb', GaussianNB()),
    ]
    ensemble = VotingClassifier(estimators=base_estimators,
                                voting='soft',
                                weights=[1, 2, 10])

    cv_accuracy = cross_val_score(ensemble, X, y, scoring='accuracy')
    assert_almost_equal(cv_accuracy.mean(), 0.93, decimal=2)
def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
    """Check chains built with cv=3 fit and predict on CSR input.

    Both a ClassifierChain and a RegressorChain must return predictions
    with the same shape as Y.
    """
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    chains = (ClassifierChain(LogisticRegression(), cv=3),
              RegressorChain(Ridge(), cv=3))
    for model in chains:
        model.fit(X_sparse, Y)
        predictions = model.predict(X_sparse)
        assert predictions.shape == Y.shape
def test_set_estimator_none(drop):
    """VotingClassifier set_params should be able to set estimators as None or
    drop"""
    logreg = LogisticRegression(random_state=123)
    forest = RandomForestClassifier(n_estimators=10, random_state=123)
    bayes = GaussianNB()

    # --- predict ---------------------------------------------------------
    # Reference ensemble: the forest is kept but given a zero weight.
    zero_weighted = VotingClassifier(
        estimators=[('lr', logreg), ('rf', forest), ('nb', bayes)],
        voting='hard',
        weights=[1, 0, 0.5],
    ).fit(X, y)

    # Ensemble under test: the forest is dropped through set_params.
    dropped = VotingClassifier(
        estimators=[('lr', logreg), ('rf', forest), ('nb', bayes)],
        voting='hard',
        weights=[1, 1, 0.5],
    )
    dropped.set_params(rf=drop).fit(X, y)
    assert_array_equal(zero_weighted.predict(X), dropped.predict(X))

    assert dict(dropped.estimators)["rf"] is drop
    assert len(dropped.estimators_) == 2
    kept_types = (LogisticRegression, GaussianNB)
    assert all(isinstance(fitted, kept_types)
               for fitted in dropped.estimators_)
    assert dropped.get_params()["rf"] is drop

    zero_weighted.set_params(voting='soft').fit(X, y)
    dropped.set_params(voting='soft').fit(X, y)
    assert_array_equal(zero_weighted.predict(X), dropped.predict(X))
    assert_array_almost_equal(zero_weighted.predict_proba(X),
                              dropped.predict_proba(X))

    # Dropping every estimator must be rejected at fit time.
    msg = 'All estimators are None or "drop". At least one is required!'
    all_dropped_fit = dropped.set_params(lr=drop, rf=drop, nb=drop).fit
    assert_raise_message(ValueError, msg, all_dropped_fit, X, y)

    # --- soft voting transform --------------------------------------------
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    zero_weighted = VotingClassifier(
        estimators=[('rf', forest), ('nb', bayes)],
        voting='soft',
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)

    dropped = VotingClassifier(
        estimators=[('rf', forest), ('nb', bayes)],
        voting='soft',
        weights=[1, 0.5],
        flatten_transform=False,
    )
    dropped.set_params(rf=drop).fit(X1, y1)

    expected_both = np.array([[[0.7, 0.3], [0.3, 0.7]],
                              [[1., 0.], [0., 1.]]])
    expected_single = np.array([[[1., 0.], [0., 1.]]])
    assert_array_almost_equal(zero_weighted.transform(X1), expected_both)
    assert_array_almost_equal(dropped.transform(X1), expected_single)

    # --- hard voting transform (no refit after switching) ------------------
    zero_weighted.set_params(voting='hard')
    dropped.set_params(voting='hard')
    assert_array_equal(zero_weighted.transform(X1),
                       np.array([[0, 0], [1, 1]]))
    assert_array_equal(dropped.transform(X1), np.array([[0], [1]]))
def test_classes_property():
    """Check when a pipeline exposes the ``classes_`` attribute."""
    iris = load_iris()
    X, y = iris.data, iris.target

    # With a regressor as final step there is no classes_, even after fit.
    regression_pipe = make_pipeline(SelectKBest(k=1), LinearRegression())
    regression_pipe.fit(X, y)
    assert_raises(AttributeError, getattr, regression_pipe, "classes_")

    # With a classifier as final step, classes_ appears only once fitted.
    classification_pipe = make_pipeline(SelectKBest(k=1),
                                        LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, classification_pipe, "classes_")
    classification_pipe.fit(X, y)
    assert_array_equal(classification_pipe.classes_, np.unique(y))
def test_plot_roc_curve(pyplot, response_method, data_binary,
                        with_sample_weight, drop_intermediate):
    """Check the values and matplotlib artists returned by plot_roc_curve."""
    X, y = data_binary

    sample_weight = None
    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))

    lr = LogisticRegression()
    lr.fit(X, y)

    viz = plot_roc_curve(lr, X, y, alpha=0.8, sample_weight=sample_weight,
                         drop_intermediate=drop_intermediate)

    # Recompute the curve by hand from the requested response method.
    scores = getattr(lr, response_method)(X)
    if scores.ndim == 2:
        scores = scores[:, 1]

    fpr, tpr, _ = roc_curve(y, scores, sample_weight=sample_weight,
                            drop_intermediate=drop_intermediate)

    assert_allclose(viz.roc_auc, auc(fpr, tpr))
    assert_allclose(viz.fpr, fpr)
    assert_allclose(viz.tpr, tpr)

    assert viz.estimator_name == "LogisticRegression"

    # cannot fail thanks to pyplot fixture
    import matplotlib as mpl  # noqa
    assert isinstance(viz.line_, mpl.lines.Line2D)
    assert viz.line_.get_alpha() == 0.8
    assert isinstance(viz.ax_, mpl.axes.Axes)
    assert isinstance(viz.figure_, mpl.figure.Figure)

    expected_label = "LogisticRegression (AUC = {:0.2f})".format(viz.roc_auc)
    assert viz.line_.get_label() == expected_label
    assert viz.ax_.get_ylabel() == "True Positive Rate"
    assert viz.ax_.get_xlabel() == "False Positive Rate"
def test_2d_coef():
    """Check SelectFromModel on a multi-class problem.

    For each threshold ("mean", "median") and norm order (1, 2, inf), the
    selected columns must be those whose coefficient norm across classes
    exceeds the threshold computed on those norms.
    """
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0, n_classes=4)

    # Reference estimator for the manual check. It is fitted on the same
    # data in every iteration, so it is fitted once here instead of inside
    # the loops.
    est = LogisticRegression()
    est.fit(X, y)

    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(),
                                          threshold=threshold,
                                          norm_order=order)
            transformer.fit(X, y)
            assert hasattr(transformer.estimator_, 'coef_')
            X_new = transformer.transform(X)
            assert X_new.shape[1] < X.shape[1]

            # Manually check that the norm is correctly performed
            importances = np.linalg.norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_almost_equal(X_new, X[:, feature_mask])