def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    # Make sure the estimators can deal with missing values and still yield
    # decent predictions
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    if problem == 'regression':
        X, y = make_regression(n_samples=n_samples, n_features=n_features,
                               n_informative=n_features, random_state=rng)
        gb = HistGradientBoostingRegressor()
        expected_min_score = expected_min_score_regression
    else:
        X, y = make_classification(n_samples=n_samples, n_features=n_features,
                                   n_informative=n_features, n_redundant=0,
                                   n_repeated=0, random_state=rng)
        gb = HistGradientBoostingClassifier()
        expected_min_score = expected_min_score_classification

    # Use the builtin bool: the np.bool alias is deprecated and was removed
    # in recent NumPy releases.
    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(bool)
    X[mask] = np.nan

    gb.fit(X, y)

    assert gb.score(X, y) > expected_min_score

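# The behaviour under test above is native missing-value support: the
# histogram-based estimators learn, at each split, which child samples with
# NaN should be routed to, so no imputation step is needed. A tiny sketch,
# assuming the same HistGradientBoostingClassifier import as the test (the
# data and helper name are illustrative, not part of the suite):
def _demo_fit_with_nans():
    import numpy as np
    X = np.array([[1.0], [2.0], [np.nan], [3.0]])
    y = np.array([0, 0, 1, 1])
    # fit() accepts NaN directly; predict() works on NaN inputs as well
    return HistGradientBoostingClassifier(max_iter=10).fit(X, y).predict(X)
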
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32),
                       (np.float64, np.float64),
                       (np.int32, np.float64),
                       (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            # axis=1, consistent with the test's name and the lil check above
            X_means, X_vars = mean_variance_axis(X_sparse, axis=1)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=1))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=1))

def test_linearsvc_parameters():
    # Test possible parameter combinations in LinearSVC
    # Generate list of possible parameter combinations
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties, duals = ['l1', 'l2', 'bar'], [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if ((loss, penalty) == ('hinge', 'l1') or
                (loss, penalty, dual) == ('hinge', 'l2', False) or
                (penalty, dual) == ('l1', True) or
                loss == 'foo' or
                penalty == 'bar'):
            with pytest.raises(ValueError,
                               match="Unsupported set of "
                               "arguments.*penalty='%s.*loss='%s.*dual=%s"
                               % (penalty, loss, dual)):
                clf.fit(X, y)
        else:
            clf.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    with pytest.raises(ValueError, match=".*loss='l3' is not supported.*"):
        svm.LinearSVC(loss="l3").fit(X, y)

def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None)

def test_partial_dependence_unknown_feature(estimator, features):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    err_msg = 'all features must be in'
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [features])

def test_label_propagation_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    unlabelled_idx = Y[:, (-1,)].nonzero()[0]
    labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0]

    clf = label_propagation.LabelPropagation(max_iter=10000, gamma=0.1)
    clf.fit(X, y)
    # adopting notation from Zhu et al 2002
    T_bar = clf._build_graph()
    Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx,
                                  indexing='ij'))]
    Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx,
                                  indexing='ij'))]
    Y = Y[:, :-1]
    Y_l = Y[labelled_idx, :]
    Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l)

    expected = Y.copy()
    expected[unlabelled_idx, :] = Y_u
    expected /= expected.sum(axis=1)[:, np.newaxis]

    assert_array_almost_equal(expected, clf.label_distributions_, 4)

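# A minimal standalone sketch of the closed form the test above verifies,
# in the same Zhu et al. (2002) notation: with the graph transition matrix
# partitioned into labelled (l) and unlabelled (u) blocks, the converged
# label distributions of the unlabelled points satisfy
# Y_u = (I - T_uu)^{-1} T_ul Y_l. The toy matrix and helper name below are
# illustrative assumptions, not data used by the test:
def _demo_zhu_closed_form():
    import numpy as np
    # 3 points: 0 and 1 labelled (classes 0 and 1), point 2 unlabelled
    T_bar = np.array([[0.6, 0.3, 0.1],
                      [0.3, 0.6, 0.1],
                      [0.45, 0.45, 0.1]])
    Y_l = np.eye(2)              # one-hot labels of the labelled points
    Tuu = T_bar[2:, 2:]          # unlabelled-to-unlabelled block
    Tul = T_bar[2:, :2]          # unlabelled-to-labelled block
    Y_u = np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu) @ Tul @ Y_l
    return Y_u / Y_u.sum(axis=1, keepdims=True)   # -> [[0.5, 0.5]]
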
def test_warm_start_validation():
    X, y = make_classification(n_samples=30, n_features=5, n_classes=4,
                               n_redundant=0, n_informative=5, random_state=0)
    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    X_less_features, y = make_classification(n_samples=30, n_features=4,
                                             n_classes=4, n_redundant=0,
                                             n_informative=4, random_state=0)
    assert_raise_message(ValueError,
                         'The new inputs dimensionality ({}) does not '
                         'match the input dimensionality of the '
                         'previously learned transformation ({}).'
                         .format(X_less_features.shape[1],
                                 nca.components_.shape[1]),
                         nca.fit, X_less_features, y)

def test_ovr_single_label_decision_function():
    X, Y = datasets.make_classification(n_samples=100, n_features=20,
                                        random_state=0)
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    assert_array_equal(clf.decision_function(X_test).ravel() > 0,
                       clf.predict(X_test))

def __init__(self, name, construct, skip_methods=(),
             fit_args=make_classification()):
    # Note: the default fit_args is evaluated once at definition time, so
    # all instances created with the default share the same dataset.
    self.name = name
    self.construct = construct
    self.fit_args = fit_args
    self.skip_methods = skip_methods

def test_crammer_singer_binary():
    # Test Crammer-Singer formulation in the binary case
    X, y = make_classification(n_classes=2, random_state=0)

    for fit_intercept in (True, False):
        acc = svm.LinearSVC(fit_intercept=fit_intercept,
                            multi_class="crammer_singer",
                            random_state=0).fit(X, y).score(X, y)
        assert acc > 0.9

def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=rng)

    est_mrex = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_mrex, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_mrex.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_mrex = est_mrex.predict(X_train)
    assert np.mean(pred_mrex == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_mrex = accuracy_score(y_train, pred_mrex)
    np.testing.assert_almost_equal(acc_lightgbm, acc_mrex)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_mrex = est_mrex.predict(X_test)
        assert np.mean(pred_mrex == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_mrex = accuracy_score(y_test, pred_mrex)
        np.testing.assert_almost_equal(acc_lightgbm, acc_mrex, decimal=2)

def test_valid_alpha():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    # LabelSpreading requires alpha strictly inside (0, 1); every value
    # below (including None) should raise a ValueError.
    for alpha in [-0.1, 0, 1, 1.1, None]:
        assert_raises(ValueError,
                      lambda **kwargs:
                      label_propagation.LabelSpreading(**kwargs).fit(X, y),
                      alpha=alpha)

def setup_module():
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='mrex_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    joblib.dump((X, y, y_ml), filename)
    X_mm, y_mm, y_ml_mm = joblib.load(filename, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)

def test_calibration_prob_sum():
    # Test that sum of probabilities is 1. A non-regression test for
    # issue #7796
    num_classes = 2
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=num_classes)
    clf = LinearSVC(C=1.0)
    clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
    clf_prob.fit(X, y)

    probs = clf_prob.predict_proba(X)
    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))

def test_calibration_nan_imputer():
    """Test that calibration can accept nan"""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan
    clf = Pipeline(
        [('imputer', SimpleImputer()),
         ('rf', RandomForestClassifier(n_estimators=1))])
    clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic')
    clf_c.fit(X, y)
    clf_c.predict(X)

def get_estimator_and_data():
    if args.problem == 'classification':
        X, y = make_classification(args.n_samples_max * 2,
                                   n_features=args.n_features,
                                   n_classes=args.n_classes,
                                   n_clusters_per_class=1,
                                   random_state=0)
        return X, y, HistGradientBoostingClassifier
    elif args.problem == 'regression':
        X, y = make_regression(args.n_samples_max * 2,
                               n_features=args.n_features,
                               random_state=0)
        return X, y, HistGradientBoostingRegressor

def test_coef_default_threshold():
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1, random_state=42))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_almost_equal(X_new, X[:, mask])

def generate_multilabel_dataset_with_correlations():
    # Generate a multilabel data set from a multiclass dataset by
    # representing the integer label of each original class in binary
    # encoding.
    X, y = make_classification(n_samples=1000,
                               n_features=100,
                               n_classes=16,
                               n_informative=10,
                               random_state=0)
    Y_multi = np.array([[int(yyy) for yyy in format(yy, '#06b')[2:]]
                        for yy in y])
    return X, Y_multi

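# A quick note on the format spec used above (an illustrative sketch, not
# part of the generator): '#06b' pads the binary representation of a class
# label in [0, 15] to six characters including the '0b' prefix, and the
# [2:] slice keeps the four bits, so each class maps to a distinct 4-bit
# label vector.
def _demo_binary_label_encoding():
    assert format(5, '#06b') == '0b0101'
    assert [int(b) for b in format(5, '#06b')[2:]] == [0, 1, 0, 1]
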
def test_make_classification():
    weights = [0.1, 0.25]
    X, y = make_classification(n_samples=100, n_features=20, n_informative=5,
                               n_redundant=1, n_repeated=1, n_classes=3,
                               n_clusters_per_class=1, hypercube=False,
                               shift=None, scale=None, weights=weights,
                               random_state=0)

    assert weights == [0.1, 0.25]
    assert X.shape == (100, 20), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert np.unique(y).shape == (3,), "Unexpected number of classes"
    assert sum(y == 0) == 10, "Unexpected number of samples in class #0"
    assert sum(y == 1) == 25, "Unexpected number of samples in class #1"
    assert sum(y == 2) == 65, "Unexpected number of samples in class #2"

    # Test for n_features > 30
    X, y = make_classification(n_samples=2000, n_features=31,
                               n_informative=31, n_redundant=0, n_repeated=0,
                               hypercube=True, scale=0.5, random_state=0)
    assert X.shape == (2000, 31), "X shape mismatch"
    assert y.shape == (2000,), "y shape mismatch"
    assert (np.unique(X.view([('', X.dtype)] * X.shape[1]))
            .view(X.dtype)
            .reshape(-1, X.shape[1]).shape[0] == 2000), (
                "Unexpected number of unique rows")

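# The structured-array view in the last assertion above is a common NumPy
# idiom for counting unique rows: viewing each row as a single record lets
# np.unique compare whole rows at once. A minimal sketch on a toy array
# (the helper name and data are illustrative assumptions):
def _demo_unique_rows():
    import numpy as np
    a = np.array([[1., 2.], [1., 2.], [3., 4.]])
    records = a.view([('', a.dtype)] * a.shape[1])   # one record per row
    unique_rows = np.unique(records).view(a.dtype).reshape(-1, a.shape[1])
    assert unique_rows.shape[0] == 2                 # [1., 2.] appears twice
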
def test_weight():
    # Test class weights
    X_, y_ = make_classification(n_samples=200, n_features=100,
                                 weights=[0.833, 0.167], random_state=0)

    X_ = sparse.csr_matrix(X_)
    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC()):
        clf.set_params(class_weight={0: 5})
        clf.fit(X_[:180], y_[:180])
        y_pred = clf.predict(X_[180:])
        assert np.sum(y_pred == y_[180:]) >= 11

def test_multiclass_multioutput(Estimator):
    # Make sure error is raised for multiclass-multioutput classifiers

    # make multiclass-multioutput dataset
    X, y = make_classification(n_classes=3, n_clusters_per_class=1,
                               random_state=0)
    y = np.array([y, y]).T

    est = Estimator()
    est.fit(X, y)

    with pytest.raises(
            ValueError,
            match="Multiclass-multioutput estimators are not supported"):
        partial_dependence(est, X, [0])

def test_scorer_sample_weight():
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    estimator = _make_estimators(X_train, y_train, y_ml_train)

    for name, scorer in SCORERS.items():
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        if name in REQUIRE_POSITIVE_Y_SCORERS:
            target = _require_positive_y(target)
        try:
            weighted = scorer(estimator[name], X_test, target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert weighted != unweighted, (
                "scorer {0} behaves identically when "
                "called with sample weights: {1} vs "
                "{2}".format(name, weighted, unweighted))
            assert_almost_equal(weighted, ignored,
                                err_msg="scorer {0} behaves differently when "
                                "ignoring samples and setting sample_weight "
                                "to 0: {1} vs {2}".format(name, weighted,
                                                          ignored))
        except TypeError as e:
            assert "sample_weight" in str(e), (
                "scorer {0} raises unhelpful exception when called "
                "with sample weights: {1}".format(name, str(e)))

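# The invariant exercised above is that setting a sample's weight to zero
# should be equivalent to dropping the sample entirely. A tiny sketch of the
# same idea with a weighted mean (illustrative only, not a scorer):
def _demo_zero_weight_equivalence():
    import numpy as np
    values = np.array([1.0, 5.0, 9.0])
    weights = np.array([0.0, 1.0, 1.0])   # first sample weighted out
    assert np.isclose(np.average(values, weights=weights),
                      np.mean(values[1:]))
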
def test_max_features_tiebreak():
    # Test if max_features can break tie among feature importance
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features

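# FixedImportanceEstimator is a small helper defined elsewhere in this test
# module. A plausible sketch of such a stub is below (an assumption for
# illustration; the actual helper may differ, e.g. by deriving from
# BaseEstimator so that SelectFromModel can clone it):
class _FixedImportanceStub:
    def __init__(self, importances):
        self.importances = importances

    def fit(self, X, y=None, **fit_params):
        import numpy as np
        # expose the fixed importances exactly as a fitted estimator would
        self.feature_importances_ = np.asarray(self.importances,
                                               dtype=np.float64)
        return self
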
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert hasattr(transformer.estimator_, 'feature_importances_')

        X_new = transformer.transform(X)
        assert X_new.shape[1] < X.shape[1]
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])

def test_sag_classifier_raises_error(solver):
    # Following #13316, the error handling behavior changed in cython sag.
    # This is simply a non-regression test to make sure numerical errors are
    # properly raised.

    # Train a classifier on a simple problem
    rng = np.random.RandomState(42)
    X, y = make_classification(random_state=rng)
    clf = LogisticRegression(solver=solver, random_state=rng, warm_start=True)
    clf.fit(X, y)

    # Trigger a numerical error by:
    # - corrupting the fitted coefficients of the classifier
    # - fitting it again starting from its current state thanks to warm_start
    clf.coef_[:] = np.nan

    with pytest.raises(ValueError, match="Floating-point under-/overflow"):
        clf.fit(X, y)

def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert importances.shape[0] == 10
        assert (importances[:3, np.newaxis] >= importances[3:]).all()

def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)

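# For reference, the fixed point checked above comes from the Zhou et al.
# (2004) iteration F(t+1) = alpha * S @ F(t) + (1 - alpha) * Y, whose limit
# is F* = (1 - alpha) * (I - alpha * S)^{-1} Y; the scalar (1 - alpha)
# factor cancels under the row normalization applied in the test. A tiny
# sketch verifying the fixed point on a random matrix (illustrative only):
def _demo_zhou_fixed_point():
    import numpy as np
    rng = np.random.RandomState(0)
    S = rng.rand(4, 4) * 0.1        # small entries keep the iteration stable
    Y = np.eye(4, 2)                # two labelled points, one-hot targets
    alpha = 0.5
    F = np.zeros_like(Y)
    for _ in range(200):            # iterate to convergence
        F = alpha * S @ F + (1 - alpha) * Y
    expected = (1 - alpha) * np.linalg.inv(np.eye(4) - alpha * S) @ Y
    assert np.allclose(F, expected)
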
def test_weight():
    # Test class weights
    clf = svm.SVC(class_weight={1: 0.1})
    # we give a small weight to class 1
    clf.fit(X, Y)
    # so all predicted values belong to class 2
    assert_array_almost_equal(clf.predict(X), [2] * 6)

    X_, y_ = make_classification(n_samples=200, n_features=10,
                                 weights=[0.833, 0.167], random_state=2)

    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC()):
        clf.set_params(class_weight={0: .1, 1: 10})
        clf.fit(X_[:100], y_[:100])
        y_pred = clf.predict(X_[100:])
        assert f1_score(y_[100:], y_pred) > .3

def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])

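# The last assertion above exploits a handy property of column selectors:
# transforming a single row containing [0, 1, ..., n_features - 1] returns
# exactly the indices of the columns the selector keeps, because transform
# only slices columns. A minimal sketch with a hand-rolled support mask
# (illustrative assumptions, not SelectFromModel internals):
def _demo_index_recovery():
    import numpy as np
    support_mask = np.array([False, True, False, True])   # keep columns 1, 3
    row_of_indices = np.arange(4)[np.newaxis, :]
    selected = row_of_indices[:, support_mask]   # the column slice transform does
    assert selected.tolist() == [[1, 3]]
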
def test_sample_weight():
    # Ensure sample weights are passed to underlying estimator
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = LogisticRegression(random_state=0, fit_intercept=False)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=None)
    mask = transformer._get_support_mask()
    transformer.fit(X, y, sample_weight=sample_weight)
    weighted_mask = transformer._get_support_mask()
    assert not np.all(weighted_mask == mask)
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    reweighted_mask = transformer._get_support_mask()
    assert np.all(weighted_mask == reweighted_mask)