Exemple #1
0
def test_missing_values_resilience(problem, missing_proportion,
                                   expected_min_score_classification,
                                   expected_min_score_regression):
    # Make sure the estimators can deal with missing values and still yield
    # decent predictions

    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 2
    if problem == 'regression':
        X, y = make_regression(n_samples=n_samples, n_features=n_features,
                               n_informative=n_features, random_state=rng)
        gb = HistGradientBoostingRegressor()
        expected_min_score = expected_min_score_regression
    else:
        X, y = make_classification(n_samples=n_samples, n_features=n_features,
                                   n_informative=n_features, n_redundant=0,
                                   n_repeated=0, random_state=rng)
        gb = HistGradientBoostingClassifier()
        expected_min_score = expected_min_score_classification

    mask = rng.binomial(1, missing_proportion, size=X.shape).astype(np.bool)
    X[mask] = np.nan

    gb.fit(X, y)

    assert gb.score(X, y) > expected_min_score
Exemple #2
0
def test_mean_variance_axis1():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_lil = sp.lil_matrix(X)
    X_lil[1, 0] = 0
    X[1, 0] = 0

    assert_raises(TypeError, mean_variance_axis, X_lil, axis=1)

    X_csr = sp.csr_matrix(X_lil)
    X_csc = sp.csc_matrix(X_lil)

    expected_dtypes = [(np.float32, np.float32), (np.float64, np.float64),
                       (np.int32, np.float64), (np.int64, np.float64)]

    for input_dtype, output_dtype in expected_dtypes:
        X_test = X.astype(input_dtype)
        for X_sparse in (X_csr, X_csc):
            X_sparse = X_sparse.astype(input_dtype)
            X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
            assert X_means.dtype == output_dtype
            assert X_vars.dtype == output_dtype
            assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
            assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
Exemple #3
0
def test_linearsvc_parameters():
    # Test possible parameter combinations in LinearSVC
    # Generate list of possible parameter combinations
    losses = ['hinge', 'squared_hinge', 'logistic_regression', 'foo']
    penalties, duals = ['l1', 'l2', 'bar'], [True, False]

    X, y = make_classification(n_samples=5, n_features=5)

    for loss, penalty, dual in itertools.product(losses, penalties, duals):
        clf = svm.LinearSVC(penalty=penalty, loss=loss, dual=dual)
        if ((loss, penalty) == ('hinge', 'l1')
                or (loss, penalty, dual) == ('hinge', 'l2', False)
                or (penalty, dual) == ('l1', True) or loss == 'foo'
                or penalty == 'bar'):

            with pytest.raises(ValueError,
                               match="Unsupported set of "
                               "arguments.*penalty='%s.*loss='%s.*dual=%s" %
                               (penalty, loss, dual)):
                clf.fit(X, y)
        else:
            clf.fit(X, y)

    # Incorrect loss value - test if explicit error message is raised
    with pytest.raises(ValueError, match=".*loss='l3' is not supported.*"):
        svm.LinearSVC(loss="l3").fit(X, y)
Exemple #4
0
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)

    assert_raises(ValueError,
                  incr_mean_variance_axis,
                  X_csr,
                  axis=-3,
                  last_mean=None,
                  last_var=None,
                  last_n=None)
    assert_raises(ValueError,
                  incr_mean_variance_axis,
                  X_csr,
                  axis=2,
                  last_mean=None,
                  last_var=None,
                  last_n=None)
    assert_raises(ValueError,
                  incr_mean_variance_axis,
                  X_csr,
                  axis=-1,
                  last_mean=None,
                  last_var=None,
                  last_n=None)
Exemple #5
0
def test_partial_dependence_unknown_feature(estimator, features):
    X, y = make_classification(random_state=0)
    estimator.fit(X, y)

    err_msg = 'all features must be in'
    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, X, [features])
Exemple #6
0
def test_label_propagation_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes,
                               n_samples=200,
                               random_state=0)
    y[::3] = -1
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    unlabelled_idx = Y[:, (-1, )].nonzero()[0]
    labelled_idx = (Y[:, (-1, )] == 0).nonzero()[0]

    clf = label_propagation.LabelPropagation(max_iter=10000, gamma=0.1)
    clf.fit(X, y)
    # adopting notation from Zhu et al 2002
    T_bar = clf._build_graph()
    Tuu = T_bar[tuple(
        np.meshgrid(unlabelled_idx, unlabelled_idx, indexing='ij'))]
    Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx,
                                  indexing='ij'))]
    Y = Y[:, :-1]
    Y_l = Y[labelled_idx, :]
    Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l)

    expected = Y.copy()
    expected[unlabelled_idx, :] = Y_u
    expected /= expected.sum(axis=1)[:, np.newaxis]

    assert_array_almost_equal(expected, clf.label_distributions_, 4)
Exemple #7
0
def test_warm_start_validation():
    X, y = make_classification(n_samples=30, n_features=5, n_classes=4,
                               n_redundant=0, n_informative=5, random_state=0)

    nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
    nca.fit(X, y)

    X_less_features, y = make_classification(n_samples=30, n_features=4,
                                             n_classes=4, n_redundant=0,
                                             n_informative=4, random_state=0)
    assert_raise_message(ValueError,
                         'The new inputs dimensionality ({}) does not '
                         'match the input dimensionality of the '
                         'previously learned transformation ({}).'
                         .format(X_less_features.shape[1],
                                 nca.components_.shape[1]),
                         nca.fit, X_less_features, y)
Exemple #8
0
def test_ovr_single_label_decision_function():
    X, Y = datasets.make_classification(n_samples=100,
                                        n_features=20,
                                        random_state=0)
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(svm.SVC()).fit(X_train, Y_train)
    assert_array_equal(
        clf.decision_function(X_test).ravel() > 0, clf.predict(X_test))
Exemple #9
0
 def __init__(self,
              name,
              construct,
              skip_methods=(),
              fit_args=make_classification()):
     self.name = name
     self.construct = construct
     self.fit_args = fit_args
     self.skip_methods = skip_methods
Exemple #10
0
def test_crammer_singer_binary():
    # Test Crammer-Singer formulation in the binary case
    X, y = make_classification(n_classes=2, random_state=0)

    for fit_intercept in (True, False):
        acc = svm.LinearSVC(fit_intercept=fit_intercept,
                            multi_class="crammer_singer",
                            random_state=0).fit(X, y).score(X, y)
        assert acc > 0.9
Exemple #11
0
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 255

    X, y = make_classification(n_samples=n_samples,
                               n_classes=2,
                               n_features=5,
                               n_informative=5,
                               n_redundant=0,
                               random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_mrex = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_mrex, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_mrex.fit(X_train, y_train)

    # We need X to be treated an numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_mrex = est_mrex.predict(X_train)
    assert np.mean(pred_mrex == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_mrex = accuracy_score(y_train, pred_mrex)
    np.testing.assert_almost_equal(acc_lightgbm, acc_mrex)

    if max_leaf_nodes < 10 and n_samples >= 1000:

        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_mrex = est_mrex.predict(X_test)
        assert np.mean(pred_mrex == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_mrex = accuracy_score(y_test, pred_mrex)
        np.testing.assert_almost_equal(acc_lightgbm, acc_mrex, decimal=2)
Exemple #12
0
def test_valid_alpha():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes,
                               n_samples=200,
                               random_state=0)
    for alpha in [-0.1, 0, 1, 1.1, None]:
        assert_raises(ValueError,
                      lambda **kwargs: label_propagation.LabelSpreading(
                          **kwargs).fit(X, y),
                      alpha=alpha)
Exemple #13
0
def setup_module():
    # Create some memory mapped data
    global X_mm, y_mm, y_ml_mm, TEMP_FOLDER, ESTIMATORS
    TEMP_FOLDER = tempfile.mkdtemp(prefix='mrex_test_score_objects_')
    X, y = make_classification(n_samples=30, n_features=5, random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    filename = os.path.join(TEMP_FOLDER, 'test_data.pkl')
    joblib.dump((X, y, y_ml), filename)
    X_mm, y_mm, y_ml_mm = joblib.load(filename, mmap_mode='r')
    ESTIMATORS = _make_estimators(X_mm, y_mm, y_ml_mm)
Exemple #14
0
def test_calibration_prob_sum():
    # Test that sum of probabilities is 1. A non-regression test for
    # issue #7796
    num_classes = 2
    X, y = make_classification(n_samples=10, n_features=5,
                               n_classes=num_classes)
    clf = LinearSVC(C=1.0)
    clf_prob = CalibratedClassifierCV(clf, method="sigmoid", cv=LeaveOneOut())
    clf_prob.fit(X, y)

    probs = clf_prob.predict_proba(X)
    assert_array_almost_equal(probs.sum(axis=1), np.ones(probs.shape[0]))
Exemple #15
0
def test_calibration_nan_imputer():
    """Test that calibration can accept nan"""
    X, y = make_classification(n_samples=10, n_features=2,
                               n_informative=2, n_redundant=0,
                               random_state=42)
    X[0, 0] = np.nan
    clf = Pipeline(
        [('imputer', SimpleImputer()),
         ('rf', RandomForestClassifier(n_estimators=1))])
    clf_c = CalibratedClassifierCV(clf, cv=2, method='isotonic')
    clf_c.fit(X, y)
    clf_c.predict(X)
Exemple #16
0
def get_estimator_and_data():
    if args.problem == 'classification':
        X, y = make_classification(args.n_samples_max * 2,
                                   n_features=args.n_features,
                                   n_classes=args.n_classes,
                                   n_clusters_per_class=1,
                                   random_state=0)
        return X, y, HistGradientBoostingClassifier
    elif args.problem == 'regression':
        X, y = make_regression(args.n_samples_max * 2,
                               n_features=args.n_features, random_state=0)
        return X, y, HistGradientBoostingRegressor
Exemple #17
0
def test_coef_default_threshold():
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=Lasso(alpha=0.1,
                                  random_state=42))
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_almost_equal(X_new, X[:, mask])
Exemple #18
0
def generate_multilabel_dataset_with_correlations():
    # Generate a multilabel data set from a multiclass dataset as a way of
    # by representing the integer number of the original class using a binary
    # encoding.
    X, y = make_classification(n_samples=1000,
                               n_features=100,
                               n_classes=16,
                               n_informative=10,
                               random_state=0)

    Y_multi = np.array([[int(yyy) for yyy in format(yy, '#06b')[2:]]
                        for yy in y])
    return X, Y_multi
Exemple #19
0
def test_make_classification():
    weights = [0.1, 0.25]
    X, y = make_classification(n_samples=100,
                               n_features=20,
                               n_informative=5,
                               n_redundant=1,
                               n_repeated=1,
                               n_classes=3,
                               n_clusters_per_class=1,
                               hypercube=False,
                               shift=None,
                               scale=None,
                               weights=weights,
                               random_state=0)

    assert weights == [0.1, 0.25]
    assert X.shape == (100, 20), "X shape mismatch"
    assert y.shape == (100, ), "y shape mismatch"
    assert np.unique(y).shape == (3, ), "Unexpected number of classes"
    assert sum(y == 0) == 10, "Unexpected number of samples in class #0"
    assert sum(y == 1) == 25, "Unexpected number of samples in class #1"
    assert sum(y == 2) == 65, "Unexpected number of samples in class #2"

    # Test for n_features > 30
    X, y = make_classification(n_samples=2000,
                               n_features=31,
                               n_informative=31,
                               n_redundant=0,
                               n_repeated=0,
                               hypercube=True,
                               scale=0.5,
                               random_state=0)

    assert X.shape == (2000, 31), "X shape mismatch"
    assert y.shape == (2000, ), "y shape mismatch"
    assert (np.unique(
        X.view([('', X.dtype)] * X.shape[1])).view(X.dtype).reshape(
            -1,
            X.shape[1]).shape[0] == 2000), ("Unexpected number of unique rows")
Exemple #20
0
def test_weight():
    # Test class weights
    X_, y_ = make_classification(n_samples=200,
                                 n_features=100,
                                 weights=[0.833, 0.167],
                                 random_state=0)

    X_ = sparse.csr_matrix(X_)
    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC()):
        clf.set_params(class_weight={0: 5})
        clf.fit(X_[:180], y_[:180])
        y_pred = clf.predict(X_[180:])
        assert np.sum(y_pred == y_[180:]) >= 11
Exemple #21
0
def test_multiclass_multioutput(Estimator):
    # Make sure error is raised for multiclass-multioutput classifiers

    # make multiclass-multioutput dataset
    X, y = make_classification(n_classes=3,
                               n_clusters_per_class=1,
                               random_state=0)
    y = np.array([y, y]).T

    est = Estimator()
    est.fit(X, y)

    with pytest.raises(
            ValueError,
            match="Multiclass-multioutput estimators are not supported"):
        partial_dependence(est, X, [0])
Exemple #22
0
def test_scorer_sample_weight():
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    estimator = _make_estimators(X_train, y_train, y_ml_train)

    for name, scorer in SCORERS.items():
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        if name in REQUIRE_POSITIVE_Y_SCORERS:
            target = _require_positive_y(target)
        try:
            weighted = scorer(estimator[name],
                              X_test,
                              target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert weighted != unweighted, (
                "scorer {0} behaves identically when "
                "called with sample weights: {1} vs "
                "{2}".format(name, weighted, unweighted))
            assert_almost_equal(weighted,
                                ignored,
                                err_msg="scorer {0} behaves differently when "
                                "ignoring samples and setting sample_weight to"
                                " 0: {1} vs {2}".format(
                                    name, weighted, ignored))

        except TypeError as e:
            assert "sample_weight" in str(e), (
                "scorer {0} raises unhelpful exception when called "
                "with sample weights: {1}".format(name, str(e)))
Exemple #23
0
def test_max_features_tiebreak():
    # Test if max_features can break tie among feature importance
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features
Exemple #24
0
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert hasattr(transformer.estimator_, 'feature_importances_')

        X_new = transformer.transform(X)
        assert X_new.shape[1] < X.shape[1]
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])
Exemple #25
0
def test_sag_classifier_raises_error(solver):
    # Following #13316, the error handling behavior changed in cython sag. This
    # is simply a non-regression test to make sure numerical errors are
    # properly raised.

    # Train a classifier on a simple problem
    rng = np.random.RandomState(42)
    X, y = make_classification(random_state=rng)
    clf = LogisticRegression(solver=solver, random_state=rng, warm_start=True)
    clf.fit(X, y)

    # Trigger a numerical error by:
    # - corrupting the fitted coefficients of the classifier
    # - fit it again starting from its current state thanks to warm_start
    clf.coef_[:] = np.nan

    with pytest.raises(ValueError, match="Floating-point under-/overflow"):
        clf.fit(X, y)
Exemple #26
0
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert importances.shape[0] == 10
        assert (importances[:3, np.newaxis] >= importances[3:]).all()
Exemple #27
0
def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes,
                               n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)
Exemple #28
0
def test_weight():
    # Test class weights
    clf = svm.SVC(class_weight={1: 0.1})
    # we give a small weights to class 1
    clf.fit(X, Y)
    # so all predicted values belong to class 2
    assert_array_almost_equal(clf.predict(X), [2] * 6)

    X_, y_ = make_classification(n_samples=200,
                                 n_features=10,
                                 weights=[0.833, 0.167],
                                 random_state=2)

    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC()):
        clf.set_params(class_weight={0: .1, 1: 10})
        clf.fit(X_[:100], y_[:100])
        y_pred = clf.predict(X_[100:])
        assert f1_score(y_[100:], y_pred) > .3
Exemple #29
0
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])
Exemple #30
0
def test_sample_weight():
    # Ensure sample weights are passed to underlying estimator
    X, y = datasets.make_classification(
        n_samples=100, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = LogisticRegression(random_state=0, fit_intercept=False)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=None)
    mask = transformer._get_support_mask()
    transformer.fit(X, y, sample_weight=sample_weight)
    weighted_mask = transformer._get_support_mask()
    assert not np.all(weighted_mask == mask)
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    reweighted_mask = transformer._get_support_mask()
    assert np.all(weighted_mask == reweighted_mask)