def test_cross_val_predict_input_types():
    clf = Ridge()
    # Smoke test
    predictions = cval.cross_val_predict(clf, X, y)
    assert_equal(predictions.shape, (10,))

    # test with multioutput y
    with ignore_warnings(category=ConvergenceWarning):
        predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_equal(predictions.shape, (10, 2))

    predictions = cval.cross_val_predict(clf, X_sparse, y)
    assert_array_equal(predictions.shape, (10,))

    # test with multioutput y
    with ignore_warnings(category=ConvergenceWarning):
        predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_array_equal(predictions.shape, (10, 2))

    # test with X and y as list
    list_check = lambda x: isinstance(x, list)
    clf = CheckingClassifier(check_X=list_check)
    predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    predictions = cval.cross_val_predict(clf, X, y.tolist())

    # test with 3d X
    X_3d = X[:, :, np.newaxis]
    check_3d = lambda x: x.ndim == 3
    clf = CheckingClassifier(check_X=check_3d)
    predictions = cval.cross_val_predict(clf, X_3d, y)
    assert_array_equal(predictions.shape, (10,))
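# A minimal, self-contained sketch (not part of the original suite) of the
# property exercised above, using the modern sklearn.model_selection API
# rather than the older cval module: cross_val_predict returns one
# out-of-fold prediction per sample, so its first dimension matches X.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

X_demo = np.random.RandomState(0).rand(10, 2)
y_demo = X_demo.sum(axis=1)
assert cross_val_predict(Ridge(), X_demo, y_demo, cv=5).shape == (10,)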
def test_fit():
    # Checks whether `fit` method sets all attribute values correctly.
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
        n_estimators=n_estimators)
    ignore_warnings(lshf.fit)(X)

    # _input_array = X
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))
def test_enet_l1_ratio():
    # Test that an error message is raised if an estimator that
    # uses _alpha_grid is called with l1_ratio=0
    msg = ("Automatic alpha grid generation is not supported for l1_ratio=0. "
           "Please supply a grid by providing your estimator with the "
           "appropriate `alphas=` argument.")
    X = np.array([[1, 2, 4, 5, 8], [3, 5, 7, 7, 8]]).T
    y = np.array([12, 10, 11, 21, 5])

    assert_raise_message(ValueError, msg, ElasticNetCV(
        l1_ratio=0, random_state=42).fit, X, y)
    assert_raise_message(ValueError, msg, MultiTaskElasticNetCV(
        l1_ratio=0, random_state=42).fit, X, y[:, None])

    # Test that l1_ratio=0 is allowed if we supply a grid manually
    alphas = [0.1, 10]
    estkwds = {'alphas': alphas, 'random_state': 42}
    est_desired = ElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = ElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est_desired.fit(X, y)
        est.fit(X, y)
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)

    est_desired = MultiTaskElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = MultiTaskElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est.fit(X, y[:, None])
        est_desired.fit(X, y[:, None])
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)
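# A minimal sketch (hypothetical helper, assumes only numpy) of the
# elastic-net penalty behind the test above:
#     alpha * (l1_ratio * ||w||_1 + 0.5 * (1 - l1_ratio) * ||w||_2^2)
# With l1_ratio=0 the L1 term vanishes, so the "smallest alpha that zeroes
# all coefficients", which anchors the automatic alpha grid, is undefined.
import numpy as np

def enet_penalty(w, alpha, l1_ratio):
    w = np.asarray(w, dtype=float)
    return alpha * (l1_ratio * np.abs(w).sum()
                    + 0.5 * (1 - l1_ratio) * (w ** 2).sum())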
def check_warm_start_oob(name):
    # Test that the warm start computes oob score when asked.
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]
    # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning.
    clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False,
                          random_state=1, bootstrap=True, oob_score=True)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_2.fit(X, y)

    clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15)
    clf_2.fit(X, y)

    assert_true(hasattr(clf_2, 'oob_score_'))
    assert_equal(clf.oob_score_, clf_2.oob_score_)

    # Test that oob_score is computed even if we don't need to train
    # additional trees.
    clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_3.fit(X, y)
    assert_true(not(hasattr(clf_3, 'oob_score_')))

    clf_3.set_params(oob_score=True)
    ignore_warnings(clf_3.fit)(X, y)

    assert_equal(clf.oob_score_, clf_3.oob_score_)
def test_no_atoms():
    y_empty = np.zeros_like(y)
    Xy_empty = np.dot(X.T, y_empty)
    gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, 1)
    gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, 1)
    assert_equal(np.all(gamma_empty == 0), True)
    assert_equal(np.all(gamma_empty_gram == 0), True)
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
            n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Accuracies should be non-decreasing as `n_estimators` increases
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
def test_check_estimator_clones():
    # check that check_estimator doesn't modify the estimator it receives
    from sklearn.datasets import load_iris
    iris = load_iris()

    for Estimator in [GaussianMixture, LinearRegression,
                      RandomForestClassifier, NMF, SGDClassifier,
                      MiniBatchKMeans]:
        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
        set_checking_parameters(est)
        set_random_state(est)
        # without fitting
        old_hash = joblib.hash(est)
        check_estimator(est)
        assert_equal(old_hash, joblib.hash(est))

        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
        set_checking_parameters(est)
        set_random_state(est)
        # with fitting
        est.fit(iris.data + 10, iris.target)
        old_hash = joblib.hash(est)
        check_estimator(est)
        assert_equal(old_hash, joblib.hash(est))
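# How the hash comparison above detects mutation, as a standalone sketch:
# joblib.hash content-hashes the whole object (via pickling), so setting
# any new attribute changes it. `extra_` is a made-up attribute name.
import joblib
from sklearn.linear_model import LinearRegression

est_demo = LinearRegression()
hash_before = joblib.hash(est_demo)
est_demo.extra_ = 1  # hypothetical mutation
assert joblib.hash(est_demo) != hash_before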
def test_hash_functions():
    # Checks the randomness of the hash functions.
    # The variance and mean of each hash function (projection vector)
    # should differ from the variance and mean of the flattened array of
    # all hash functions. If the hash functions were not randomly built
    # (i.e. all seeded with the same value), the variances and means of
    # all functions would be equal.
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
        n_estimators=n_estimators,
        random_state=rng.randint(0, np.iinfo(np.int32).max))
    ignore_warnings(lshf.fit)(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))
def test_sparse_input():
    # note: Fixed random state in sp.rand is not supported in older scipy.
    #       The test should succeed regardless.
    X1 = sp.rand(50, 100)
    X2 = sp.rand(10, 100)
    forest_sparse = ignore_warnings(LSHForest, category=DeprecationWarning)(
        radius=1, random_state=0).fit(X1)
    forest_dense = ignore_warnings(LSHForest, category=DeprecationWarning)(
        radius=1, random_state=0).fit(X1.A)

    d_sparse, i_sparse = forest_sparse.kneighbors(X2, return_distance=True)
    d_dense, i_dense = forest_dense.kneighbors(X2.A, return_distance=True)

    assert_almost_equal(d_sparse, d_dense)
    assert_almost_equal(i_sparse, i_dense)

    d_sparse, i_sparse = forest_sparse.radius_neighbors(X2,
                                                        return_distance=True)
    d_dense, i_dense = forest_dense.radius_neighbors(X2.A,
                                                     return_distance=True)
    assert_equal(d_sparse.shape, d_dense.shape)
    for a, b in zip(d_sparse, d_dense):
        assert_almost_equal(a, b)
    for a, b in zip(i_sparse, i_dense):
        assert_almost_equal(a, b)
def test_enet_float_precision():
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests

    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
                             fit_intercept=fit_intercept,
                             normalize=normalize)
            for dtype in [np.float64, np.float32]:
                X = dtype(X)
                y = dtype(y)
                ignore_warnings(clf.fit)(X, y)

                coef[dtype] = clf.coef_
                intercept[dtype] = clf.intercept_

                assert_equal(clf.coef_.dtype, dtype)

            assert_array_almost_equal(coef[np.float32], coef[np.float64],
                                      decimal=4)
            assert_array_almost_equal(intercept[np.float32],
                                      intercept[np.float64],
                                      decimal=4)
def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter, precompute=True)
    ignore_warnings(clf.fit)(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)
def test_enet_toy_list_input():
    # Test ElasticNet for various values of alpha and l1_ratio with list X

    X = np.array([[-1], [0], [1]])
    X = sp.csc_matrix(X)
    Y = [-1, 0, 1]       # just a straight line
    T = np.array([[2], [3], [4]])  # test sample

    # this should be the same as unregularized least squares
    clf = ElasticNet(alpha=0, l1_ratio=1.0)
    # catch warning about alpha=0.
    # this is discouraged but should work.
    ignore_warnings(clf.fit)(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [1])
    assert_array_almost_equal(pred, [2, 3, 4])
    assert_almost_equal(clf.dual_gap_, 0)

    clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=1000)
    clf.fit(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.50819], decimal=3)
    assert_array_almost_equal(pred, [1.0163,  1.5245,  2.0327], decimal=3)
    assert_almost_equal(clf.dual_gap_, 0)

    clf = ElasticNet(alpha=0.5, l1_ratio=0.5)
    clf.fit(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.45454], 3)
    assert_array_almost_equal(pred, [0.9090,  1.3636,  1.8181], 3)
    assert_almost_equal(clf.dual_gap_, 0)
def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variable were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer("accuracy")
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when `radius` is set to the
    # mean distance from the query point to the other points in the
    # database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # Each query can return a different number of neighbors, so dists and
    # inds cannot be rectangular 2D arrays; hence the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
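# Why radius queries return object-dtype arrays, shown as a standalone
# sketch with the stable NearestNeighbors API (demo values made up):
# neighbor counts vary per query, so the result is an array of per-query
# arrays rather than a matrix.
import numpy as np
from sklearn.neighbors import NearestNeighbors

pts = np.array([[0.0], [0.1], [5.0]])
dist, ind = NearestNeighbors(radius=0.5).fit(pts).radius_neighbors(pts)
assert dist.dtype == object
assert len(ind[0]) != len(ind[2])  # 2 neighbors vs. 1 (itself)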
def test_kernel_ridge_singular_kernel():
    # alpha=0 causes a LinAlgError in computing the dual coefficients,
    # which causes a fallback to a lstsq solver. This is tested here.
    pred = Ridge(alpha=0, fit_intercept=False).fit(X, y).predict(X)
    kr = KernelRidge(kernel="linear", alpha=0)
    ignore_warnings(kr.fit)(X, y)
    pred2 = kr.predict(X)
    assert_array_almost_equal(pred, pred2)
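# Background sketch for the test above: with kernel="linear",
# KernelRidge's dual solution (K + alpha * I)^{-1} y with K = X X^T yields
# the same predictions as Ridge's primal solution, so the two models are
# directly comparable; at alpha=0 a rank-deficient K makes the linear
# system singular, which is what triggers the lstsq fallback being tested.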
def test_consistent_proba():
    a = svm.SVC(probability=True, max_iter=1, random_state=0)
    with ignore_warnings(category=ConvergenceWarning):
        proba_1 = a.fit(X, Y).predict_proba(X)
    a = svm.SVC(probability=True, max_iter=1, random_state=0)
    with ignore_warnings(category=ConvergenceWarning):
        proba_2 = a.fit(X, Y).predict_proba(X)
    assert_array_almost_equal(proba_1, proba_2)
def test_nans():
    # Assert that SelectKBest and SelectPercentile can handle NaNs.
    # First feature has zero variance to confuse f_classif (ANOVA) and
    # make it return a NaN.
    X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
    y = [1, 0, 1]

    for select in (SelectKBest(f_classif, 2), SelectPercentile(f_classif, percentile=67)):
        ignore_warnings(select.fit)(X, y)
        assert_array_equal(select.get_support(indices=True), np.array([1, 2]))
def _run_one_hot(X, X2, cat):
    # enc = assert_warns(
    #     DeprecationWarning,
    #     OneHotEncoder, categorical_features=cat)
    enc = OneHotEncoder(categorical_features=cat)
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        Xtr = enc.fit_transform(X)
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        X2tr = enc.fit(X).transform(X2)
    return Xtr, X2tr
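# Hypothetical usage of the helper above (data and mask made up for
# illustration): encode only the first column with the legacy
# categorical_features API, leaving the second column untouched.
# X_demo = np.array([[0, 10], [1, 20], [2, 30]])
# Xtr, X2tr = _run_one_hot(X_demo, X_demo, cat=[0])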
def test_path_parameters():
    X, y = make_sparse_data()
    max_iter = 50
    n_alphas = 10
    clf = ElasticNetCV(n_alphas=n_alphas, eps=1e-3, max_iter=max_iter, l1_ratio=0.5, fit_intercept=False)
    ignore_warnings(clf.fit)(X, y)  # new params
    assert_almost_equal(0.5, clf.l1_ratio)
    assert_equal(n_alphas, clf.n_alphas)
    assert_equal(n_alphas, len(clf.alphas_))
    sparse_mse_path = clf.mse_path_
    ignore_warnings(clf.fit)(X.toarray(), y)  # compare with dense data
    assert_almost_equal(clf.mse_path_, sparse_mse_path)
def test_enet_float_precision():
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests

    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            for dtype in [np.float64, np.float32]:
                clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize)

                X = dtype(X)
                y = dtype(y)
                ignore_warnings(clf.fit)(X, y)

                coef[('simple', dtype)] = clf.coef_
                intercept[('simple', dtype)] = clf.intercept_

                assert_equal(clf.coef_.dtype, dtype)

                # test precompute Gram array
                Gram = X.T.dot(X)
                clf_precompute = ElasticNet(alpha=0.5, max_iter=100,
                                            precompute=Gram,
                                            fit_intercept=fit_intercept,
                                            normalize=normalize)
                ignore_warnings(clf_precompute.fit)(X, y)
                assert_array_almost_equal(clf.coef_, clf_precompute.coef_)
                assert_array_almost_equal(clf.intercept_,
                                          clf_precompute.intercept_)

                # test multi task enet
                multi_y = np.hstack((y[:, np.newaxis], y[:, np.newaxis]))
                clf_multioutput = MultiTaskElasticNet(
                    alpha=0.5, max_iter=100, fit_intercept=fit_intercept,
                    normalize=normalize)
                clf_multioutput.fit(X, multi_y)
                coef[('multi', dtype)] = clf_multioutput.coef_
                intercept[('multi', dtype)] = clf_multioutput.intercept_
                assert_equal(clf.coef_.dtype, dtype)

            for v in ['simple', 'multi']:
                assert_array_almost_equal(coef[(v, np.float32)],
                                          coef[(v, np.float64)],
                                          decimal=4)
                assert_array_almost_equal(intercept[(v, np.float32)],
                                          intercept[(v, np.float64)],
                                          decimal=4)
def check_class_weight_balanced_and_bootstrap_multi_output(name):
    # Test that class_weight works for multi-output
    ForestClassifier = FOREST_CLASSIFIERS[name]
    _y = np.vstack((y, np.array(y) * 2)).T
    clf = ForestClassifier(class_weight="balanced", random_state=0)
    clf.fit(X, _y)
    clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.0}, {-2: 1.0, 2: 1.0}], random_state=0)
    clf.fit(X, _y)
    # smoke test for subsample and balanced subsample
    clf = ForestClassifier(class_weight="balanced_subsample", random_state=0)
    clf.fit(X, _y)
    clf = ForestClassifier(class_weight="subsample", random_state=0)
    ignore_warnings(clf.fit)(X, _y)
def test_selectpercentile_tiebreaking():
    # Test if SelectPercentile selects the right n_features in case of ties.
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]
    dummy_score = lambda X, y: (X[0], X[0])
    for X in Xs:
        sel = SelectPercentile(dummy_score, percentile=34)
        X1 = ignore_warnings(sel.fit_transform)([X], y)
        assert_equal(X1.shape[1], 1)
        assert_best_scores_kept(sel)

        sel = SelectPercentile(dummy_score, percentile=67)
        X2 = ignore_warnings(sel.fit_transform)([X], y)
        assert_equal(X2.shape[1], 2)
        assert_best_scores_kept(sel)
def test_kneighbors():
    # Checks whether desired number of neighbors are returned.
    # It is guaranteed to return the requested number of neighbors
    # if `min_hash_match` is set to 0. Returned distances should be
    # in ascending order.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
        min_hash_match=0)
    # Test unfitted estimator
    assert_raises(ValueError, lshf.kneighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors,
                                    return_distance=False)
        # Desired number of neighbors should be returned.
        assert_equal(neighbors.shape[1], n_neighbors)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.kneighbors(queries,
                                           n_neighbors=1,
                                           return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Test only neighbors
    neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)
    # Test a random point (not in the data set)
    query = rng.randn(n_features).reshape(1, -1)
    lshf.kneighbors(query, n_neighbors=1,
                    return_distance=False)
    # Test n_neighbors at initialization
    neighbors = lshf.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)
    # Test `neighbors` has an integer dtype
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")
    def test_train(self, params="wmc"):
        g = mixture.GMM(n_components=self.n_components, covariance_type=self.covariance_type)
        with ignore_warnings(category=DeprecationWarning):
            g.weights_ = self.weights
            g.means_ = self.means
            g.covars_ = 20 * self.covars[self.covariance_type]

        # Create a training set by sampling from the predefined distribution.
        with ignore_warnings(category=DeprecationWarning):
            X = g.sample(n_samples=100)
            g = self.model(
                n_components=self.n_components,
                covariance_type=self.covariance_type,
                random_state=rng,
                min_covar=1e-1,
                n_iter=1,
                init_params=params,
            )
            g.fit(X)

        # Do one training iteration at a time so we can keep track of
        # the log likelihood to make sure that it increases after each
        # iteration.
        trainll = []
        with ignore_warnings(category=DeprecationWarning):
            for _ in range(5):
                g.params = params
                g.init_params = ""
                g.fit(X)
                trainll.append(self.score(g, X))
            g.n_iter = 10
            g.init_params = ""
            g.params = params
            g.fit(X)  # finish fitting

        # Note that the log likelihood will sometimes decrease by a
        # very small amount after it has more or less converged due to
        # the addition of min_covar to the covariance (to prevent
        # underflow).  This is why the threshold is set to -0.5
        # instead of 0.
        with ignore_warnings(category=DeprecationWarning):
            delta_min = np.diff(trainll).min()
        self.assertTrue(
            delta_min > self.threshold,
            "The min nll increase is %f which is lower than the admissible"
            " threshold of %f, for model %s. The likelihoods are %s."
            % (delta_min, self.threshold, self.covariance_type, trainll),
        )
def test_multilabel_hamming_loss():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    assert_equal(hamming_loss(y1, y2), 1 / 6)
    assert_equal(hamming_loss(y1, y1), 0)
    assert_equal(hamming_loss(y2, y2), 0)
    assert_equal(hamming_loss(y2, np.logical_not(y2)), 1)
    assert_equal(hamming_loss(y1, np.logical_not(y1)), 1)
    assert_equal(hamming_loss(y1, np.zeros(y1.shape)), 4 / 6)
    assert_equal(hamming_loss(y2, np.zeros(y1.shape)), 0.5)

    with ignore_warnings():  # sequence of sequences is deprecated
        # List of tuple of label
        y1 = [(1, 2,), (0, 2,)]
        y2 = [(2,), (0, 2,)]

        assert_equal(hamming_loss(y1, y2), 1 / 6)
        assert_equal(hamming_loss(y1, y1), 0)
        assert_equal(hamming_loss(y2, y2), 0)
        assert_equal(hamming_loss(y2, [(), ()]), 0.75)
        assert_equal(hamming_loss(y1, [tuple(), (10, )]), 0.625)
        assert_almost_equal(hamming_loss(y2, [tuple(), (10, )],
                                         classes=np.arange(11)), 0.1818, 2)
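# Worked check for the dense case above (standalone sketch): y1 and y2
# disagree in exactly 1 of the 2 * 3 = 6 label slots, and the Hamming loss
# is the mean elementwise disagreement.
import numpy as np
assert np.mean(np.array([[0, 1, 1], [1, 0, 1]])
               != np.array([[0, 0, 1], [1, 0, 1]])) == 1 / 6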
def test_non_meta_estimators(name, Estimator, check):
    # Common tests for non-meta estimators
    with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning,
                                   UserWarning, FutureWarning)):
        estimator = Estimator()
        set_checking_parameters(estimator)
        check(name, estimator)
def test_no_attributes_set_in_init(name, Estimator):
    # input validation etc for non-meta estimators
    with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning,
                                   UserWarning, FutureWarning)):
        estimator = Estimator()
        # check this on class
        check_no_attributes_set_in_init(name, estimator)
def test_1d_input(name):
    X = iris.data[:, 0]
    X_2d = iris.data[:, 0].reshape((-1, 1))
    y = iris.target

    with ignore_warnings():
        check_1d_input(name, X, X_2d, y)
def test_warm_start(solver, warm_start, fit_intercept, multi_class):
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.
    # Warm starting does not work with liblinear solver.
    X, y = iris.data, iris.target

    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                             warm_start=warm_start,
                             solver=solver,
                             random_state=42, max_iter=100,
                             fit_intercept=fit_intercept)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        coef_1 = clf.coef_

        clf.max_iter = 1
        clf.fit(X, y)
    cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
    msg = ("Warm starting issue with %s solver in %s mode "
           "with fit_intercept=%s and warm_start=%s"
           % (solver, multi_class, str(fit_intercept),
              str(warm_start)))
    if warm_start:
        assert_greater(2.0, cum_diff, msg)
    else:
        assert_greater(cum_diff, 2.0, msg)
def test_multilabel_accuracy_score_subset_accuracy():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    assert_equal(accuracy_score(y1, y2), 0.5)
    assert_equal(accuracy_score(y1, y1), 1)
    assert_equal(accuracy_score(y2, y2), 1)
    assert_equal(accuracy_score(y2, np.logical_not(y2)), 0)
    assert_equal(accuracy_score(y1, np.logical_not(y1)), 0)
    assert_equal(accuracy_score(y1, np.zeros(y1.shape)), 0)
    assert_equal(accuracy_score(y2, np.zeros(y1.shape)), 0)

    with ignore_warnings():  # sequence of sequences is deprecated
        # List of tuple of label
        y1 = [(1, 2,), (0, 2,)]
        y2 = [(2,), (0, 2,)]

        assert_equal(accuracy_score(y1, y2), 0.5)
        assert_equal(accuracy_score(y1, y1), 1)
        assert_equal(accuracy_score(y2, y2), 1)
        assert_equal(accuracy_score(y2, [(), ()]), 0)
        assert_equal(accuracy_score(y1, y2, normalize=False), 1)
        assert_equal(accuracy_score(y1, y1, normalize=False), 2)
        assert_equal(accuracy_score(y2, y2, normalize=False), 2)
        assert_equal(accuracy_score(y2, [(), ()], normalize=False), 0)
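# Worked check for the dense case above (standalone sketch): subset
# accuracy counts a sample as correct only if its whole label row matches,
# so with one of the two rows matching the score is 0.5.
import numpy as np
y1_demo = np.array([[0, 1, 1], [1, 0, 1]])
y2_demo = np.array([[0, 0, 1], [1, 0, 1]])
assert np.all(y1_demo == y2_demo, axis=1).mean() == 0.5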
def test_warm_start():
    X, y, _, _ = build_dataset()
    clf = ElasticNet(alpha=0.1, max_iter=5, warm_start=True)
    ignore_warnings(clf.fit)(X, y)
    ignore_warnings(clf.fit)(X, y)  # do a second round with 5 iterations

    clf2 = ElasticNet(alpha=0.1, max_iter=10)
    ignore_warnings(clf2.fit)(X, y)
    assert_array_almost_equal(clf2.coef_, clf.coef_)
def test_lasso_lars_vs_lasso_cd_ill_conditioned():
    # Test lasso lars on a very ill-conditioned design, and check that
    # it does not blow up, and stays somewhat close to a solution given
    # by the coordinate descent solver
    # Also test that lasso_path (using lars_path output style) gives
    # the same result as lars_path and previous lasso output style
    # under these conditions.
    rng = np.random.RandomState(42)

    # Generate data
    n, m = 70, 100
    k = 5
    X = rng.randn(n, m)
    w = np.zeros((m, 1))
    i = np.arange(0, m)
    rng.shuffle(i)
    supp = i[:k]
    w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1)
    y = np.dot(X, w)
    sigma = 0.2
    y += sigma * rng.rand(*y.shape)
    y = y.squeeze()

    f = assert_warns_message

    def in_warn_message(msg):
        return 'Early stopping' in msg or 'Dropping a regressor' in msg
    lars_alphas, _, lars_coef = f(ConvergenceWarning,
                                  in_warn_message,
                                  linear_model.lars_path, X, y, method='lasso')

    with ignore_warnings():
        _, lasso_coef2, _ = linear_model.lasso_path(X, y,
                                                    alphas=lars_alphas,
                                                    tol=1e-6,
                                                    fit_intercept=False)

        lasso_coef = np.zeros((w.shape[0], len(lars_alphas)))
        iter_models = enumerate(linear_model.lasso_path(X, y,
                                                        alphas=lars_alphas,
                                                        tol=1e-6,
                                                        return_models=True,
                                                        fit_intercept=False))
        for i, model in iter_models:
            lasso_coef[:, i] = model.coef_

    np.testing.assert_array_almost_equal(lars_coef, lasso_coef, decimal=1)
    np.testing.assert_array_almost_equal(lars_coef, lasso_coef2, decimal=1)
    np.testing.assert_array_almost_equal(lasso_coef, lasso_coef2, decimal=1)
def test_classification_invariance_string_vs_numbers_labels(name):
    # Ensure that classification metrics with string labels are invariant
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20, ))
    y2 = random_state.randint(0, 2, size=(20, ))

    y1_str = np.array(["eggs", "spam"])[y1]
    y2_str = np.array(["eggs", "spam"])[y2]

    pos_label_str = "spam"
    labels_str = ["eggs", "spam"]

    with ignore_warnings():
        metric = CLASSIFICATION_METRICS[name]
        measure_with_number = metric(y1, y2)

        # Ugly, but handle case with a pos_label and label
        metric_str = metric
        if name in METRICS_WITH_POS_LABEL:
            metric_str = partial(metric_str, pos_label=pos_label_str)

        measure_with_str = metric_str(y1_str, y2_str)

        assert_array_equal(measure_with_number,
                           measure_with_str,
                           err_msg="{0} failed string vs number invariance "
                           "test".format(name))

        measure_with_strobj = metric_str(y1_str.astype('O'),
                                         y2_str.astype('O'))
        assert_array_equal(measure_with_number,
                           measure_with_strobj,
                           err_msg="{0} failed string object vs number "
                           "invariance test".format(name))

        if name in METRICS_WITH_LABELS:
            metric_str = partial(metric_str, labels=labels_str)
            measure_with_str = metric_str(y1_str, y2_str)
            assert_array_equal(measure_with_number,
                               measure_with_str,
                               err_msg="{0} failed string vs number  "
                               "invariance test".format(name))

            measure_with_strobj = metric_str(y1_str.astype('O'),
                                             y2_str.astype('O'))
            assert_array_equal(measure_with_number,
                               measure_with_strobj,
                               err_msg="{0} failed string vs number  "
                               "invariance test".format(name))
def test_get_params_invariance():
    # Test for estimators that support get_params, that
    # get_params(deep=False) is a subset of get_params(deep=True)
    # Related to issue #4465

    estimators = all_estimators(include_meta_estimators=False,
                                include_other=True)
    for name, Estimator in estimators:
        if hasattr(Estimator, 'get_params'):
            # If class is deprecated, ignore deprecated warnings
            if hasattr(Estimator.__init__, "deprecated_original"):
                with ignore_warnings():
                    yield check_get_params_invariance, name, Estimator
            else:
                yield check_get_params_invariance, name, Estimator
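# What the yielded check verifies, as a standalone sketch: every parameter
# reported with deep=False must also appear with deep=True.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

pipe = Pipeline([('model', Ridge())])
assert set(pipe.get_params(deep=False)) <= set(pipe.get_params(deep=True))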
def test_roc_curve():
    """Test Area under Receiver Operating Characteristic (ROC) curve"""
    y_true, _, probas_pred = make_prediction(binary=True)

    fpr, tpr, thresholds = roc_curve(y_true, probas_pred)
    roc_auc = auc(fpr, tpr)
    expected_auc = _auc(y_true, probas_pred)
    assert_array_almost_equal(roc_auc, expected_auc, decimal=2)
    assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred))

    assert_almost_equal(roc_auc,
                        ignore_warnings(auc_score)(y_true, probas_pred))

    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)
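# The auc helper used above is numerical integration of the ROC curve;
# a standalone sketch of its equivalence with the trapezoidal rule
# (demo points made up):
import numpy as np
from sklearn.metrics import auc
fpr_demo = np.array([0.0, 0.5, 1.0])
tpr_demo = np.array([0.0, 0.8, 1.0])
assert np.isclose(auc(fpr_demo, tpr_demo), np.trapz(tpr_demo, fpr_demo))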
def test_verbose_sgd():
    # Test verbose.
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(algorithm='sgd', max_iter=2, verbose=10,
                        hidden_layer_sizes=2)
    old_stdout = sys.stdout
    sys.stdout = output = StringIO()

    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
    clf.partial_fit(X, y)

    sys.stdout = old_stdout
    assert 'Iteration' in output.getvalue()
def test_one_hot_encoder_dense():
    # check for sparse=False
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder(sparse=False)
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        # discover max values automatically
        X_trans = enc.fit_transform(X)
        assert_equal(X_trans.shape, (2, 5))
        assert_array_equal(enc.active_features_,
                           np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
        assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       np.array([[0., 1., 0., 1., 1.], [1., 0., 1., 0., 1.]]))
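# How the expected arrays above come about (sketch of the legacy layout):
# with n_values='auto' the encoder reserves max+1 slots per column, i.e.
# 4 + 3 + 2 = 9 slots, giving feature_indices_ [0, 4, 7, 9]; only the 5
# values actually observed ({0, 3}, {1, 2} and {1}) stay active, which is
# the [1, 0, 0, 1, 0, 1, 1, 0, 1] mask and the (2, 5) dense output.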
def test_non_meta_estimators():
    # input validation etc for non-meta estimators
    estimators = all_estimators()
    for name, Estimator in estimators:
        if issubclass(Estimator, BiclusterMixin):
            continue
        if name.startswith("_"):
            continue
        for check in _yield_all_checks(name, Estimator):
            if issubclass(Estimator, ProjectedGradientNMF):
                # The ProjectedGradientNMF class is deprecated
                with ignore_warnings():
                    yield check, name, Estimator
            else:
                yield check, name, Estimator
def test_get_params_invariance():
    # Test for estimators that support get_params, that
    # get_params(deep=False) is a subset of get_params(deep=True)
    # Related to issue #4465

    estimators = all_estimators(include_meta_estimators=False,
                                include_other=True)
    for name, Estimator in estimators:
        if hasattr(Estimator, 'get_params'):
            # The ProjectedGradientNMF class is deprecated
            if issubclass(Estimator, ProjectedGradientNMF):
                with ignore_warnings():
                    yield check_get_params_invariance, name, Estimator
            else:
                yield check_get_params_invariance, name, Estimator
def test_ovr_exceptions():
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, ovr.predict, [])

    with ignore_warnings():
        assert_raises(ValueError, predict_ovr, [LinearSVC(), MultinomialNB()],
                      LabelBinarizer(), [])

    # Fail on multioutput data
    assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit,
                  np.array([[1, 0], [0, 1]]),
                  np.array([[1, 2], [3, 1]]))
    assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit,
                  np.array([[1, 0], [0, 1]]),
                  np.array([[1.5, 2.4], [3.1, 0.8]]))
def test_neighbors_accuracy_with_n_candidates():
    # Checks whether accuracy increases as `n_candidates` increases.
    n_candidates_values = np.array([.1, 50, 500])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_candidates_values.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, n_candidates in enumerate(n_candidates_values):
        lshf = ignore_warnings(
            LSHForest, category=DeprecationWarning)(n_candidates=n_candidates)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)

            neighbors = lshf.kneighbors(query,
                                        n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Accuracies should be non-decreasing as `n_candidates` increases
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")
def test_iforest():
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({
        "n_estimators": [3],
        "max_samples": [0.5, 1.0, 3],
        "bootstrap": [True, False]
    })

    with ignore_warnings():
        for params in grid:
            IsolationForest(random_state=rng,
                            **params).fit(X_train).predict(X_test)
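# ParameterGrid expands the dict above into the cartesian product of all
# settings; a standalone sketch of the count (1 * 3 * 2 = 6 combinations):
from sklearn.model_selection import ParameterGrid
demo_grid = ParameterGrid({"n_estimators": [3],
                           "max_samples": [0.5, 1.0, 3],
                           "bootstrap": [True, False]})
assert len(list(demo_grid)) == 6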
def test_candidates():
    # Checks whether candidates are sufficient.
    # This should handle the cases when number of candidates is 0.
    # User should be warned when number of candidates is less than
    # requested number of neighbors.
    X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1],
                        [6, 10, 2]], dtype=np.float32)
    X_test = np.array([7, 10, 3], dtype=np.float32).reshape(1, -1)

    # For zero candidates
    lshf = LSHForest(min_hash_match=32)
    ignore_warnings(lshf.fit)(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (3, 32))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=3)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3)
    assert_equal(distances.shape[1], 3)

    # For candidates less than n_neighbors
    lshf = LSHForest(min_hash_match=31)
    ignore_warnings(lshf.fit)(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (5, 31))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=5)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5)
    assert_equal(distances.shape[1], 5)
def test_1d_1component():
    # Test all of the covariance_types return the same BIC score for
    # 1-dimensional, 1 component fits.
    n_samples, n_dim, n_components = 100, 1, 1
    X = rng.randn(n_samples, n_dim)
    g_full = mixture.GMM(n_components=n_components, covariance_type='full',
                         random_state=rng, min_covar=1e-7, n_iter=1)
    with ignore_warnings(category=DeprecationWarning):
        g_full.fit(X)
        g_full_bic = g_full.bic(X)
        for cv_type in ['tied', 'diag', 'spherical']:
            g = mixture.GMM(n_components=n_components, covariance_type=cv_type,
                            random_state=rng, min_covar=1e-7, n_iter=1)
            g.fit(X)
            assert_array_almost_equal(g.bic(X), g_full_bic)
def test_sparse_matrices():
    # Test that sparse and dense input matrices output the same results.
    X = X_digits_binary[:50]
    y = y_digits_binary[:50]
    X_sparse = csr_matrix(X)
    mlp = MLPClassifier(random_state=1, hidden_layer_sizes=15)
    with ignore_warnings(category=ConvergenceWarning):
        mlp.fit(X, y)
        pred1 = mlp.decision_function(X)
        mlp.fit(X_sparse, y)
        pred2 = mlp.decision_function(X_sparse)
    assert_almost_equal(pred1, pred2)
    pred1 = mlp.predict(X)
    pred2 = mlp.predict(X_sparse)
    assert_array_equal(pred1, pred2)
def test_warm_start_multitask_lasso():
    X, y, X_test, y_test = build_dataset()
    Y = np.c_[y, y]
    clf = MultiTaskLasso(alpha=0.1, max_iter=5, warm_start=True)
    ignore_warnings(clf.fit)(X, Y)
    ignore_warnings(clf.fit)(X, Y)  # do a second round with 5 iterations

    clf2 = MultiTaskLasso(alpha=0.1, max_iter=10)
    ignore_warnings(clf2.fit)(X, Y)
    assert_array_almost_equal(clf2.coef_, clf.coef_)
def test_neighbors_badargs():
    """Test bad argument values: these should all raise ValueErrors"""
    assert_raises(ValueError,
                  neighbors.NearestNeighbors,
                  algorithm='blah')

    X = rng.random_sample((10, 2))
    Xsparse = csr_matrix(X)
    y = np.ones(10)

    for cls in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        assert_raises(ValueError,
                      cls,
                      weights='blah')
        assert_raises(ValueError,
                      cls, p=-1)
        assert_raises(ValueError,
                      cls, algorithm='blah')
        nbrs = cls(algorithm='ball_tree', metric='haversine')
        assert_raises(ValueError,
                      nbrs.predict,
                      X)
        assert_raises(ValueError,
                      ignore_warnings(nbrs.fit),
                      Xsparse, y)
        nbrs = cls()
        assert_raises(ValueError,
                      nbrs.fit,
                      np.ones((0, 2)), np.ones(0))
        assert_raises(ValueError,
                      nbrs.fit,
                      X[:, :, None], y)
        nbrs.fit(X, y)
        assert_raises(ValueError,
                      nbrs.predict,
                      [])

    nbrs = neighbors.NearestNeighbors().fit(X)

    assert_raises(ValueError,
                  nbrs.kneighbors_graph,
                  X, mode='blah')
    assert_raises(ValueError,
                  nbrs.radius_neighbors_graph,
                  X, mode='blah')
def do_generate_rf_optimazed_model(X_train, y_train, parameters):
    file_operations.write_logs(
        FILENAME, 'Starting RF Grid Search with parameters:' + str(parameters))
    model = RandomForestClassifier(random_state=my_constants.RANDOM_VALUE,
                                   oob_score=True)
    model_grid = GridSearchCV(model,
                              param_grid=parameters,
                              cv=3,
                              verbose=3,
                              n_jobs=3)
    with ignore_warnings(category=ConvergenceWarning):
        model_grid.fit(X_train, y_train)

    file_operations.write_logs(FILENAME, 'RF Grid search completed')

    return model_grid
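# Usage note (sketch): after the fit above, the caller can read the winning
# configuration from model_grid.best_params_, the refitted model from
# model_grid.best_estimator_, and its mean cross-validated score from
# model_grid.best_score_.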
def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter, precompute=True)
    ignore_warnings(clf.fit)(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7],
                                cv=3, max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)
    assert_equal(clf.coef_.shape, (3, 10))

    # Mono-output should have same cross-validated alpha_ and l1_ratio_
    # in both cases.
    X, y, _, _ = build_dataset(n_features=10)
    clf1 = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1.fit(X, y)
    clf2 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf2.fit(X, y[:, np.newaxis])
    assert_almost_equal(clf1.l1_ratio_, clf2.l1_ratio_)
    assert_almost_equal(clf1.alpha_, clf2.alpha_)
    def test_train_degenerate(self, params='wmc'):
        # Train on degenerate data with 0 in some dimensions
        # Create a training set by sampling from the predefined
        # distribution.
        X = rng.randn(100, self.n_features)
        X.T[1:] = 0
        g = self.model(n_components=2,
                       covariance_type=self.covariance_type,
                       random_state=rng,
                       min_covar=1e-3,
                       n_iter=5,
                       init_params=params)
        with ignore_warnings(category=DeprecationWarning):
            g.fit(X)
            trainll = g.score(X)
        self.assertTrue(np.sum(np.abs(trainll / 100 / X.shape[1])) < 5)
def test_multilabel_jaccard_similarity_score():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    # size(y1 \inter y2) = [1, 2]
    # size(y1 \union y2) = [2, 2]

    assert_equal(jaccard_similarity_score(y1, y2), 0.75)
    assert_equal(jaccard_similarity_score(y1, y1), 1)
    assert_equal(jaccard_similarity_score(y2, y2), 1)
    assert_equal(jaccard_similarity_score(y2, np.logical_not(y2)), 0)
    assert_equal(jaccard_similarity_score(y1, np.logical_not(y1)), 0)
    assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0)
    assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0)

    with ignore_warnings():  # sequence of sequences is deprecated
        # List of tuple of label
        y1 = [(1, 2), (0, 2)]
        y2 = [(2,), (0, 2)]

        assert_equal(jaccard_similarity_score(y1, y2), 0.75)
        assert_equal(jaccard_similarity_score(y1, y1), 1)
        assert_equal(jaccard_similarity_score(y2, y2), 1)
        assert_equal(jaccard_similarity_score(y2, [(), ()]), 0)

        # |y3 inter y4| = [0, 0, 0]
        # |y3 union y4| = [2, 2, 3]
        y3 = [(0, ), (1, ), (3, )]
        y4 = [(4, ), (4, ), (5, 6)]
        assert_almost_equal(jaccard_similarity_score(y3, y4), 0)

        # |y5 inter y6 | = [0, 1, 1]
        # |y5 union y6 | = [2, 1, 3]
        y5 = [(0, ), (1, ), (2, 3)]
        y6 = [(1, ), (1, ), (2, 0)]

        assert_almost_equal(jaccard_similarity_score(y5, y6), (1 + 1 / 3) / 3)
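# Worked check for the dense case above (standalone sketch): per-sample
# Jaccard is |intersection| / |union| of the positive labels, [1/2, 2/2]
# here, and the score is their mean.
import numpy as np
a = np.array([[0, 1, 1], [1, 0, 1]])
b = np.array([[0, 0, 1], [1, 0, 1]])
inter = np.logical_and(a, b).sum(axis=1)
union = np.logical_or(a, b).sum(axis=1)
assert np.mean(inter / union) == 0.75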
def test_multiclass_multioutput(Estimator):
    # Make sure error is raised for multiclass-multioutput classifiers

    # make multiclass-multioutput dataset
    X, y = make_classification(n_classes=3,
                               n_clusters_per_class=1,
                               random_state=0)
    y = np.array([y, y]).T

    est = Estimator()
    with ignore_warnings(category=FutureWarning):
        est.fit(X, y)

    with pytest.raises(
            ValueError,
            match="Multiclass-multioutput estimators are not supported"):
        partial_dependence(est, X, [0])
    def train(self, contexts, responses):
        """Fit the tf-idf transform and compute idf statistics."""
        with ignore_warnings():
            # Ignore deprecated `non_negative` warning.
            self._vectorizer = HashingVectorizer(non_negative=True)
        self._tfidf_transform = TfidfTransformer()
        count_matrix = self._tfidf_transform.fit_transform(
            self._vectorizer.transform(contexts + responses))
        n_samples, n_features = count_matrix.shape
        df = _document_frequency(count_matrix)
        idf = np.log((n_samples - df + 0.5) / (df + 0.5))
        self._idf_diag = sp.spdiags(
            idf, diags=0, m=n_features, n=n_features
        )
        document_lengths = count_matrix.sum(axis=1)
        self._average_document_length = np.mean(document_lengths)
        print(self._average_document_length)
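        # Note (sketch, not sklearn's own idf): log((n - df + 0.5) /
        # (df + 0.5)) is the BM25-style inverse document frequency; unlike
        # TfidfTransformer's smoothed idf it can go negative for terms
        # that appear in more than half of the documents.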
    def test_train_1d(self, params='wmc'):
        # Train on 1-D data
        # Create a training set by sampling from the predefined
        # distribution.
        X = rng.randn(100, 1)
        # X.T[1:] = 0
        g = self.model(n_components=2,
                       covariance_type=self.covariance_type,
                       random_state=rng, min_covar=1e-7, n_iter=5,
                       init_params=params)
        with ignore_warnings(category=DeprecationWarning):
            g.fit(X)
            trainll = g.score(X)
            if isinstance(g, mixture.DPGMM):
                self.assertTrue(np.sum(np.abs(trainll / 100)) < 5)
            else:
                self.assertTrue(np.sum(np.abs(trainll / 100)) < 2)
def test_logistic_regression_sample_weights():
    X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
                               n_classes=2, random_state=0)
    sample_weight = y + 1

    for LR in [LogisticRegression, LogisticRegressionCV]:
        # Test that liblinear fails when sample weights are provided
        clf_lib = LR(solver='liblinear')
        assert_raises(ValueError, clf_lib.fit, X, y,
                      sample_weight=np.ones(y.shape[0]))

        # Test that passing sample_weight as ones is the same as
        # not passing them at all (default None)
        clf_sw_none = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_none.fit(X, y)
        clf_sw_ones = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
        assert_array_almost_equal(clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)

        # Test that sample weights work the same with the lbfgs,
        # newton-cg, and 'sag' solvers
        clf_sw_lbfgs = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight)
        clf_sw_n = LR(solver='newton-cg', fit_intercept=False)
        clf_sw_n.fit(X, y, sample_weight=sample_weight)
        clf_sw_sag = LR(solver='sag', fit_intercept=False, tol=1e-10)
        # ignore convergence warning due to small dataset
        with ignore_warnings():
            clf_sw_sag.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)

        # Test that passing class_weight as [1,2] is the same as
        # passing class weight = [1,1] but adjusting sample weights
        # to be 2 for all instances of class 2
        clf_cw_12 = LR(solver='lbfgs', fit_intercept=False,
                       class_weight={0: 1, 1: 2})
        clf_cw_12.fit(X, y)
        sample_weight = np.ones(y.shape[0])
        sample_weight[y == 1] = 2
        clf_sw_12 = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_12.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)
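The equivalence tested above generalizes: scikit-learn multiplies class_weight into sample_weight, so class_weight={0: 1, 1: 2} is the same as doubling the weight of every class-1 row. A minimal hedged check on toy data (the names below are illustrative):

import numpy as np
from sklearn.linear_model import LogisticRegression

X_t = np.array([[0.], [1.], [2.], [3.]])
y_t = np.array([0, 0, 1, 1])
a = LogisticRegression(solver='lbfgs', class_weight={0: 1, 1: 2}).fit(X_t, y_t)
b = LogisticRegression(solver='lbfgs').fit(
    X_t, y_t, sample_weight=np.where(y_t == 1, 2.0, 1.0))
np.testing.assert_allclose(a.coef_, b.coef_, rtol=1e-4)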
Example #56
def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]

    alpha_vectors = []
    alpha_values = np.arange(2)
    absolute_sum = lambda x: np.sum(np.abs(x))

    for alpha in alpha_values:
        mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha, random_state=1)
        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
        alpha_vectors.append(np.array([absolute_sum(mlp.coefs_[0]),
                                       absolute_sum(mlp.coefs_[1])]))

    for i in range(len(alpha_values) - 1):
        assert (alpha_vectors[i] > alpha_vectors[i + 1]).all()
Example #57
    def _compute_score(self, X, y=None):
        _, n_features = X.shape

        # Fit the estimator for each candidate number of clusters and
        # record its score.
        score = np.empty(len(self.n_clusters_range))
        for k, n_clusters in enumerate(self.n_clusters_range):
            self.estimator.set_params(n_clusters=n_clusters)
            score[k] = self.estimator.fit(X).score(X)

        # Closed form of the alpha_K weights from Pham et al.'s f(K)
        # method: 1 - (5/6)**(K - 1) * 3 / (4 * n_features).
        weights = 1. - np.exp((self.n_clusters_range[:-1] - 1) *
                              np.log(5. / 6.) + np.log(.75) -
                              np.log(n_features))

        # f(K)-style ratio score_K / (alpha_K * score_{K-1}); a zero
        # previous score falls back to 1 instead of dividing by zero.
        with ignore_warnings(category=RuntimeWarning):
            score[1:] = np.where(score[:-1] != 0.,
                                 score[1:] / (weights * score[:-1]), 1.)
        score[0] = 1.

        return -score
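The `weights` above are the closed form of the alpha_K factors from Pham, Dimov & Nguyen's f(K) method for choosing the number of clusters (alpha_2 = 1 - 3/(4 d); 1 - alpha_K = (5/6) * (1 - alpha_{K-1})), assuming `n_clusters_range` runs 1..K_max. A quick numerical check of that equivalence:

import numpy as np

n_features = 2
n_clusters_range = np.arange(1, 7)  # assumed to be 1..K_max

# Closed form as in _compute_score: 1 - (5/6)**(K - 1) * 3 / (4 d).
closed = 1. - np.exp((n_clusters_range[:-1] - 1) * np.log(5. / 6.)
                     + np.log(.75) - np.log(n_features))

# The published recurrence, started at alpha_2.
alpha = [1. - 3. / (4. * n_features)]
for _ in range(len(closed) - 1):
    alpha.append(1. - (5. / 6.) * (1. - alpha[-1]))
np.testing.assert_allclose(closed, alpha)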
Example #58
def train_model(folds, model):
    """
    Evaluation with:
      Matthews correlation coefficient: represents thresholding measures
      AUC: represents ranking measures
      Brier score: represents calibration measures
    """
    scores = []
    fit_model_time = 0  # Total time spent fitting the training folds; normalized later
    score_model_time = 0  # Total time spent scoring the test folds; normalized later

    for X_train, y_train, X_test, y_test in folds:
        # Training
        start_time = time.time()
        # Yes, neural networks do not always converge
        with ignore_warnings(category=ConvergenceWarning):
            model.fit(X_train, y_train)
        fit_model_time += time.time() - start_time
        prediction_train_proba = model.predict_proba(X_train)[:, 1]
        prediction_train = (prediction_train_proba >= 0.5).astype('uint8')

        # Testing
        start_time = time.time()
        prediction_test_proba = model.predict_proba(X_test)[:, 1]
        score_model_time += time.time() - start_time
        prediction_test = (prediction_test_proba >= 0.5).astype('uint8')

        # When all the predictions are of a single class, we get a RuntimeWarning in matthews_corr
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            scores.append([
                sklearn.metrics.matthews_corrcoef(y_test, prediction_test),
                sklearn.metrics.matthews_corrcoef(y_train, prediction_train),
                sklearn.metrics.roc_auc_score(y_test, prediction_test_proba),
                sklearn.metrics.roc_auc_score(y_train, prediction_train_proba),
                sklearn.metrics.brier_score_loss(y_test,
                                                 prediction_test_proba),
                sklearn.metrics.brier_score_loss(y_train,
                                                 prediction_train_proba)
            ])

    return (np.mean(scores, axis=0),
            fit_model_time / len(folds),
            score_model_time / len(folds))
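A hedged usage sketch for train_model: the fold convention (X_train, y_train, X_test, y_test tuples) is taken from the loop above; the dataset and model are assumptions:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier

X, y = make_classification(n_samples=200, random_state=0)
folds = [(X[tr], y[tr], X[te], y[te])
         for tr, te in KFold(n_splits=5, shuffle=True,
                             random_state=0).split(X)]
mean_scores, fit_time, score_time = train_model(
    folds, MLPClassifier(random_state=0))
# Columns of mean_scores, in order:
# [MCC_test, MCC_train, AUC_test, AUC_train, Brier_test, Brier_train]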
Example #59
def test_same_output_sparse_dense_lasso_and_enet_cv():
    X, y = make_sparse_data(n_samples=40, n_features=10)
    for normalize in [True, False]:
        clfs = ElasticNetCV(max_iter=100, cv=5, normalize=normalize)
        ignore_warnings(clfs.fit)(X, y)
        clfd = ElasticNetCV(max_iter=100, cv=5, normalize=normalize)
        ignore_warnings(clfd.fit)(X.toarray(), y)
        assert_almost_equal(clfs.alpha_, clfd.alpha_, 7)
        assert_almost_equal(clfs.intercept_, clfd.intercept_, 7)
        assert_array_almost_equal(clfs.mse_path_, clfd.mse_path_)
        assert_array_almost_equal(clfs.alphas_, clfd.alphas_)

        clfs = LassoCV(max_iter=100, cv=4, normalize=normalize)
        ignore_warnings(clfs.fit)(X, y)
        clfd = LassoCV(max_iter=100, cv=4, normalize=normalize)
        ignore_warnings(clfd.fit)(X.toarray(), y)
        assert_almost_equal(clfs.alpha_, clfd.alpha_, 7)
        assert_almost_equal(clfs.intercept_, clfd.intercept_, 7)
        assert_array_almost_equal(clfs.mse_path_, clfd.mse_path_)
        assert_array_almost_equal(clfs.alphas_, clfd.alphas_)
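`make_sparse_data` is a helper defined elsewhere in the suite; a minimal hypothetical stand-in that produces a sparse design matrix with a linear target:

import numpy as np
import scipy.sparse as sp

def make_sparse_data(n_samples=40, n_features=10, seed=42):
    # Hypothetical stand-in, not the upstream helper: random sparse X
    # and a dense linear response y = X @ w.
    rng = np.random.RandomState(seed)
    X = sp.random(n_samples, n_features, density=0.5,
                  random_state=rng, format='csc')
    w = rng.randn(n_features)
    y = np.asarray(X @ w).ravel()
    return X, y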
Example #60
def test_scorer_memmap_input(name):
    # Non-regression test for #6147: some score functions would
    # return singleton memmap when computed on memmap data instead of scalar
    # float values.

    if name in REQUIRE_POSITIVE_Y_SCORERS:
        y_mm_1 = _require_positive_y(y_mm)
        y_ml_mm_1 = _require_positive_y(y_ml_mm)
    else:
        y_mm_1, y_ml_mm_1 = y_mm, y_ml_mm

    # UndefinedMetricWarning for P / R scores
    with ignore_warnings():
        scorer, estimator = SCORERS[name], ESTIMATORS[name]
        if name in MULTILABEL_ONLY_SCORERS:
            score = scorer(estimator, X_mm, y_ml_mm_1)
        else:
            score = scorer(estimator, X_mm, y_mm_1)
        assert isinstance(score, numbers.Number), name
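The memmap fixtures X_mm / y_mm / y_ml_mm are built elsewhere in the suite; a minimal hedged sketch of producing such inputs:

import os
import tempfile

import numpy as np
from sklearn.datasets import make_classification

# Hypothetical fixture construction: save arrays to disk and reload
# them as read-only memory maps, mimicking joblib's behaviour.
tmp = tempfile.mkdtemp()
X, y = make_classification(n_samples=100, random_state=0)
np.save(os.path.join(tmp, 'X.npy'), X)
np.save(os.path.join(tmp, 'y.npy'), y)
X_mm = np.load(os.path.join(tmp, 'X.npy'), mmap_mode='r')
y_mm = np.load(os.path.join(tmp, 'y.npy'), mmap_mode='r')
assert isinstance(X_mm, np.memmap)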