Example #1
def test_prefit():
    # Test all possible combinations of the prefit parameter.

    # Passing an already fitted model with prefit=True and fitting an
    # unfitted model with prefit=False should give the same results.
    clf = SGDClassifier(alpha=0.1,
                        max_iter=10,
                        shuffle=True,
                        random_state=0,
                        tol=None)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_almost_equal(model.transform(data), X_transform)

    # Check that the model is refitted if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_almost_equal(model.transform(data), X_transform)

    # Check that prefit=True and calling fit raises a ValueError
    model = SelectFromModel(clf, prefit=True)
    with pytest.raises(ValueError):
        model.fit(data, y)
Example #2
def test_multinomial_loss():
    # test if the multinomial loss and gradient computations are consistent
    X, y = iris.data, iris.target.astype(np.float64)
    n_samples, n_features = X.shape
    n_classes = len(np.unique(y))

    rng = check_random_state(42)
    weights = rng.randn(n_features, n_classes)
    intercept = rng.randn(n_classes)
    sample_weights = rng.randn(n_samples)
    np.abs(sample_weights, out=sample_weights)  # in-place absolute value

    # compute loss and gradient like in multinomial SAG
    dataset, _ = make_dataset(X, y, sample_weights, random_state=42)
    loss_1, grad_1 = _multinomial_grad_loss_all_samples(
        dataset, weights, intercept, n_samples, n_features, n_classes)
    # compute loss and gradient like in multinomial LogisticRegression
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)
    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    # comparison
    assert_array_almost_equal(grad_1, grad_2)
    assert_almost_equal(loss_1, loss_2)
Example #3
def test_compute_class_weight_invariance():
    # Test that results with class_weight="balanced" are invariant w.r.t.
    # class imbalance if the number of samples is identical.
    # The test uses a balanced two class dataset with 100 datapoints.
    # It creates three versions, one where class 1 is duplicated
    # resulting in 150 points of class 1 and 50 of class 0,
    # one where there are 50 points in class 1 and 150 in class 0,
    # and one where there are 100 points of each class (this one is balanced
    # again).
    # With balanced class weights, all three should give the same model.
    X, y = make_blobs(centers=2, random_state=0)
    # create dataset where class 1 is duplicated twice
    X_1 = np.vstack([X] + [X[y == 1]] * 2)
    y_1 = np.hstack([y] + [y[y == 1]] * 2)
    # create dataset where class 0 is duplicated twice
    X_0 = np.vstack([X] + [X[y == 0]] * 2)
    y_0 = np.hstack([y] + [y[y == 0]] * 2)
    # duplicate everything
    X_ = np.vstack([X] * 2)
    y_ = np.hstack([y] * 2)
    # results should be identical
    logreg1 = LogisticRegression(class_weight="balanced").fit(X_1, y_1)
    logreg0 = LogisticRegression(class_weight="balanced").fit(X_0, y_0)
    logreg = LogisticRegression(class_weight="balanced").fit(X_, y_)
    assert_array_almost_equal(logreg1.coef_, logreg0.coef_)
    assert_array_almost_equal(logreg.coef_, logreg0.coef_)
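The "balanced" heuristic that makes these three fits coincide weights each class inversely to its frequency, using the formula n_samples / (n_classes * np.bincount(y)) documented for class_weight="balanced". A minimal sketch of that computation on an illustrative label vector (y_demo is made up here, not data from the test):

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_demo = np.array([0] * 50 + [1] * 150)   # imbalanced labels, as in y_1 above
classes = np.unique(y_demo)

# The documented formula behind class_weight="balanced"
manual = len(y_demo) / (len(classes) * np.bincount(y_demo))
library = compute_class_weight("balanced", classes=classes, y=y_demo)
print(manual, library)   # both [2.0, 0.6666...]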
Example #4
def test_leave_zero_eig():
    """This test checks that fit().transform() returns the same result as
    fit_transform() in case of non-removed zero eigenvalue.
    Non-regression test for issue #12141 (PR #12143)"""
    X_fit = np.array([[1, 1], [0, 0]])

    # Assert that even with all np warnings on, there is no div by zero warning
    with pytest.warns(None) as record:
        with np.errstate(all='warn'):
            k = KernelPCA(n_components=2,
                          remove_zero_eig=False,
                          eigen_solver="dense")
            # Fit, then transform
            A = k.fit(X_fit).transform(X_fit)
            # Do both at once
            B = k.fit_transform(X_fit)
            # Compare
            assert_array_almost_equal(np.abs(A), np.abs(B))

    for w in record:
        # There might be warnings about the kernel being badly conditioned,
        # but there should not be warnings about division by zero.
        # (Numpy division-by-zero warnings can have many message variants, but
        # at least we know they are RuntimeWarnings, so let's check only that.)
        assert not issubclass(w.category, RuntimeWarning)
Example #5
def test_nystroem_approximation():
    # Basic sanity checks for the Nystroem kernel approximation.
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 4))

    # With n_components = n_samples this is exact
    X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
    K = rbf_kernel(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    trans = Nystroem(n_components=2, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test callable kernel
    def linear_kernel(X, Y):
        return np.dot(X, Y.T)

    trans = Nystroem(n_components=2, kernel=linear_kernel, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test that available kernels fit and transform
    kernels_available = kernel_metrics()
    for kern in kernels_available:
        trans = Nystroem(n_components=2, kernel=kern, random_state=rnd)
        X_transformed = trans.fit(X).transform(X)
        assert X_transformed.shape == (X.shape[0], 2)
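For reference, a sketch of the math the Nystroem transformer implements: pick m landmark points, form C = K(X, landmarks) and W = K(landmarks, landmarks), and map X to C @ W^{-1/2}, so inner products of the features approximate the full kernel. The landmark choice below (the first five rows) is illustrative only, not how the transformer picks its components:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
X = rng.uniform(size=(10, 4))
landmarks = X[:5]                        # illustrative subset of the data
C = rbf_kernel(X, landmarks)             # (10, 5)
W = rbf_kernel(landmarks, landmarks)     # (5, 5), symmetric positive definite

# W^{-1/2} via the eigendecomposition, then the feature map C @ W^{-1/2}
vals, vecs = np.linalg.eigh(W)
W_inv_sqrt = vecs @ np.diag(1.0 / np.sqrt(vals)) @ vecs.T
features = C @ W_inv_sqrt

K_approx = features @ features.T         # equals C @ inv(W) @ C.T
K_exact = rbf_kernel(X)                  # K_approx matches it on landmark rows/columns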
Example #6
def test_isotonic_regression_ties_secondary_():
    """
    Test isotonic regression fit, transform  and fit_transform
    against the "secondary" ties method and "pituitary" data from R
     "isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair,
     Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
    (PAVA) and Active Set Methods

    Set values based on pituitary example and
     the following R command detailed in the paper above:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    y_true = [
        22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222,
        22.22222, 22.22222, 24.25, 24.25
    ]

    # Check fit, transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true, 4)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
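For context, the "secondary" ties handling averages the responses within each tied x value (weighting by the tie count) and then runs pool-adjacent-violators. A minimal sketch that reproduces y_true this way; pava below is a hypothetical helper written for illustration, not scikit-learn API:

import numpy as np

def pava(y, w):
    # Weighted Pool Adjacent Violators: non-decreasing fit under mean pooling.
    blocks = []  # list of [mean, weight]
    for value, weight in zip(y, w):
        blocks.append([float(value), float(weight)])
        while len(blocks) > 1 and blocks[-2][0] > blocks[-1][0]:
            m2, w2 = blocks.pop()
            m1, w1 = blocks.pop()
            blocks.append([(m1 * w1 + m2 * w2) / (w1 + w2), w1 + w2])
    return blocks

x = np.array([8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14])
y = np.array([21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25])

# "secondary" ties: average y within each tied x, weighted by the tie count
ux, counts = np.unique(x, return_counts=True)
y_mean = np.array([y[x == v].mean() for v in ux])

fitted = np.concatenate(
    [np.full(int(w), m) for m, w in pava(y_mean, counts)])
print(fitted)   # 22.2222... repeated nine times, then 24.25, 24.25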
Example #7
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)

    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    assert_no_warnings(agglo_mean.fit, X)
    assert_no_warnings(agglo_median.fit, X)
    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.])
    assert Xt_median == np.array([0.])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters

    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
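What transform does here is pool, for each feature cluster, the columns assigned to it with pooling_func. A small self-contained sketch of that pooling step, re-fitting the same tiny X used above:

import numpy as np
from sklearn.cluster import FeatureAgglomeration

X = np.array([0, 0, 1]).reshape(1, 3)
agglo = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean).fit(X)

# Pool the columns of each feature cluster by hand
labels = agglo.labels_
Xt_manual = np.column_stack(
    [np.mean(X[:, labels == label], axis=1) for label in np.unique(labels)])
print(Xt_manual, agglo.transform(X))   # both [[0.3333...]]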
Example #8
def test_make_s_curve():
    X, t = make_s_curve(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 3), "X shape mismatch"
    assert t.shape == (5, ), "t shape mismatch"
    assert_array_almost_equal(X[:, 0], np.sin(t))
    assert_array_almost_equal(X[:, 2], np.sign(t) * (np.cos(t) - 1))
Example #9
def test_make_checkerboard():
    X, rows, cols = make_checkerboard(shape=(100, 100),
                                      n_clusters=(20, 5),
                                      shuffle=True,
                                      random_state=0)
    assert X.shape == (100, 100), "X shape mismatch"
    assert rows.shape == (100, 100), "rows shape mismatch"
    assert cols.shape == (
        100,
        100,
    ), "columns shape mismatch"

    X, rows, cols = make_checkerboard(shape=(100, 100),
                                      n_clusters=2,
                                      shuffle=True,
                                      random_state=0)
    assert_all_finite(X)
    assert_all_finite(rows)
    assert_all_finite(cols)

    X1, _, _ = make_checkerboard(shape=(100, 100),
                                 n_clusters=2,
                                 shuffle=True,
                                 random_state=0)
    X2, _, _ = make_checkerboard(shape=(100, 100),
                                 n_clusters=2,
                                 shuffle=True,
                                 random_state=0)
    assert_array_almost_equal(X1, X2)
Example #10
def test_make_multilabel_classification_return_indicator():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=25,
                                              n_features=20,
                                              n_classes=3,
                                              random_state=0,
                                              allow_unlabeled=allow_unlabeled)
        assert X.shape == (25, 20), "X shape mismatch"
        assert Y.shape == (25, 3), "Y shape mismatch"
        assert np.all(np.sum(Y, axis=0) > min_length)

    # Also test return_distributions and return_indicator with True
    X2, Y2, p_c, p_w_c = make_multilabel_classification(
        n_samples=25,
        n_features=20,
        n_classes=3,
        random_state=0,
        allow_unlabeled=allow_unlabeled,
        return_distributions=True)

    assert_array_almost_equal(X, X2)
    assert_array_equal(Y, Y2)
    assert p_c.shape == (3, )
    assert_almost_equal(p_c.sum(), 1)
    assert p_w_c.shape == (20, 3)
    assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
Example #11
def test_make_swiss_roll():
    X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 3), "X shape mismatch"
    assert t.shape == (5, ), "t shape mismatch"
    assert_array_almost_equal(X[:, 0], t * np.cos(t))
    assert_array_almost_equal(X[:, 2], t * np.sin(t))
Example #12
def test_enet_l1_ratio():
    # Test that an error message is raised if an estimator that
    # uses _alpha_grid is called with l1_ratio=0
    msg = ("Automatic alpha grid generation is not supported for l1_ratio=0. "
           "Please supply a grid by providing your estimator with the "
           "appropriate `alphas=` argument.")
    X = np.array([[1, 2, 4, 5, 8], [3, 5, 7, 7, 8]]).T
    y = np.array([12, 10, 11, 21, 5])

    assert_raise_message(ValueError, msg,
                         ElasticNetCV(l1_ratio=0, random_state=42).fit, X, y)
    assert_raise_message(
        ValueError, msg,
        MultiTaskElasticNetCV(l1_ratio=0, random_state=42).fit, X, y[:, None])

    # Test that l1_ratio=0 is allowed if we supply a grid manually
    alphas = [0.1, 10]
    estkwds = {'alphas': alphas, 'random_state': 42}
    est_desired = ElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = ElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est_desired.fit(X, y)
        est.fit(X, y)
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)

    est_desired = MultiTaskElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = MultiTaskElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est.fit(X, y[:, None])
        est_desired.fit(X, y[:, None])
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)
Example #13
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):

    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert error_location < tol_loc
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert error_cov < tol_cov
    assert np.sum(H) >= tol_support
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
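The last assertion relies on dist_ holding the squared Mahalanobis distances of the training points under the fitted location and covariance. A stand-alone sketch of that quantity on synthetic data (the names below are illustrative only):

import numpy as np
from sklearn.covariance import MinCovDet

rng = np.random.RandomState(0)
data = rng.randn(100, 3)
mcd = MinCovDet(random_state=0).fit(data)

# Squared Mahalanobis distance: (x - mu)^T Sigma^{-1} (x - mu) for each row
diff = data - mcd.location_
d2 = np.einsum('ij,jk,ik->i', diff, np.linalg.inv(mcd.covariance_), diff)
# d2 should be close to both mcd.mahalanobis(data) and mcd.dist_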
Example #14
def test_inverse_transform():
    # Test FastICA.inverse_transform
    n_features = 10
    n_samples = 100
    n1, n2 = 5, 10
    rng = np.random.RandomState(0)
    X = rng.random_sample((n_samples, n_features))
    expected = {
        (True, n1): (n_features, n1),
        (True, n2): (n_features, n2),
        (False, n1): (n_features, n2),
        (False, n2): (n_features, n2)
    }
    for whiten in [True, False]:
        for n_components in [n1, n2]:
            n_components_ = (n_components
                             if n_components is not None else X.shape[1])
            ica = FastICA(n_components=n_components,
                          random_state=rng,
                          whiten=whiten)
            with warnings.catch_warnings(record=True):
                # catch "n_components ignored" warning
                Xt = ica.fit_transform(X)
            expected_shape = expected[(whiten, n_components_)]
            assert ica.mixing_.shape == expected_shape
            X2 = ica.inverse_transform(Xt)
            assert X.shape == X2.shape

            # reversibility test in non-reduction case
            if n_components == X.shape[1]:
                assert_array_almost_equal(X, X2)
Example #15
def test_suffstat_sk_tied():
    # use equation Nk * Sk / N = S_tied
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 500, 2, 2

    resp = rng.rand(n_samples, n_components)
    resp = resp / resp.sum(axis=1)[:, np.newaxis]
    X = rng.rand(n_samples, n_features)
    nk = resp.sum(axis=0)
    xk = np.dot(resp.T, X) / nk[:, np.newaxis]

    covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    covars_pred_full = np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full,
                              0) / n_samples

    covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0)

    ecov = EmpiricalCovariance()
    ecov.covariance_ = covars_pred_full
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, 'tied')
    precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T)
    precs_est = linalg.inv(covars_pred_tied)
    assert_array_almost_equal(precs_est, precs_pred)
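The identity the comment at the top refers to, written out (N_k is the responsibility mass of component k and Sigma_k its full covariance):

\[
\Sigma_{\text{tied}} \;=\; \frac{1}{N} \sum_{k=1}^{K} N_k \, \Sigma_k
\]

which is exactly what the np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full, 0) / n_samples line computes before comparing it to the "tied" estimate.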
Example #16
def test_binary_perplexity_stability():
    # Binary perplexity search should be stable.
    # The binary_search_perplexity had a bug wherein the P array
    # was uninitialized, leading to sporadically failing tests.
    n_neighbors = 10
    n_samples = 100
    random_state = check_random_state(0)
    data = random_state.randn(n_samples, 5)
    nn = NearestNeighbors().fit(data)
    distance_graph = nn.kneighbors_graph(n_neighbors=n_neighbors,
                                         mode='distance')
    distances = distance_graph.data.astype(np.float32, copy=False)
    distances = distances.reshape(n_samples, n_neighbors)
    last_P = None
    desired_perplexity = 3
    for _ in range(100):
        P = _binary_search_perplexity(distances.copy(),
                                      desired_perplexity,
                                      verbose=0)
        P1 = _joint_probabilities_nn(distance_graph,
                                     desired_perplexity,
                                     verbose=0)
        # Convert the sparse matrix to a dense one for testing
        P1 = P1.toarray()
        if last_P is None:
            last_P = P
            last_P1 = P1
        else:
            assert_array_almost_equal(P, last_P, decimal=4)
            assert_array_almost_equal(P1, last_P1, decimal=4)
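What the binary search solves, for reference: for each point i it tunes the Gaussian bandwidth so that the conditional neighbor distribution P_i reaches the requested perplexity,

\[
\mathrm{Perp}(P_i) \;=\; 2^{H(P_i)}, \qquad H(P_i) \;=\; -\sum_{j} p_{j\mid i} \log_2 p_{j\mid i},
\]

so the loop above only checks that repeated searches over the same distances land on the same P.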
Example #17
def test_gaussian_mixture_estimate_log_prob_resp():
    # test whether responsibilities are normalized
    rng = np.random.RandomState(0)
    rand_data = RandomData(rng, scale=5)
    n_samples = rand_data.n_samples
    n_features = rand_data.n_features
    n_components = rand_data.n_components

    X = rng.rand(n_samples, n_features)
    for covar_type in COVARIANCE_TYPE:
        weights = rand_data.weights
        means = rand_data.means
        precisions = rand_data.precisions[covar_type]
        g = GaussianMixture(n_components=n_components,
                            random_state=rng,
                            weights_init=weights,
                            means_init=means,
                            precisions_init=precisions,
                            covariance_type=covar_type)
        g.fit(X)
        resp = g.predict_proba(X)
        assert_array_almost_equal(resp.sum(axis=1), np.ones(n_samples))
        assert_array_equal(g.weights_init, weights)
        assert_array_equal(g.means_init, means)
        assert_array_equal(g.precisions_init, precisions)
Example #18
def _run_answer_test(pos_input,
                     pos_output,
                     neighbors,
                     grad_output,
                     verbose=False,
                     perplexity=0.1,
                     skip_num_points=0):
    distances = pairwise_distances(pos_input).astype(np.float32)
    args = distances, perplexity, verbose
    pos_output = pos_output.astype(np.float32)
    neighbors = neighbors.astype(np.int64, copy=False)
    pij_input = _joint_probabilities(*args)
    pij_input = squareform(pij_input).astype(np.float32)
    grad_bh = np.zeros(pos_output.shape, dtype=np.float32)

    from scipy.sparse import csr_matrix
    P = csr_matrix(pij_input)

    neighbors = P.indices.astype(np.int64)
    indptr = P.indptr.astype(np.int64)

    _barnes_hut_tsne.gradient(P.data,
                              pos_output,
                              neighbors,
                              indptr,
                              grad_bh,
                              0.5,
                              2,
                              1,
                              skip_num_points=0)
    assert_array_almost_equal(grad_bh, grad_output, decimal=4)
Example #19
def test_classification_sample_weight():
    X = [[0], [0], [1]]
    y = [0, 1, 0]
    sample_weight = [0.1, 1., 0.1]

    clf = DummyClassifier(strategy="stratified").fit(X, y, sample_weight)
    assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1. / 1.2])
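The expected class_prior_ is just the weighted class frequency: class 0 carries weight 0.1 + 0.1 = 0.2, class 1 carries 1.0, and the total is 1.2. A sketch making that arithmetic explicit:

import numpy as np

w = np.array([0.1, 1.0, 0.1])
y = np.array([0, 1, 0])
prior = np.array([w[y == c].sum() for c in (0, 1)]) / w.sum()
print(prior)   # [0.1666..., 0.8333...] == [0.2 / 1.2, 1.0 / 1.2]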
Example #20
def test_barnes_hut_angle():
    # When Barnes-Hut's angle=0 this corresponds to the exact method.
    angle = 0.0
    perplexity = 10
    n_samples = 100
    for n_components in [2, 3]:
        n_features = 5
        degrees_of_freedom = float(n_components - 1.0)

        random_state = check_random_state(0)
        data = random_state.randn(n_samples, n_features)
        distances = pairwise_distances(data)
        params = random_state.randn(n_samples, n_components)
        P = _joint_probabilities(distances, perplexity, verbose=0)
        kl_exact, grad_exact = _kl_divergence(params, P, degrees_of_freedom,
                                              n_samples, n_components)

        n_neighbors = n_samples - 1
        distances_csr = NearestNeighbors().fit(data).kneighbors_graph(
            n_neighbors=n_neighbors, mode='distance')
        P_bh = _joint_probabilities_nn(distances_csr, perplexity, verbose=0)
        kl_bh, grad_bh = _kl_divergence_bh(params,
                                           P_bh,
                                           degrees_of_freedom,
                                           n_samples,
                                           n_components,
                                           angle=angle,
                                           skip_num_points=0,
                                           verbose=0)

        P = squareform(P)
        P_bh = P_bh.toarray()
        assert_array_almost_equal(P_bh, P, decimal=5)
        assert_almost_equal(kl_exact, kl_bh, decimal=3)
Example #21
def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft',
                             flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('gnb', clf3)],
                             voting='soft',
                             flatten_transform=False).fit(X, y)

    # 4 samples, 3 classifiers, 2 classes: flattened transform is (4, 3 * 2);
    # flatten_transform=False keeps (n_classifiers, n_samples, n_classes).
    assert_array_equal(eclf1.transform(X).shape, (4, 6))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X))
Example #22
def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = AdaBoostClassifier().fit(X, y_t)
    assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))
Example #23
def test_kernel_pca_precomputed():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("dense", "arpack"):
        X_kpca = KernelPCA(4, eigen_solver=eigen_solver).\
            fit(X_fit).transform(X_pred)
        X_kpca2 = KernelPCA(4, eigen_solver=eigen_solver,
                            kernel='precomputed').fit(np.dot(
                                X_fit,
                                X_fit.T)).transform(np.dot(X_pred, X_fit.T))

        X_kpca_train = KernelPCA(4,
                                 eigen_solver=eigen_solver,
                                 kernel='precomputed').fit_transform(
                                     np.dot(X_fit, X_fit.T))
        X_kpca_train2 = KernelPCA(4,
                                  eigen_solver=eigen_solver,
                                  kernel='precomputed').fit(
                                      np.dot(X_fit, X_fit.T)).transform(
                                          np.dot(X_fit, X_fit.T))

        assert_array_almost_equal(np.abs(X_kpca), np.abs(X_kpca2))

        assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2))
Example #24
def test_collinearity():
    # Check that lars_path is robust to collinearity in input
    X = np.array([[3., 3., 1.], [2., 2., 0.], [1., 1., 0]])
    y = np.array([1., 0., 0])
    rng = np.random.RandomState(0)

    f = ignore_warnings  # silence warnings triggered by the degenerate design
    _, _, coef_path_ = f(linear_model.lars_path)(X, y, alpha_min=0.01)
    assert not np.isnan(coef_path_).any()
    residual = np.dot(X, coef_path_[:, -1]) - y
    assert (residual**2).sum() < 1.  # just make sure it's bounded

    n_samples = 10
    X = rng.rand(n_samples, 5)
    y = np.zeros(n_samples)
    _, _, coef_path_ = linear_model.lars_path(X,
                                              y,
                                              Gram='auto',
                                              copy_X=False,
                                              copy_Gram=False,
                                              alpha_min=0.,
                                              method='lasso',
                                              verbose=0,
                                              max_iter=500)
    assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_))
Example #25
def test_feature_union_weights():
    # test feature union with transformer weights
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert X_fit_transformed_wo_method.shape == (X.shape[0], 7)
Example #26
def test_lasso_lars_vs_lasso_cd_ill_conditioned():
    # Test lasso lars on a very ill-conditioned design, and check that
    # it does not blow up, and stays somewhat close to a solution given
    # by the coordinate descent solver
    # Also check that lasso_path (using the lars_path output style) gives
    # the same result as lars_path under these conditions.
    rng = np.random.RandomState(42)

    # Generate data
    n, m = 70, 100
    k = 5
    X = rng.randn(n, m)
    w = np.zeros((m, 1))
    i = np.arange(0, m)
    rng.shuffle(i)
    supp = i[:k]
    w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1)
    y = np.dot(X, w)
    sigma = 0.2
    y += sigma * rng.rand(*y.shape)
    y = y.squeeze()
    lars_alphas, _, lars_coef = linear_model.lars_path(X, y, method='lasso')

    _, lasso_coef2, _ = linear_model.lasso_path(X,
                                                y,
                                                alphas=lars_alphas,
                                                tol=1e-6,
                                                fit_intercept=False)

    assert_array_almost_equal(lars_coef, lasso_coef2, decimal=1)
Example #27
def test_multinomial_loss_ground_truth():
    # n_samples, n_features, n_classes = 4, 2, 3
    n_classes = 3
    X = np.array([[1.1, 2.2], [2.2, -4.4], [3.3, -2.2], [1.1, 1.1]])
    y = np.array([0, 1, 2, 0])
    lbin = LabelBinarizer()
    Y_bin = lbin.fit_transform(y)

    weights = np.array([[0.1, 0.2, 0.3], [1.1, 1.2, -1.3]])
    intercept = np.array([1., 0, -.2])
    sample_weights = np.array([0.8, 1, 1, 0.8])

    prediction = np.dot(X, weights) + intercept
    logsumexp_prediction = logsumexp(prediction, axis=1)
    p = prediction - logsumexp_prediction[:, np.newaxis]
    loss_1 = -(sample_weights[:, np.newaxis] * p * Y_bin).sum()
    diff = sample_weights[:, np.newaxis] * (np.exp(p) - Y_bin)
    grad_1 = np.dot(X.T, diff)

    weights_intercept = np.vstack((weights, intercept)).T.ravel()
    loss_2, grad_2, _ = _multinomial_loss_grad(weights_intercept, X, Y_bin,
                                               0.0, sample_weights)
    grad_2 = grad_2.reshape(n_classes, -1)
    grad_2 = grad_2[:, :-1].T

    assert_almost_equal(loss_1, loss_2)
    assert_array_almost_equal(grad_1, grad_2)

    # ground truth
    loss_gt = 11.680360354325961
    grad_gt = np.array([[-0.557487, -1.619151, +2.176638],
                        [-0.903942, +5.258745, -4.354803]])
    assert_almost_equal(loss_1, loss_gt)
    assert_array_almost_equal(grad_1, grad_gt)
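The quantities computed by hand above are the weighted multinomial (softmax) log-loss and its gradient with respect to the weights; with P the softmax probabilities, Y the binarized labels and w the sample weights,

\[
L \;=\; -\sum_i w_i \sum_c Y_{ic} \log p_{ic}, \qquad
p_{ic} \;=\; \frac{\exp(x_i^\top W_{\cdot c} + b_c)}{\sum_{c'} \exp(x_i^\top W_{\cdot c'} + b_{c'})}, \qquad
\frac{\partial L}{\partial W} \;=\; X^\top \operatorname{diag}(w)\,(P - Y),
\]

which is what the logsumexp / np.exp / np.dot lines implement before comparing against _multinomial_loss_grad.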
Example #28
def test_multitarget():
    # Ensure that estimators receiving multidimensional y do the right thing
    Y = np.vstack([y, y**2]).T
    n_targets = Y.shape[1]
    estimators = [
        linear_model.LassoLars(),
        linear_model.Lars(),
        # regression test for gh-1615
        linear_model.LassoLars(fit_intercept=False),
        linear_model.Lars(fit_intercept=False),
    ]

    for estimator in estimators:
        estimator.fit(X, Y)
        Y_pred = estimator.predict(X)
        alphas, active, coef, path = (estimator.alphas_, estimator.active_,
                                      estimator.coef_, estimator.coef_path_)
        for k in range(n_targets):
            estimator.fit(X, Y[:, k])
            y_pred = estimator.predict(X)
            assert_array_almost_equal(alphas[k], estimator.alphas_)
            assert_array_almost_equal(active[k], estimator.active_)
            assert_array_almost_equal(coef[k], estimator.coef_)
            assert_array_almost_equal(path[k], estimator.coef_path_)
            assert_array_almost_equal(Y_pred[:, k], y_pred)
Example #29
def test_lof_precomputed(random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    # As a feature matrix (n_samples by n_features)
    lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    lof_X.fit(X)
    pred_X_X = lof_X._predict()
    pred_X_Y = lof_X.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_D = neighbors.LocalOutlierFactor(n_neighbors=3,
                                         algorithm='brute',
                                         metric='precomputed',
                                         novelty=True)
    lof_D.fit(DXX)
    pred_D_X = lof_D._predict()
    pred_D_Y = lof_D.predict(DYX)

    assert_array_almost_equal(pred_X_X, pred_D_X)
    assert_array_almost_equal(pred_X_Y, pred_D_Y)
Example #30
def test_2d_coef():
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0,
                                        n_classes=4)

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(),
                                          threshold=threshold,
                                          norm_order=order)
            transformer.fit(X, y)
            assert hasattr(transformer.estimator_, 'coef_')
            X_new = transformer.transform(X)
            assert X_new.shape[1] < X.shape[1]

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = np.linalg.norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_almost_equal(X_new, X[:, feature_mask])