Example #1
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't have all classes
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    ovr = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                            shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                             shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert_equal(np.mean(pred == y), np.mean(pred1 == y))

    # test partial_fit only exists if estimator has it:
    ovr = OneVsRestClassifier(SVC())
    assert_false(hasattr(ovr, "partial_fit"))
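
A minimal usage sketch of the incremental API exercised above (the split point below is arbitrary): the full list of classes must be supplied on the first partial_fit call, because later mini-batches may not contain every class.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

X, y = load_iris(return_X_y=True)
clf = OneVsRestClassifier(MultinomialNB())
clf.partial_fit(X[:75], y[:75], classes=np.unique(y))  # classes required on the first call
clf.partial_fit(X[75:], y[75:])                        # later batches: no classes argument
print(clf.predict(X[:5]))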
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the labels assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)
def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
def test_euclidean_distances():
    # Check the pairwise Euclidean distances computation
    X = [[0]]
    Y = [[1], [2]]
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    X = csr_matrix(X)
    Y = csr_matrix(Y)
    D = euclidean_distances(X, Y)
    assert_array_almost_equal(D, [[1., 2.]])

    rng = np.random.RandomState(0)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((20, 4))
    X_norm_sq = (X ** 2).sum(axis=1).reshape(1, -1)
    Y_norm_sq = (Y ** 2).sum(axis=1).reshape(1, -1)

    # check that we still get the right answers with {X,Y}_norm_squared
    D1 = euclidean_distances(X, Y)
    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq,
                             Y_norm_squared=Y_norm_sq)
    assert_array_almost_equal(D2, D1)
    assert_array_almost_equal(D3, D1)
    assert_array_almost_equal(D4, D1)

    # check we get the wrong answer with wrong {X,Y}_norm_squared
    X_norm_sq *= 0.5
    Y_norm_sq *= 0.5
    wrong_D = euclidean_distances(X, Y,
                                  X_norm_squared=np.zeros_like(X_norm_sq),
                                  Y_norm_squared=np.zeros_like(Y_norm_sq))
    assert_greater(np.max(np.abs(wrong_D - D1)), .01)
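
The assertions above rely on the expansion used when X_norm_squared / Y_norm_squared are supplied: ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2, which is also why feeding wrong norms corrupts the distances. A small NumPy sketch of that identity (illustration only, not part of the test suite):

import numpy as np

rng = np.random.RandomState(0)
X = rng.random_sample((10, 4))
Y = rng.random_sample((20, 4))

XX = (X ** 2).sum(axis=1)[:, np.newaxis]   # column of ||x||^2, shape (10, 1)
YY = (Y ** 2).sum(axis=1)[np.newaxis, :]   # row of ||y||^2, shape (1, 20)
D_sq = XX - 2 * X.dot(Y.T) + YY            # squared pairwise distances via the identity
D = np.sqrt(np.maximum(D_sq, 0))           # clip tiny negative round-off before the sqrt

# brute-force reference computation
D_ref = np.sqrt(((X[:, np.newaxis, :] - Y[np.newaxis, :, :]) ** 2).sum(-1))
assert np.allclose(D, D_ref)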
def check_regressors_train(name, Regressor):
    X, y = _boston_subset()
    y = StandardScaler().fit_transform(y)   # X is already scaled
    y = multioutput_estimator_convert_y_2d(name, y)
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    set_fast_parameters(regressor)
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01
    if name == 'PassiveAggressiveRegressor':
        regressor.C = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.fit(X.tolist(), y_.tolist())
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        print(regressor)
        assert_greater(regressor.score(X, y_), 0.5)
Example #6
def check_min_samples_leaf(name):
    X, y = hastie_X, hastie_y

    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test boundary value
    assert_raises(ValueError,
                  ForestEstimator(min_samples_leaf=-1).fit, X, y)
    assert_raises(ValueError,
                  ForestEstimator(min_samples_leaf=0).fit, X, y)

    est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = np.bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), 4,
                   "Failed with {0}".format(name))

    est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1,
                          random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = np.bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), len(X) * 0.25 - 1,
                   "Failed with {0}".format(name))
def check_class_weight_classifiers(name, Classifier):
    if name == "NuSVC":
        # the sparse version has a parameter that doesn't do anything
        raise SkipTest
    if name.endswith("NB"):
        # NaiveBayes classifiers have a somewhat different interface.
        # FIXME SOON!
        raise SkipTest

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=0)
        n_centers = len(np.unique(y_train))

        if n_centers == 2:
            class_weight = {0: 1000, 1: 0.0001}
        else:
            class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

        with warnings.catch_warnings(record=True):
            classifier = Classifier(class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        assert_greater(np.mean(y_pred == 0), 0.89)
Example #8
def test_oneclass_decision_function():
    # Test OneClassSVM decision function
    clf = svm.OneClassSVM()
    rnd = check_random_state(2)

    # Generate train data
    X = 0.3 * rnd.randn(100, 2)
    X_train = np.r_[X + 2, X - 2]

    # Generate some regular novel observations
    X = 0.3 * rnd.randn(20, 2)
    X_test = np.r_[X + 2, X - 2]
    # Generate some abnormal novel observations
    X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2))

    # fit the model
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    clf.fit(X_train)

    # predict things
    y_pred_test = clf.predict(X_test)
    assert_greater(np.mean(y_pred_test == 1), .9)
    y_pred_outliers = clf.predict(X_outliers)
    assert_greater(np.mean(y_pred_outliers == -1), .9)
    dec_func_test = clf.decision_function(X_test)
    assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1)
    dec_func_outliers = clf.decision_function(X_outliers)
    assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1)
Example #9
def check_min_samples_split(name):
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test boundary value
    assert_raises(ValueError,
                  ForestEstimator(min_samples_split=-1).fit, X, y)
    assert_raises(ValueError,
                  ForestEstimator(min_samples_split=0).fit, X, y)
    assert_raises(ValueError,
                  ForestEstimator(min_samples_split=1.1).fit, X, y)

    est = ForestEstimator(min_samples_split=10, n_estimators=1, random_state=0)
    est.fit(X, y)
    node_idx = est.estimators_[0].tree_.children_left != -1
    node_samples = est.estimators_[0].tree_.n_node_samples[node_idx]

    assert_greater(np.min(node_samples), len(X) * 0.5 - 1,
                   "Failed with {0}".format(name))

    est = ForestEstimator(min_samples_split=0.5, n_estimators=1, random_state=0)
    est.fit(X, y)
    node_idx = est.estimators_[0].tree_.children_left != -1
    node_samples = est.estimators_[0].tree_.n_node_samples[node_idx]

    assert_greater(np.min(node_samples), len(X) * 0.5 - 1,
                   "Failed with {0}".format(name))
Example #10
def test_randomized_svd_low_rank_with_noise():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with approximate effective rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=0.5,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.05)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method helps to get rid of the noise:
    assert_almost_equal(s[:k], sap, decimal=3)
Example #11
def test_randomized_svd_infinite_rank():
    """Check that extmath.randomized_svd can handle noisy matrices"""
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # let us try again without a low-rank component: just regularly but slowly
    # decreasing singular values, so the rank of the data matrix is infinite
    X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features,
                             effective_rank=rank, tail_strength=1.0,
                             random_state=0)
    assert_equal(X.shape, (n_samples, n_features))

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    # compute the singular values of X using the fast approximate method
    # without the iterated power method
    _, sa, _ = randomized_svd(X, k, n_iter=0)

    # the approximation does not tolerate the noise:
    assert_greater(np.abs(s[:k] - sa).max(), 0.1)

    # compute the singular values of X using the fast approximate method with
    # iterated power method
    _, sap, _ = randomized_svd(X, k, n_iter=5)

    # the iterated power method is still managing to get most of the structure
    # at the requested rank
    assert_almost_equal(s[:k], sap, decimal=3)
Example #12
def test_predict_iris():
    # Test logistic regression with the iris dataset
    n_samples, n_features = iris.data.shape

    target = iris.target_names[iris.target]

    # Test that both multinomial and OvR solvers handle
    # multiclass data correctly and give good accuracy
    # score (>0.95) for the training data.
    for clf in [LogisticRegression(C=len(iris.data)),
                LogisticRegression(C=len(iris.data), solver='lbfgs',
                                   multi_class='multinomial'),
                LogisticRegression(C=len(iris.data), solver='newton-cg',
                                   multi_class='multinomial'),
                LogisticRegression(C=len(iris.data), solver='sag', tol=1e-2,
                                   multi_class='ovr', random_state=42),
                LogisticRegression(C=len(iris.data), solver='saga', tol=1e-2,
                                   multi_class='ovr', random_state=42)
                ]:
        clf.fit(iris.data, target)
        assert_array_equal(np.unique(target), clf.classes_)

        pred = clf.predict(iris.data)
        assert_greater(np.mean(pred == target), .95)

        probabilities = clf.predict_proba(iris.data)
        assert_array_almost_equal(probabilities.sum(axis=1),
                                  np.ones(n_samples))

        pred = iris.target_names[probabilities.argmax(axis=1)]
        assert_greater(np.mean(pred == target), .95)
Example #13
def test_spectral_amg_mode():
    # Test the amg mode of SpectralClustering
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    try:
        from pyamg import smoothed_aggregation_solver
        amg_loaded = True
    except ImportError:
        amg_loaded = False
    if amg_loaded:
        labels = spectral_clustering(S, n_clusters=len(centers),
                                     random_state=0, mode="amg")
        # We don't care too much that it's good, just that it *worked*.
        # There does have to be some lower limit on the performance though.
        assert_greater(np.mean(labels == true_labels), .3)
    else:
        assert_raises(ValueError, spectral_embedding, S,
                      n_components=len(centers), random_state=0, mode="amg")
Example #14
def test_ovo_ties():
    # test that ties are broken using the decision function, not defaulting to
    # the smallest label
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)

    # recalculate votes to make sure we have a tie
    predictions = np.vstack([clf.predict(X) for clf in multi_clf.estimators_])
    scores = np.vstack([clf.decision_function(X)
                        for clf in multi_clf.estimators_])
    # classifiers are in order 0-1, 0-2, 1-2
    # aggregate votes:
    votes = np.zeros((4, 3))
    votes[np.arange(4), predictions[0]] += 1
    votes[np.arange(4), 2 * predictions[1]] += 1
    votes[np.arange(4), 1 + predictions[2]] += 1
    # for the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # for the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # for the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], 1)
    # score for one is greater than score for zero
    assert_greater(scores[2, 0] - scores[0, 0], scores[0, 0] + scores[1, 0])
    # score for one is greater than score for two
    assert_greater(scores[2, 0] - scores[0, 0], -scores[1, 0] - scores[2, 0])
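
The index arithmetic above (predictions[0], 2 * predictions[1], 1 + predictions[2]) simply decodes the 0/1 output of each pairwise classifier, fitted on the class pairs (0, 1), (0, 2) and (1, 2), back to the class it voted for. A standalone sketch of that vote aggregation, using hypothetical binary predictions:

import numpy as np

n_samples, n_classes = 4, 3
pairs = [(0, 1), (0, 2), (1, 2)]            # pair order used by OneVsOneClassifier
binary_preds = np.array([[1, 0, 1, 1],      # hypothetical 0/1 outputs of the three
                         [0, 0, 1, 1],      # pairwise classifiers (0 = first class
                         [0, 1, 0, 1]])     # of the pair, 1 = second class)

votes = np.zeros((n_samples, n_classes))
for (a, b), pred in zip(pairs, binary_preds):
    chosen = np.where(pred == 0, a, b)      # decode 0/1 back to the actual class label
    votes[np.arange(n_samples), chosen] += 1

print(votes)                     # each row sums to 3: one vote per pairwise classifier
print(votes.argmax(axis=1))      # argmax picks the winner; the real OneVsOneClassifier
                                 # breaks exact ties with the decision function instead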
Example #15
def test_warm_start(solver, warm_start, fit_intercept, multi_class):
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.
    # Warm starting does not work with liblinear solver.
    X, y = iris.data, iris.target

    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                             warm_start=warm_start,
                             solver=solver,
                             random_state=42, max_iter=100,
                             fit_intercept=fit_intercept)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        coef_1 = clf.coef_

        clf.max_iter = 1
        clf.fit(X, y)
    cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
    msg = ("Warm starting issue with %s solver in %s mode "
           "with fit_intercept=%s and warm_start=%s"
           % (solver, multi_class, str(fit_intercept),
              str(warm_start)))
    if warm_start:
        assert_greater(2.0, cum_diff, msg)
    else:
        assert_greater(cum_diff, 2.0, msg)
Example #16
def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100, n_informative_features=100)
    max_iter = 150

    with warnings.catch_warnings():
        # Here we have a small number of iterations, and thus the
        # ElasticNet might not converge. This is to speed up tests
        warnings.simplefilter("ignore", UserWarning)
        clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter)
        clf.fit(X, y)
        # Well-conditioned settings, we should have selected our
        # smallest penalty
        assert_almost_equal(clf.alpha_, min(clf.alphas_))
        # Non-sparse ground truth: we should have selected an elastic-net
        # that is closer to ridge than to lasso
        assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

        clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter, precompute=True)
        clf.fit(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)
Example #17
def test_rfe_estimator_tags():
    rfe = RFE(SVC(kernel='linear'))
    assert_equal(rfe._estimator_type, "classifier")
    # make sure that cross-validation is stratified
    iris = load_iris()
    score = cross_val_score(rfe, iris.data, iris.target)
    assert_greater(score.min(), .7)
Example #18
def check_regressors_train(name, Regressor, X, y):
    if name == 'OrthogonalMatchingPursuitCV':
        # FIXME: This test is unstable on Travis, see issue #3190.
        check_skip_travis()
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in ('PLSCanonical', 'PLSRegression', 'CCA'):
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        assert_greater(regressor.score(X, y_), 0.5)
Example #19
def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   sample_weight=None,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    centers = cluster_centers
    assert_equal(centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the labels assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 sample_weight=None, init=centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1,
                  sample_weight=None)

    # kmeans for algorithm='elkan' raises TypeError on sparse matrix
    assert_raise_message(TypeError, "algorithm='elkan' not supported for "
                         "sparse input X", k_means, X=X_csr, n_clusters=2,
                         sample_weight=None, algorithm="elkan")
def test_lasso_cv():
    X, y, X_test, y_test = build_dataset()
    max_iter = 150
    clf = LassoCV(n_alphas=10, eps=1e-3, max_iter=max_iter).fit(X, y)
    assert_almost_equal(clf.alpha_, 0.056, 2)

    clf = LassoCV(n_alphas=10, eps=1e-3, max_iter=max_iter, precompute=True)
    clf.fit(X, y)
    assert_almost_equal(clf.alpha_, 0.056, 2)

    # Check that the lars and the coordinate descent implementation
    # select a similar alpha
    lars = LassoLarsCV(normalize=False, max_iter=30).fit(X, y)
    # for this we check that their positions in the grid of
    # clf.alphas_ differ by at most 1
    assert_true(np.abs(
        np.searchsorted(clf.alphas_[::-1], lars.alpha_) -
        np.searchsorted(clf.alphas_[::-1], clf.alpha_)) <= 1)
    # check that they also give a similar MSE
    mse_lars = interpolate.interp1d(lars.cv_alphas_, lars.cv_mse_path_.T)
    np.testing.assert_approx_equal(mse_lars(clf.alphas_[5]).mean(),
                                   clf.mse_path_[5].mean(), significant=2)

    # test set
    assert_greater(clf.score(X_test, y_test), 0.99)
Example #21
def check_classifiers_classes(name, Classifier, X, y, y_names):
    if name in ["LabelPropagation", "LabelSpreading"]:
        # TODO some complication with -1 label
        y_ = y
    else:
        y_ = y_names

    classes = np.unique(y_)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    # fit
    try:
        classifier.fit(X, y_)
    except Exception as e:
        print(e)

    y_pred = classifier.predict(X)
    # training set performance
    assert_array_equal(np.unique(y_), np.unique(y_pred))
    accuracy = accuracy_score(y_, y_pred)
    assert_greater(accuracy, 0.78,
                   "accuracy %f of %s not greater than 0.78"
                   % (accuracy, name))
    #assert_array_equal(
        #clf.classes_, classes,
        #"Unexpected classes_ attribute for %r" % clf)
    if np.any(classifier.classes_ != classes):
        print("Unexpected classes_ attribute for %r: "
              "expected %s, got %s" %
              (classifier, classes, classifier.classes_))
Example #22
def test_nmf_decreasing():
    # test that the objective function is decreasing at each iteration
    n_samples = 20
    n_features = 15
    n_components = 10
    alpha = 0.1
    l1_ratio = 0.5
    tol = 0.

    # initialization
    rng = np.random.mtrand.RandomState(42)
    X = rng.randn(n_samples, n_features)
    np.abs(X, X)
    W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
                                 random_state=42)

    for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
        for solver in ('cd', 'mu'):
            if solver != 'mu' and beta_loss != 2:
                # not implemented
                continue
            W, H = W0.copy(), H0.copy()
            previous_loss = None
            for _ in range(30):
                # one more iteration starting from the previous results
                W, H, _ = non_negative_factorization(
                    X, W, H, beta_loss=beta_loss, init='custom',
                    n_components=n_components, max_iter=1, alpha=alpha,
                    solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
                    regularization='both', random_state=0, update_H=True)

                loss = nmf._beta_divergence(X, W, H, beta_loss)
                if previous_loss is not None:
                    assert_greater(previous_loss, loss)
                previous_loss = loss
Example #23
def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter="classifier")

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers if "class_weight" in c[1]().get_params().keys()]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            if n_centers == 2:
                class_weight = {0: 1000, 1: 0.0001}
            else:
                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

            with warnings.catch_warnings(record=True):
                classifier = Classifier(class_weight=class_weight)
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)
Example #24
def test_ovo_partial_fit_predict():
    X, y = shuffle(iris.data, iris.target)
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(X[:100], y[:100], np.unique(y))
    ovo1.partial_fit(X[100:], y[100:])
    pred1 = ovo1.predict(X)

    ovo2 = OneVsOneClassifier(MultinomialNB())
    ovo2.fit(X, y)
    pred2 = ovo2.predict(X)
    assert_equal(len(ovo1.estimators_), n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred1), 0.65)
    assert_almost_equal(pred1, pred2)

    # Test when mini-batches don't have all target classes
    ovo1 = OneVsOneClassifier(MultinomialNB())
    ovo1.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovo1.partial_fit(iris.data[60:], iris.target[60:])
    pred1 = ovo1.predict(iris.data)
    ovo2 = OneVsOneClassifier(MultinomialNB())
    pred2 = ovo2.fit(iris.data, iris.target).predict(iris.data)

    assert_almost_equal(pred1, pred2)
    assert_equal(len(ovo1.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred1), 0.65)
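
The first assertion uses the fact that OneVsOneClassifier fits one binary estimator per unordered pair of classes, i.e. n_classes * (n_classes - 1) / 2 of them; for the three iris classes that means three pairwise estimators:

n_classes = 3
print(n_classes * (n_classes - 1) // 2)   # -> 3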
Example #25
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred), 0.65)

    # Test when mini-batches don't have all classes
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)
    
    assert_almost_equal(pred, pred2)
    assert_equal(len(ovr.estimators_), len(np.unique(iris.target)))
    assert_greater(np.mean(iris.target == pred), 0.65)
Example #26
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    classes = np.unique(y)
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78,
                       "accuracy of %s not greater than 0.78" % str(Clf))
        assert_array_equal(
            clf.classes_, classes,
            "Unexpected classes_ attribute for %r" % clf)
Example #27
def test_warm_start():
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.
    # Warm starting does not work with liblinear solver.
    X, y = iris.data, iris.target

    solvers = ['newton-cg', 'sag']
    # old scipy doesn't have maxiter
    if sp_version >= (0, 12):
        solvers.append('lbfgs')

    for warm_start in [True, False]:
        for fit_intercept in [True, False]:
            for solver in solvers:
                for multi_class in ['ovr', 'multinomial']:
                    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                                             warm_start=warm_start,
                                             solver=solver,
                                             random_state=42, max_iter=100,
                                             fit_intercept=fit_intercept)
                    clf.fit(X, y)
                    coef_1 = clf.coef_

                    clf.max_iter = 1
                    with ignore_warnings():
                        clf.fit(X, y)
                    cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
                    msg = ("Warm starting issue with %s solver in %s mode "
                           "with fit_intercept=%s and warm_start=%s"
                           % (solver, multi_class, str(fit_intercept),
                              str(warm_start)))
                    if warm_start:
                        assert_greater(2.0, cum_diff, msg)
                    else:
                        assert_greater(cum_diff, 2.0, msg)
Example #28
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators if issubclass(E,
        ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    #y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue

        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # training set performance
        assert_array_equal(np.unique(y), np.unique(y_pred))
        assert_greater(zero_one_score(y, y_pred), 0.78)
Example #29
def test_lml_improving():
    """ Test that hyperparameter-tuning improves log-marginal likelihood. """
    for kernel in kernels:
        if kernel == fixed_kernel: continue
        gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
        assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta),
                       gpr.log_marginal_likelihood(kernel.theta))
Example #30
def test_fit_linear_multi():
    for data in (mult_dense, mult_sparse):
        clf = LinearSVC(random_state=0)
        clf.fit(data, mult_target)
        y_pred = clf.predict(data)
        acc = np.mean(y_pred == mult_target)
        assert_greater(acc, 0.85)
Example #31
def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200,
                                         n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1],
                       eps=2e-3,
                       l1_ratio=[0.5, 0.7],
                       cv=3,
                       max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1],
                       eps=2e-3,
                       l1_ratio=[0.5, 0.7],
                       cv=3,
                       max_iter=max_iter,
                       precompute=True)
    ignore_warnings(clf.fit)(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=5,
                                eps=2e-3,
                                l1_ratio=[0.5, 0.7],
                                cv=3,
                                max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)
    assert_equal(clf.coef_.shape, (3, 10))

    # Mono-output should have same cross-validated alpha_ and l1_ratio_
    # in both cases.
    X, y, _, _ = build_dataset(n_features=10)
    clf1 = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1.fit(X, y)
    clf2 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf2.fit(X, y[:, np.newaxis])
    assert_almost_equal(clf1.l1_ratio_, clf2.l1_ratio_)
    assert_almost_equal(clf1.alpha_, clf2.alpha_)
Example #32
def test_kfold_can_detect_dependent_samples_on_digits():  # see #2372
    # The digits samples are dependent: they are apparently grouped by authors
    # although we don't have any information on the groups segment locations
    # for this data. We can highlight this fact by computing k-fold cross-
    # validation with and without shuffling: we observe that the shuffling case
    # wrongly makes the IID assumption and is therefore too optimistic: it
    # estimates a much higher accuracy (around 0.96) than the
    # non-shuffling variant (around 0.86).

    digits = load_digits()
    X, y = digits.data[:800], digits.target[:800]
    model = SVC(C=10, gamma=0.005)
    n = len(y)

    cv = cval.KFold(n, 5, shuffle=False)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)

    # Shuffling the data artificially breaks the dependency and hides the
    # overfitting of the model with respect to the writing style of the authors
    # by yielding a seriously overestimated score:

    cv = cval.KFold(n, 5, shuffle=True, random_state=0)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    cv = cval.KFold(n, 5, shuffle=True, random_state=1)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(mean_score, 0.95)

    # Similarly, StratifiedKFold should try to shuffle the data as little
    # as possible (while respecting the balanced class constraints)
    # and thus be able to detect the dependency by not overestimating
    # the CV score either. As the digits dataset is approximately balanced
    # the estimated mean score is close to the score measured with
    # non-shuffled KFold

    cv = cval.StratifiedKFold(y, 5)
    mean_score = cval.cross_val_score(model, X, y, cv=cv).mean()
    assert_greater(0.88, mean_score)
    assert_greater(mean_score, 0.85)
Example #33
def test_ovr_fit_predict_svc():
    ovr = OneVsRestClassifier(svm.SVC())
    ovr.fit(iris.data, iris.target)
    assert_equal(len(ovr.estimators_), 3)
    assert_greater(ovr.score(iris.data, iris.target), .9)
Example #34
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = StratifiedKFold(2)

    score, scores, pvalue = permutation_test_score(svm,
                                                   X,
                                                   y,
                                                   n_permutations=30,
                                                   cv=cv,
                                                   scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_group, _, pvalue_group = permutation_test_score(svm,
                                                          X,
                                                          y,
                                                          n_permutations=30,
                                                          cv=cv,
                                                          scoring="accuracy",
                                                          groups=np.ones(
                                                              y.size),
                                                          random_state=0)
    assert_true(score_group == score)
    assert_true(pvalue_group == pvalue)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = StratifiedKFold(2)
    score_group, _, pvalue_group = permutation_test_score(svm_sparse,
                                                          X_sparse,
                                                          y,
                                                          n_permutations=30,
                                                          cv=cv_sparse,
                                                          scoring="accuracy",
                                                          groups=np.ones(
                                                              y.size),
                                                          random_state=0)

    assert_true(score_group == score)
    assert_true(pvalue_group == pvalue)

    # test with custom scoring object
    def custom_score(y_true, y_pred):
        return (((y_true == y_pred).sum() - (y_true != y_pred).sum()) /
                y_true.shape[0])

    scorer = make_scorer(custom_score)
    score, _, pvalue = permutation_test_score(svm,
                                              X,
                                              y,
                                              n_permutations=100,
                                              scoring=scorer,
                                              cv=cv,
                                              random_state=0)
    assert_almost_equal(score, .93, 2)
    assert_almost_equal(pvalue, 0.01, 3)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = permutation_test_score(svm,
                                                   X,
                                                   y,
                                                   n_permutations=30,
                                                   cv=cv,
                                                   scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
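
The p-value assertions are consistent with the "add-one" permutation p-value, (n_permutations_at_least_as_good + 1) / (n_permutations + 1), which is, to my understanding, what permutation_test_score reports; with 30 permutations the smallest attainable p-value is about 0.032, which rounds to 0.0 at one decimal as asserted above. A quick check of that arithmetic:

# assuming the standard add-one permutation p-value
n_permutations = 30
n_as_good = 0                                    # no permuted score matches the real one
pvalue = (n_as_good + 1) / (n_permutations + 1)
print(round(pvalue, 3))                          # -> 0.032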
Example #35
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.

    def np_var(A):
        return A.var(axis=0)

    # Naive one pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X ** 2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n) ** 2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean)**2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
    def naive_mean_variance_update(x, last_mean, last_variance,
                                   last_sample_count):
        updated_sample_count = (last_sample_count + 1)
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = last_variance * samples_ratio + \
            (x - last_mean) * (x - updated_mean) / updated_sample_count
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > 1e-3 while
    # _batch_mean_variance_update has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
    A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
    A = np.vstack((A0, A1))

    # Older versions of numpy have different precision
    # In some old versions, np.var is not stable
    if np.abs(np_var(A) - two_pass_var(A)).max() < 1e-6:
        stable_var = np_var
    else:
        stable_var = two_pass_var

    # Naive one pass var: >tol (=1063)
    assert_greater(np.abs(stable_var(A) - one_pass_var(A)).max(), tol)

    # Starting point for online algorithms: after A0

    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = \
            naive_mean_variance_update(A1[i, :], mean, var, n)
    assert_equal(n, A.shape[0])
    # the mean is also slightly unstable
    assert_greater(np.abs(A.mean(axis=0) - mean).max(), 1e-6)
    assert_greater(np.abs(stable_var(A) - var).max(), tol)

    # Robust implementation: <tol (177)
    mean, var = A0[0, :], np.zeros(n_features)
    n = np.full(n_features, n_samples // 2, dtype=np.int32)
    for i in range(A1.shape[0]):
        mean, var, n = \
            _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])),
                                      mean, var, n)
    assert_array_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert_greater(tol, np.abs(stable_var(A) - var).max())
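
The instability this test exploits is catastrophic cancellation: with samples around 1e8, both E[x^2] and E[x]^2 are around 1e16, so subtracting them in float64 wipes out almost all significant digits of the true variance. A tiny standalone demonstration (illustration only, independent of the test fixtures):

import numpy as np

rng = np.random.RandomState(0)
A = 1e8 + rng.rand(10000)                    # true variance is that of U(0, 1): 1/12 ~ 0.083

one_pass = (A ** 2).mean() - A.mean() ** 2   # naive E[x^2] - E[x]^2, unstable
two_pass = ((A - A.mean()) ** 2).mean()      # shifted two-pass formula, stable

print(one_pass, two_pass, np.var(A))         # one_pass is wildly off (it can even be
                                             # negative); two_pass agrees with np.var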
Example #36
def test_minibatch_update_consistency():
    """Check that dense and sparse minibatch update give the same results"""
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = csr_row_norm_l2(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(X_mb, x_mb_squared_norms,
                                                     new_centers, counts,
                                                     buffer, 1)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(X_mb, x_mb_squared_norms,
                                          new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers)**2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr,
                                                  x_mb_squared_norms_csr,
                                                  new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers)**2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
Example #37
def test_score():
    km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42)
    s1 = km1.fit(X).score(X)
    km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42)
    s2 = km2.fit(X).score(X)
    assert_greater(s2, s1)
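
KMeans.score is documented as the opposite of the value of X on the K-means objective, i.e. the negative within-cluster sum of squares, so letting the algorithm run longer yields a score closer to zero, which is what s2 > s1 asserts. A minimal sanity check of that relationship on synthetic data (illustration only):

import numpy as np
from sklearn.cluster import KMeans

X_demo = np.random.RandomState(0).rand(100, 2)
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X_demo)
assert np.isclose(km.score(X_demo), -km.inertia_)   # score equals negative inertia on the training data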
Example #38
def test_assert_greater():
    assert_greater(1, 0)
    assert_raises(AssertionError, assert_greater, 0, 1)
Example #39
def test_binnedstratifiedkfold_has_more_stable_distribution_moments_between_folds(
):
    """check if BinnedStratifiedKFold performs on average better than KFold in terms of
    lower between-fold variance of fold mean(y_test) and fold std(y_test)
    """
    binned_has_more_stable_std_list = []
    binned_has_more_stable_mean_list = []

    for trial in range(100):
        n_folds = 2 + int(10 * np.random.rand())
        y = np.random.randn(30)
        np.random.shuffle(y)
        ymeans_binned = []
        ystds_binned = []

        cv_bs = BinnedStratifiedKFold(n_folds=n_folds,
                                      shuffle=False,
                                      random_state=None)
        bskf = cv_bs.split(y)

        cv = KFold(n_folds=n_folds, shuffle=True, random_state=None)
        kf = cv.split(y)

        #bins = np.percentile(y, np.arange(n_folds))
        bins = np.array([np.percentile(y, q) for q in range(n_folds)])

        for train_index, test_index in bskf:
            y_test = y[test_index]
            ymeans_binned.append(y_test.mean())
            ystds_binned.append(y_test.std())
            hist_, _ = np.histogram(y[test_index], bins=bins)

            assert_true(all(abs(hist_ - np.mean(hist_)) <= 1),
                        msg="too ragged bins")

        ymeans_regular = []
        ystds_regular = []
        for train_index_reg, test_index_reg in kf:
            ymeans_regular.append(y[test_index_reg].mean())
            ystds_regular.append(y[test_index_reg].std())

        binned_has_more_stable_std = np.std(ystds_regular) > np.std(
            ystds_binned)
        binned_has_more_stable_std_list.append(binned_has_more_stable_std)

        binned_has_more_stable_mean = np.std(ymeans_regular) > np.std(
            ymeans_binned)
        binned_has_more_stable_mean_list.append(binned_has_more_stable_mean)

    binned_has_more_stable_std_fraction = np.mean(
        binned_has_more_stable_std_list)
    binned_has_more_stable_mean_fraction = np.mean(
        binned_has_more_stable_mean_list)

    assert_greater(binned_has_more_stable_std_fraction, 0.5)
    assert_greater(binned_has_more_stable_mean_fraction, 0.5)
    print(" std(y_test) of BinnedStratifiedKFold was more stable than "
          "one of KFold in\t%.2f%% cases" % \
          (100.0*binned_has_more_stable_std_fraction))
    print("mean(y_test) of BinnedStratifiedKFold was more stable than "
          "one of KFold in\t%.2f%% cases" % \
          (100.0*binned_has_more_stable_mean_fraction))
Example #40
def test_permutation_score():
    iris = load_iris()
    X = iris.data
    X_sparse = coo_matrix(X)
    y = iris.target
    svm = SVC(kernel='linear')
    cv = cval.StratifiedKFold(y, 2)

    score, scores, pvalue = cval.permutation_test_score(svm,
                                                        X,
                                                        y,
                                                        cv=cv,
                                                        scoring="accuracy")
    assert_greater(score, 0.9)
    assert_almost_equal(pvalue, 0.0, 1)

    score_label, _, pvalue_label = cval.permutation_test_score(
        svm,
        X,
        y,
        cv=cv,
        scoring="accuracy",
        labels=np.ones(y.size),
        random_state=0)
    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # test with custom scoring object
    scorer = make_scorer(fbeta_score, beta=2)
    score_label, _, pvalue_label = cval.permutation_test_score(svm,
                                                               X,
                                                               y,
                                                               scoring=scorer,
                                                               cv=cv,
                                                               labels=np.ones(
                                                                   y.size),
                                                               random_state=0)
    assert_almost_equal(score_label, .97, 2)
    assert_almost_equal(pvalue_label, 0.01, 3)

    # check that we obtain the same results with a sparse representation
    svm_sparse = SVC(kernel='linear')
    cv_sparse = cval.StratifiedKFold(y, 2)
    score_label, _, pvalue_label = cval.permutation_test_score(
        svm_sparse,
        X_sparse,
        y,
        cv=cv_sparse,
        scoring="accuracy",
        labels=np.ones(y.size),
        random_state=0)

    assert_true(score_label == score)
    assert_true(pvalue_label == pvalue)

    # set random y
    y = np.mod(np.arange(len(y)), 3)

    score, scores, pvalue = cval.permutation_test_score(svm,
                                                        X,
                                                        y,
                                                        cv=cv,
                                                        scoring="accuracy")

    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)

    # test with deprecated interface
    with warnings.catch_warnings(record=True):
        score, scores, pvalue = cval.permutation_test_score(
            svm, X, y, score_func=accuracy_score, cv=cv)
    assert_less(score, 0.5)
    assert_greater(pvalue, 0.2)
Example #41
def test_all_estimators():
    estimators = all_estimators(include_meta_estimators=True)

    # Meta sanity-check to make sure that the estimator introspection runs
    # properly
    assert_greater(len(estimators), 0)
Example #42
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when the `radius` is set
    # to the mean distance from the query point to the other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with the `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # Radius-based queries do not sort the result points and the order
    # depends on the method, the random_state and the dataset order. Therefore
    # we need to sort the results ourselves before performing any comparison.
    sorted_dists_exact = np.sort(distances_exact[0])
    sorted_dists_approx = np.sort(distances_approx[0])

    # Distances to exact neighbors are less than or equal to approximate
    # counterparts as the approximate radius query might have missed some
    # closer neighbors.
    assert_true(np.all(np.less_equal(sorted_dists_exact,
                                     sorted_dists_approx)))
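Since LSHForest has since been removed from scikit-learn, here is a minimal sketch of the exact counterpart used for comparison above, `NearestNeighbors.radius_neighbors`, whose return conventions (one per-query array of results) match the assertions in the test; data and radius are illustrative.

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(42)
X = rng.rand(12, 2)
nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

query = X[0].reshape(1, -1)
mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
distances, indices = nbrs.radius_neighbors(query, radius=mean_dist)

# One entry per query; each entry holds that query's neighbor distances.
assert distances.shape == (1,)
assert np.all(distances[0] <= mean_dist)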
Example No. 43
def check_classifiers_train(name, Classifier):
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # catch deprecation warnings
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name in ['BernoulliNB', 'MultinomialNB']:
            X -= X.min()
        set_fast_parameters(classifier)
        # raises error on malformed input for fit
        assert_raises(ValueError, classifier.fit, X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert_true(hasattr(classifier, "classes_"))
        y_pred = classifier.predict(X)
        assert_equal(y_pred.shape, (n_samples,))
        # training set performance
        if name not in ['BernoulliNB', 'MultinomialNB']:
            assert_greater(accuracy_score(y, y_pred), 0.85)

        # raises error on malformed input for predict
        assert_raises(ValueError, classifier.predict, X.T)
        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples,))
                    dec_pred = (decision.ravel() > 0).astype(int)
                    assert_array_equal(dec_pred, y_pred)
                if (n_classes == 3
                        and not isinstance(classifier, BaseLibSVM)):
                    # the one-vs-one decision function of LibSVM has a
                    # different shape
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                assert_raises(ValueError,
                              classifier.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict:
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_array_almost_equal(np.sum(y_prob, axis=1),
                                      np.ones(n_samples))
            # raises error on malformed input for predict_proba
            assert_raises(ValueError, classifier.predict_proba, X.T)
Example No. 44
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X_m, y_m = iris.data, iris.target
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        n_labels = len(np.unique(y))
        n_samples, n_features = X.shape
        for name, Clf in classifiers:
            if Clf in dont_test or Clf in meta_estimators:
                continue
            if Clf in [MultinomialNB, BernoulliNB]:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                clf = Clf()
            # raises error on malformed input for fit
            assert_raises(ValueError, clf.fit, X, y[:-1])

            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            assert_equal(y_pred.shape, (n_samples, ))
            # training set performance
            assert_greater(accuracy_score(y, y_pred), 0.78)

            # raises error on malformed input for predict
            assert_raises(ValueError, clf.predict, X.T)
            if hasattr(clf, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = clf.decision_function(X)
                    if n_labels == 2:
                        assert_equal(decision.ravel().shape, (n_samples, ))
                        dec_pred = (decision.ravel() > 0).astype(int)
                        assert_array_equal(dec_pred, y_pred)
                    if n_labels == 3 and not isinstance(clf, BaseLibSVM):
                        # the one-vs-one decision function of LibSVM has a
                        # different shape
                        assert_equal(decision.shape, (n_samples, n_labels))
                        assert_array_equal(np.argmax(decision, axis=1), y_pred)

                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, clf.decision_function, X.T)
                except NotImplementedError:
                    pass
            if hasattr(clf, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = clf.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_labels))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, clf.predict_proba, X.T)
                except NotImplementedError:
                    pass
Example No. 45
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variable of dim 3, 20 of it
    h = rng.randn(n_samples, n_components)
    # using gamma to model different noise variance
    # per component
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        diff = np.diff(fa.loglike_)
        assert_greater(diff.min(), 0., 'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=0., bias=1.)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]),
                                  12)
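Outside the test harness, the core FactorAnalysis calls exercised above reduce to the following minimal sketch (same generative model as above; sizes are illustrative).

import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
n_samples, n_features, n_components = 200, 5, 3
W = rng.randn(n_components, n_features)
h = rng.randn(n_samples, n_components)
noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)
X = np.dot(h, W) + noise

fa = FactorAnalysis(n_components=n_components, svd_method='lapack').fit(X)
X_t = fa.transform(X)          # shape (n_samples, n_components)
mcov = fa.get_covariance()     # model covariance, (n_features, n_features)
avg_ll = fa.score(X)           # mean per-sample log-likelihood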
Example No. 46
def test_calibration():
    """Test calibration objects with isotonic and sigmoid"""
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples,
                               n_features=6,
                               random_state=42)
    sample_weight = np.random.RandomState(seed=42).uniform(size=y.size)

    X -= X.min()  # MultinomialNB only allows positive X

    # split train and test
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test, y_test = X[n_samples:], y[n_samples:]

    # Naive-Bayes
    clf = MultinomialNB().fit(X_train, y_train, sample_weight=sw_train)
    prob_pos_clf = clf.predict_proba(X_test)[:, 1]

    pc_clf = CalibratedClassifierCV(clf, cv=y.size + 1)
    assert_raises(ValueError, pc_clf.fit, X, y)

    # Naive Bayes with calibration
    for this_X_train, this_X_test in [(X_train, X_test),
                                      (sparse.csr_matrix(X_train),
                                       sparse.csr_matrix(X_test))]:
        for method in ['isotonic', 'sigmoid']:
            pc_clf = CalibratedClassifierCV(clf, method=method, cv=2)
            # Note that this fit overwrites the fit on the entire training
            # set
            pc_clf.fit(this_X_train, y_train, sample_weight=sw_train)
            prob_pos_pc_clf = pc_clf.predict_proba(this_X_test)[:, 1]

            # Check that brier score has improved after calibration
            assert_greater(brier_score_loss(y_test, prob_pos_clf),
                           brier_score_loss(y_test, prob_pos_pc_clf))

            # Check invariance against relabeling [0, 1] -> [1, 2]
            pc_clf.fit(this_X_train, y_train + 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [-1, 1]
            pc_clf.fit(this_X_train, 2 * y_train - 1, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = pc_clf.predict_proba(this_X_test)[:, 1]
            assert_array_almost_equal(prob_pos_pc_clf,
                                      prob_pos_pc_clf_relabeled)

            # Check invariance against relabeling [0, 1] -> [1, 0]
            pc_clf.fit(this_X_train, (y_train + 1) % 2, sample_weight=sw_train)
            prob_pos_pc_clf_relabeled = \
                pc_clf.predict_proba(this_X_test)[:, 1]
            if method == "sigmoid":
                assert_array_almost_equal(prob_pos_pc_clf,
                                          1 - prob_pos_pc_clf_relabeled)
            else:
                # Isotonic calibration is not invariant against relabeling
                # but should improve in both cases
                assert_greater(
                    brier_score_loss(y_test, prob_pos_clf),
                    brier_score_loss((y_test + 1) % 2,
                                     prob_pos_pc_clf_relabeled))

        # check that calibration can also deal with regressors that have
        # a decision_function
        clf_base_regressor = CalibratedClassifierCV(Ridge())
        clf_base_regressor.fit(X_train, y_train)
        clf_base_regressor.predict(X_test)

        # Check failure cases:
        # only "isotonic" and "sigmoid" should be accepted as methods
        clf_invalid_method = CalibratedClassifierCV(clf, method="foo")
        assert_raises(ValueError, clf_invalid_method.fit, X_train, y_train)

        # base-estimators should provide either decision_function or
        # predict_proba (most regressors, for instance, should fail)
        clf_base_regressor = \
            CalibratedClassifierCV(RandomForestRegressor(), method="sigmoid")
        assert_raises(RuntimeError, clf_base_regressor.fit, X_train, y_train)
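A compact usage sketch of the calibration API tested above; the first constructor argument is passed positionally because its name changed across releases (base_estimator, later estimator), and GaussianNB here is just an illustrative base classifier.

from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=200, n_features=6, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clf = GaussianNB().fit(X_train, y_train)
raw_probs = clf.predict_proba(X_test)[:, 1]

calibrated = CalibratedClassifierCV(clf, method='sigmoid', cv=3)
calibrated.fit(X_train, y_train)
cal_probs = calibrated.predict_proba(X_test)[:, 1]

# Calibration is expected (though not guaranteed) to lower the Brier score.
print(brier_score_loss(y_test, raw_probs),
      brier_score_loss(y_test, cal_probs))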
Example No. 47
def test_check_accuracy_on_digits():
    # Non regression test to make sure that any further refactoring / optim
    # of the NB models do not harm the performance on a slightly non-linearly
    # separable dataset
    digits = load_digits()
    X, y = digits.data, digits.target
    binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
    X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]

    # Multinomial NB
    scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
    assert_greater(scores.mean(), 0.86)

    scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.94)

    # Bernoulli NB
    scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
    assert_greater(scores.mean(), 0.83)

    scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.92)

    # Gaussian NB
    scores = cross_val_score(GaussianNB(), X, y, cv=10)
    assert_greater(scores.mean(), 0.77)

    scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
    assert_greater(scores.mean(), 0.86)
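The same non-regression idea as a standalone sketch with the current import paths; the thresholds are the ones asserted above, not independent claims.

from sklearn.datasets import load_digits
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

X, y = load_digits(return_X_y=True)
assert cross_val_score(MultinomialNB(alpha=10), X, y, cv=10).mean() > 0.86
assert cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10).mean() > 0.83
assert cross_val_score(GaussianNB(), X, y, cv=10).mean() > 0.77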
print("Timimg exact rbf: \t\t", exact_spent_time)

rbf_transform = Fastfood(
    sigma=sigma,
    n_components=number_of_features_to_generate,
    tradeoff_mem_accuracy="mem",
    random_state=42,
)
_ = rbf_transform.fit(X)
fastfood_fast_vec_start = datetime.datetime.utcnow()
# Fastfood: approximate kernel mapping
_ = rbf_transform.transform(X)
_ = rbf_transform.transform(Y)
fastfood_fast_vec_end = datetime.datetime.utcnow()
fastfood_fast_vec_spent_time = fastfood_fast_vec_end - fastfood_fast_vec_start
print("Timimg fastfood fast vectorized: \t\t", fastfood_fast_vec_spent_time)

rks_rbf_transform = RBFSampler(
    gamma=gamma, n_components=number_of_features_to_generate, random_state=42
)
_ = rks_rbf_transform.fit(X)
rks_start = datetime.datetime.utcnow()
# Random Kitchens Sinks: approximate kernel mapping
_ = rks_rbf_transform.transform(X)
_ = rks_rbf_transform.transform(Y)
rks_end = datetime.datetime.utcnow()
rks_spent_time = rks_end - rks_start
print("Timimg rks: \t\t\t", rks_spent_time)

assert_greater(rks_spent_time, fastfood_fast_vec_spent_time)
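Fastfood is not part of scikit-learn (it comes from an external package), so only the Random Kitchen Sinks half of the comparison is sketched here, timed with `time.perf_counter` instead of `datetime`; sizes and gamma are illustrative.

import time
import numpy as np
from sklearn.kernel_approximation import RBFSampler

rng = np.random.RandomState(42)
X = rng.rand(1000, 64)
Y = rng.rand(1000, 64)

rks = RBFSampler(gamma=0.5, n_components=1024, random_state=42).fit(X)
start = time.perf_counter()
_ = rks.transform(X)   # Random Kitchen Sinks: approximate RBF kernel mapping
_ = rks.transform(Y)
print("Timing rks:", time.perf_counter() - start)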
Example No. 49
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    classifiers = all_estimators(type_filter='classifier')
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        for name, Classifier in classifiers:
            if name in dont_test:
                continue
            if name in ['MultinomialNB', 'BernoulliNB']:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                classifier = Classifier()
            # raises error on malformed input for fit
            assert_raises(ValueError, classifier.fit, X, y[:-1])

            # fit
            classifier.fit(X, y)
            assert_true(hasattr(classifier, "classes_"))
            y_pred = classifier.predict(X)
            assert_equal(y_pred.shape, (n_samples, ))
            # training set performance
            assert_greater(accuracy_score(y, y_pred), 0.85)

            # raises error on malformed input for predict
            assert_raises(ValueError, classifier.predict, X.T)
            if hasattr(classifier, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = classifier.decision_function(X)
                    if n_classes == 2:
                        assert_equal(decision.ravel().shape, (n_samples, ))
                        dec_pred = (decision.ravel() > 0).astype(int)
                        assert_array_equal(dec_pred, y_pred)
                    if (n_classes == 3
                            and not isinstance(classifier, BaseLibSVM)):
                        # the one-vs-one decision function of LibSVM has a
                        # different shape
                        assert_equal(decision.shape, (n_samples, n_classes))
                        assert_array_equal(np.argmax(decision, axis=1), y_pred)

                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, classifier.decision_function,
                                  X.T)
                except NotImplementedError:
                    pass
            if hasattr(classifier, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = classifier.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # check that probas for all classes sum to one
                    assert_array_almost_equal(np.sum(y_prob, axis=1),
                                              np.ones(n_samples))
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, classifier.predict_proba, X.T)
                except NotImplementedError:
                    pass
Example No. 50
def test_explained_variance():
    # Test sparse data
    svd_a_10_sp = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_sp = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_sp = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_sp = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_sp = svd_a_10_sp.fit_transform(X)
    X_trans_r_10_sp = svd_r_10_sp.fit_transform(X)
    X_trans_a_20_sp = svd_a_20_sp.fit_transform(X)
    X_trans_r_20_sp = svd_r_20_sp.fit_transform(X)

    # Test dense data
    svd_a_10_de = TruncatedSVD(10, algorithm="arpack")
    svd_r_10_de = TruncatedSVD(10, algorithm="randomized", random_state=42)
    svd_a_20_de = TruncatedSVD(20, algorithm="arpack")
    svd_r_20_de = TruncatedSVD(20, algorithm="randomized", random_state=42)
    X_trans_a_10_de = svd_a_10_de.fit_transform(X.toarray())
    X_trans_r_10_de = svd_r_10_de.fit_transform(X.toarray())
    X_trans_a_20_de = svd_a_20_de.fit_transform(X.toarray())
    X_trans_r_20_de = svd_r_20_de.fit_transform(X.toarray())

    # helper arrays for tests below
    svds = (svd_a_10_sp, svd_r_10_sp, svd_a_20_sp, svd_r_20_sp, svd_a_10_de,
            svd_r_10_de, svd_a_20_de, svd_r_20_de)
    svds_trans = (
        (svd_a_10_sp, X_trans_a_10_sp),
        (svd_r_10_sp, X_trans_r_10_sp),
        (svd_a_20_sp, X_trans_a_20_sp),
        (svd_r_20_sp, X_trans_r_20_sp),
        (svd_a_10_de, X_trans_a_10_de),
        (svd_r_10_de, X_trans_r_10_de),
        (svd_a_20_de, X_trans_a_20_de),
        (svd_r_20_de, X_trans_r_20_de),
    )
    svds_10_v_20 = (
        (svd_a_10_sp, svd_a_20_sp),
        (svd_r_10_sp, svd_r_20_sp),
        (svd_a_10_de, svd_a_20_de),
        (svd_r_10_de, svd_r_20_de),
    )
    svds_sparse_v_dense = (
        (svd_a_10_sp, svd_a_10_de),
        (svd_a_20_sp, svd_a_20_de),
        (svd_r_10_sp, svd_r_10_de),
        (svd_r_20_sp, svd_r_20_de),
    )

    # Assert that the first 10 explained variance ratios agree between the
    # 10- and 20-component models
    for svd_10, svd_20 in svds_10_v_20:
        assert_array_almost_equal(
            svd_10.explained_variance_ratio_,
            svd_20.explained_variance_ratio_[:10],
            decimal=5,
        )

    # Assert that 20 components have higher explained variance than 10
    for svd_10, svd_20 in svds_10_v_20:
        assert_greater(
            svd_20.explained_variance_ratio_.sum(),
            svd_10.explained_variance_ratio_.sum(),
        )

    # Assert that all the values are greater than 0
    for svd in svds:
        assert_array_less(0.0, svd.explained_variance_ratio_)

    # Assert that total explained variance is less than 1
    for svd in svds:
        assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)

    # Compare sparse vs. dense
    for svd_sparse, svd_dense in svds_sparse_v_dense:
        assert_array_almost_equal(svd_sparse.explained_variance_ratio_,
                                  svd_dense.explained_variance_ratio_)

    # Test that explained_variance is correct
    for svd, transformed in svds_trans:
        total_variance = np.var(X.toarray(), axis=0).sum()
        variances = np.var(transformed, axis=0)
        true_explained_variance_ratio = variances / total_variance

        assert_array_almost_equal(
            svd.explained_variance_ratio_,
            true_explained_variance_ratio,
        )
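The central property checked above, in a minimal sketch on synthetic sparse data (shapes, density and component counts are illustrative).

import numpy as np
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(42)
X = sp.random(60, 55, density=0.25, format='csr', random_state=rng)

svd_10 = TruncatedSVD(10, algorithm='randomized', random_state=42).fit(X)
svd_20 = TruncatedSVD(20, algorithm='randomized', random_state=42).fit(X)

# More components explain more variance, and the total stays below 1.
assert (svd_20.explained_variance_ratio_.sum() >
        svd_10.explained_variance_ratio_.sum())
assert svd_20.explained_variance_ratio_.sum() < 1.0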
Example No. 51
def test_linear_svr_fit_intercept():
    reg = LinearSVR(random_state=0, fit_intercept=True)
    reg.fit(reg_dense, reg_target)
    assert_greater(reg.score(reg_dense, reg_target), 0.99)
Example No. 52
def test_prediction_scores(self):
    pred_scores = self.clf.decision_function(self.X_test)
    # check score shapes
    assert_equal(pred_scores.shape[0], self.X_test.shape[0])
    # check performance
    assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor)
Example No. 53
def check_explicit_sparse_zeros(tree, max_depth=3, n_features=10):
    TreeEstimator = ALL_TREES[tree]

    # set n_samples equal to n_features to ease the simultaneous
    # construction of a csr and a csc matrix
    n_samples = n_features
    samples = np.arange(n_samples)

    # Generate X, y
    random_state = check_random_state(0)
    indices = []
    data = []
    offset = 0
    indptr = [offset]
    for i in range(n_features):
        n_nonzero_i = random_state.binomial(n_samples, 0.5)
        indices_i = random_state.permutation(samples)[:n_nonzero_i]
        indices.append(indices_i)
        data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i, )) - 1
        data.append(data_i)
        offset += n_nonzero_i
        indptr.append(offset)

    indices = np.concatenate(indices)
    data = np.array(np.concatenate(data), dtype=np.float32)
    X_sparse = csc_matrix((data, indices, indptr),
                          shape=(n_samples, n_features))
    X = X_sparse.toarray()
    X_sparse_test = csr_matrix((data, indices, indptr),
                               shape=(n_samples, n_features))
    X_test = X_sparse_test.toarray()
    y = random_state.randint(0, 3, size=(n_samples, ))

    # Ensure that X_sparse_test owns its data, indices and indptr array
    X_sparse_test = X_sparse_test.copy()

    # Ensure that we have explicit zeros
    assert_greater((X_sparse.data == 0.).sum(), 0)
    assert_greater((X_sparse_test.data == 0.).sum(), 0)

    # Perform the comparison
    d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
    s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)

    assert_tree_equal(
        d.tree_, s.tree_, "{0} with dense and sparse format gave different "
        "trees".format(tree))

    Xs = (X_test, X_sparse_test)
    for X1, X2 in product(Xs, Xs):
        assert_array_almost_equal(s.tree_.apply(X1), d.tree_.apply(X2))
        assert_array_almost_equal(s.apply(X1), d.apply(X2))
        assert_array_almost_equal(s.apply(X1), s.tree_.apply(X1))

        assert_array_almost_equal(
            s.tree_.decision_path(X1).toarray(),
            d.tree_.decision_path(X2).toarray())
        assert_array_almost_equal(
            s.decision_path(X1).toarray(),
            d.decision_path(X2).toarray())
        assert_array_almost_equal(
            s.decision_path(X1).toarray(),
            s.tree_.decision_path(X1).toarray())

        assert_array_almost_equal(s.predict(X1), d.predict(X2))

        if tree in CLF_TREES:
            assert_array_almost_equal(s.predict_proba(X1), d.predict_proba(X2))
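The behaviour compared above, dense versus sparse input to the same tree, can be sketched directly on the public estimator API; the data below is illustrative, and the agreement of predictions is exactly the property the test asserts.

import numpy as np
from scipy.sparse import csc_matrix, csr_matrix
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.binomial(3, 0.5, size=(40, 10)).astype(np.float32) - 1
y = rng.randint(0, 3, size=40)

dense_tree = DecisionTreeClassifier(random_state=0, max_depth=3).fit(X, y)
sparse_tree = DecisionTreeClassifier(random_state=0, max_depth=3).fit(
    csc_matrix(X), y)

# Dense and sparse fits should yield the same predictions.
assert np.array_equal(dense_tree.predict(X),
                      sparse_tree.predict(csr_matrix(X)))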
Example No. 54
def test_multiclass_sgd():
    clf = SGDClassifier(random_state=0)
    clf.fit(mult_dense, mult_target)
    assert_greater(clf.score(mult_dense, mult_target), 0.80)
    assert_equal(list(clf.classes_), [0, 1, 2])
Example No. 55
def test_lgmlvq_iris():
    check_estimator(LgmlvqModel)
    model = LgmlvqModel()
    model.fit(iris.data, iris.target)
    assert_greater(model.score(iris.data, iris.target), 0.95)

    assert_raise_message(ValueError, 'regularization must be a positive float',
                         LgmlvqModel(regularization=-1.0).fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError,
                         'length of regularization'
                         ' must be number of prototypes',
                         LgmlvqModel(regularization=[-1.0]).fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError,
                         'length of regularization must be number of classes',
                         LgmlvqModel(regularization=[-1.0],
                                     classwise=True).fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError, 'initial matrices must be a list',
                         LgmlvqModel(initial_matrices=np.array(
                             [[1, 2], [3, 4], [5, 6]])).fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError, 'length of matrices wrong',
                         LgmlvqModel(
                             initial_matrices=[[[1, 2], [3, 4], [5, 6]]]).fit,
                         iris.data, iris.target)
    assert_raise_message(ValueError, 'each matrix should have',
                         LgmlvqModel(
                             initial_matrices=[[[1]], [[1]], [[1]]]).fit,
                         iris.data, iris.target)
    assert_raise_message(ValueError, 'length of matrices wrong',
                         LgmlvqModel(initial_matrices=[[[1, 2, 3]]],
                                     classwise=True).fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError, 'each matrix should have',
                         LgmlvqModel(initial_matrices=[[[1]], [[1]], [[1]]],
                                     classwise=True).fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError, 'classwise must be a boolean',
                         LgmlvqModel(classwise="a").fit, iris.data,
                         iris.target)
    assert_raise_message(ValueError, 'dim must be a list of positive ints',
                         LgmlvqModel(dim=[-1]).fit, iris.data, iris.target)
    assert_raise_message(ValueError, 'dim length must be number of prototypes',
                         LgmlvqModel(dim=[1, 1]).fit, iris.data, iris.target)
    assert_raise_message(ValueError, 'dim length must be number of classes',
                         LgmlvqModel(dim=[1, 1], classwise=True).fit,
                         iris.data, iris.target)
    LgmlvqModel(classwise=True, dim=[1], prototypes_per_class=2).fit(
        iris.data, iris.target)

    model = LgmlvqModel(regularization=0.1)
    model.fit(iris.data, iris.target)

    model = LgmlvqModel(initial_prototypes=[[0, 2, 1], [1, 6, 2]],
                        initial_matrices=[np.ones([2, 2]), np.ones([2, 2])],
                        dim=[2, 2])
    x = np.array([[0, 0], [0, 4], [1, 4], [1, 8]])
    y = np.array([1, 1, 2, 2])
    model.fit(x, y)
Example No. 56
def test_linear_svr():
    reg = LinearSVR(random_state=0)
    reg.fit(reg_dense, reg_target)
    assert_greater(reg.score(reg_dense, reg_target), 0.99)
Example No. 57
def test_perceptron_accuracy():
    for data in (X, X_csr):
        clf = Perceptron(max_iter=100, tol=None, shuffle=False)
        clf.fit(data, y)
        score = clf.score(data, y)
        assert_greater(score, 0.7)
Example No. 58
def test_lml_improving(kernel):
    # Test that hyperparameter-tuning improves log-marginal likelihood.
    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)
    assert_greater(gpr.log_marginal_likelihood(gpr.kernel_.theta),
                   gpr.log_marginal_likelihood(kernel.theta))
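A standalone sketch of the same property, with an assumed RBF kernel and toy 1-D data (both illustrative): the fitted kernel's log-marginal likelihood should be at least as high as the initial kernel's.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

rng = np.random.RandomState(0)
X = rng.uniform(0, 5, size=(30, 1))
y = np.sin(X[:, 0]) + 0.1 * rng.randn(30)

kernel = RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)

assert (gpr.log_marginal_likelihood(gpr.kernel_.theta) >=
        gpr.log_marginal_likelihood(kernel.theta))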
Example No. 59
def check_classifiers_train(name, classifier_orig, readonly_memmap=False):
    X_m, y_m = make_blobs(n_samples=300, random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]

    if readonly_memmap:
        X_b, y_b = create_memmap_backed_data([X_b, y_b])

    for (X, y) in [(X_b, y_b)]:
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, _ = X.shape
        classifier = clone(classifier_orig)
        X = pairwise_estimator_convert_X(X, classifier)
        y = multioutput_estimator_convert_y_2d(classifier, y)

        set_random_state(classifier)
        # raises error on malformed input for fit
        with assert_raises(ValueError,
                           msg="The classifier {} does not "
                           "raise an error when incorrect/malformed input "
                           "data for fit is passed. The number of training "
                           "examples is not the same as the number of labels. "
                           "Perhaps use check_X_y in fit.".format(name)):
            classifier.fit(X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert hasattr(classifier, "classes_")
        y_pred = classifier.predict(X)

        assert_equal(y_pred.shape, (n_samples, ))
        # training set performance
        assert_greater(accuracy_score(y, y_pred), 0.83)

        # raises error on malformed input for predict
        msg = ("The classifier {} does not raise an error when the number of "
               "features in {} is different from the number of features in "
               "fit.")

        with assert_raises(ValueError, msg=msg.format(name, "predict")):
            classifier.predict(X.T)
        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples, 1))
                    dec_pred = (decision.ravel() > 0).astype(int)
                    assert_array_equal(dec_pred, y_pred)
                else:
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                with assert_raises(ValueError,
                                   msg=msg.format(name, "decision_function")):
                    classifier.decision_function(X.T)
            except NotImplementedError:
                pass

        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_array_almost_equal(np.sum(y_prob, axis=1),
                                      np.ones(n_samples))
            # raises error on malformed input for predict_proba
            with assert_raises(ValueError,
                               msg=msg.format(name, "predict_proba")):
                classifier.predict_proba(X.T)
            if hasattr(classifier, "predict_log_proba"):
                # predict_log_proba is a transformation of predict_proba
                y_log_prob = classifier.predict_log_proba(X)
                assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9)
                assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
Example No. 60
def test_whitening():
    # Check that PCA output has unit-variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80
    n_components = 30
    rank = 50

    # some low rank data with correlated features
    X = np.dot(
        rng.randn(n_samples, rank),
        np.dot(np.diag(np.linspace(10.0, 1.0, rank)),
               rng.randn(rank, n_features)),
    )
    # the component-wise variance of the first 50 features is 3 times the
    # mean component-wise variance of the remaining 30 features
    X[:, :50] *= 3

    assert_equal(X.shape, (n_samples, n_features))

    # the component-wise variance is thus highly varying:
    assert_greater(X.std(axis=0).std(), 43.8)
    dX = da.from_array(X, chunks=(50, n_features))

    for solver, copy in product(solver_list, (True, False)):
        # whiten the data while projecting to the lower dim subspace
        X_ = dX.copy()  # make sure we keep an original across iterations.
        pca = dd.PCA(
            n_components=n_components,
            whiten=True,
            copy=copy,
            svd_solver=solver,
            random_state=0,
            iterated_power=4,
        )
        # test fit_transform
        X_whitened = pca.fit_transform(X_.copy())
        assert_equal(X_whitened.shape, (n_samples, n_components))
        # X_whitened2 = pca.transform(X_)
        # XXX: These differ for randomized.
        # assert_eq(X_whitened.compute(), X_whitened2.compute(),
        #           atol=tol, rtol=tol)

        assert_almost_equal(X_whitened.std(ddof=1, axis=0),
                            np.ones(n_components),
                            decimal=6)
        assert_almost_equal(X_whitened.mean(axis=0), np.zeros(n_components))

        X_ = dX.copy()
        pca = dd.PCA(
            n_components=n_components,
            whiten=False,
            copy=copy,
            svd_solver=solver,
            random_state=0,
        ).fit(X_)
        X_unwhitened = pca.transform(X_)
        assert_equal(X_unwhitened.shape, (n_samples, n_components))

        # in that case the output components still have varying variances
        assert_almost_equal(X_unwhitened.std(axis=0).std(), 74.1, 1)
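The same whitening property holds for plain scikit-learn PCA, whose API the dask-ml estimator above mirrors; a minimal sketch with illustrative sizes:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
n_samples, rank, n_features, n_components = 100, 50, 80, 30
X = np.dot(rng.randn(n_samples, rank),
           np.dot(np.diag(np.linspace(10.0, 1.0, rank)),
                  rng.randn(rank, n_features)))
X[:, :50] *= 3

X_whitened = PCA(n_components=n_components, whiten=True,
                 random_state=0).fit_transform(X)

# Whitened components have (approximately) unit variance and zero mean.
assert np.allclose(X_whitened.std(ddof=1, axis=0), 1.0, atol=1e-6)
assert np.allclose(X_whitened.mean(axis=0), 0.0, atol=1e-8)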