Example #1
0
def test_all_points_mem_vec_same_clusters():
    """
    Check that the training-set membership vectors have one column per
        cluster found by the clusterer, one row per training point, and
        no entry above 1 (up to a 1e-14 tolerance).
    """
    def _verify_memberships(flat_clusterer):
        # When all_points_membership_vectors_flat is called,
        vectors = all_points_membership_vectors_flat(flat_clusterer)

        # Then there is one column per cluster present in the labels,
        found = n_clusters_from_labels(flat_clusterer.labels_)
        assert vectors.shape[1] == found
        # one row per training point,
        assert len(vectors) == len(X)
        # and every probability is <= 1.
        ceiling = np.ones(vectors.shape) + 1.e-14
        assert_array_less(vectors, ceiling)

    # Scenario 1: n_clusters=None, i.e. the count is picked by HDBSCAN.
    auto_clusterer = HDBSCAN_flat(X, n_clusters=None)
    _verify_memberships(auto_clusterer)

    # Scenario 2: explicitly request two clusters fewer than scenario 1 found.
    requested = n_clusters_from_labels(auto_clusterer.labels_) - 2
    reduced_clusterer = HDBSCAN_flat(X, n_clusters=requested)
    _verify_memberships(reduced_clusterer)
Example #2
0
def test_iris():
    """Check AdaBoostClassifier consistency on iris for both algorithms."""
    expected_classes = np.unique(iris.target)
    samme_model = None
    samme_proba = None

    for algorithm in ('SAMME', 'SAMME.R'):
        booster = AdaBoostClassifier(algorithm=algorithm)
        booster.fit(iris.data, iris.target)

        assert_array_equal(expected_classes, booster.classes_)

        probabilities = booster.predict_proba(iris.data)
        if algorithm == "SAMME":
            # Kept for the regression check after the loop.
            samme_model, samme_proba = booster, probabilities
        assert probabilities.shape[1] == len(expected_classes)

        decisions = booster.decision_function(iris.data)
        assert decisions.shape[1] == len(expected_classes)

        accuracy = booster.score(iris.data, iris.target)
        assert accuracy > 0.9, (
            "Failed with algorithm %s and score = %f" % (algorithm, accuracy)
        )

        # More than one estimator must have been fitted
        assert len(booster.estimators_) > 1
        # Every estimator must carry its own random state (see issue #7408)
        distinct_states = {est.random_state for est in booster.estimators_}
        assert len(distinct_states) == len(booster.estimators_)

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    # Switching the fitted SAMME model to SAMME.R must change every
    # probability (the absolute difference is strictly positive everywhere).
    samme_model.algorithm = "SAMME.R"
    difference = np.abs(samme_model.predict_proba(iris.data) - samme_proba)
    assert_array_less(0, difference)
Example #3
0
def test_solution_inside_bounds(kernel):
    """Check that the optimised hyperparameters stay within the kernel bounds.

    ``kernel`` is passed to GaussianProcessRegressor, which is fitted on the
    module-level ``X`` and ``y``.
    """
    gpr = GaussianProcessRegressor(kernel=kernel).fit(X, y)

    limits = gpr.kernel_.bounds
    slack = 1e-10
    largest_finite = np.finfo(gpr.kernel_.theta.dtype).max

    # Replace non-finite upper bounds with the largest representable value
    # so the comparison below is made against a finite number.
    non_finite_upper = ~np.isfinite(limits[:, 1])
    limits[non_finite_upper, 1] = largest_finite

    lower, upper = limits[:, 0], limits[:, 1]
    # lower < theta + slack  and  theta < upper + slack, element-wise
    assert_array_less(lower, gpr.kernel_.theta + slack)
    assert_array_less(gpr.kernel_.theta, upper + slack)
Example #4
0
def test_std_bayesian_ridge_ard_with_constant_input():
    """Check the predictive std of BayesianRidge and ARDRegression when the
    target vector is constant: it must be below 0.01 for every sample."""
    rng = check_random_state(42)
    n_samples, n_features = 4, 5

    # Draw the constant first, then the design matrix, from the same stream.
    constant_value = rng.rand()
    X = rng.random_sample((n_samples, n_features))
    target_dtype = np.array(constant_value).dtype
    y = np.full(n_samples, constant_value, dtype=target_dtype)

    expected_upper_boundary = 0.01
    for regressor in (BayesianRidge(), ARDRegression()):
        fitted = regressor.fit(X, y)
        _, y_std = fitted.predict(X, return_std=True)
        assert_array_less(y_std, expected_upper_boundary)
Example #5
0
def test_graphical_lasso(random_state=0):
    """Compare the 'cd' and 'lars' modes of graphical_lasso and smoke-test
    the GraphicalLasso estimator.

    ``random_state`` seeds both the sparse precision matrix and the samples
    drawn from the corresponding multivariate normal.
    """
    dim, n_samples = 20, 100
    rng = check_random_state(random_state)

    # Sample data from a sparse multivariate normal
    precision = make_sparse_spd_matrix(dim, alpha=.95, random_state=rng)
    covariance = linalg.inv(precision)
    X = rng.multivariate_normal(np.zeros(dim), covariance, size=n_samples)
    emp_cov = empirical_covariance(X)

    for alpha in (0., .1, .25):
        covs, icovs = {}, {}
        for method in ('cd', 'lars'):
            cov_, icov_, cost_log = graphical_lasso(emp_cov,
                                                    return_costs=True,
                                                    alpha=alpha,
                                                    mode=method)
            covs[method], icovs[method] = cov_, icov_
            # Each logged entry unpacks into (cost, dual gap); only the
            # cost column is checked.
            costs, _dual_gap = np.array(cost_log).T
            # The costs must always decrease (doesn't hold if alpha == 0)
            if alpha != 0:
                assert_array_less(np.diff(costs), 0)
        # Both modes must agree to 4 decimals
        for results in (covs, icovs):
            assert_array_almost_equal(results['cd'], results['lars'],
                                      decimal=4)

    # Smoke test the estimator. ``covs`` still holds the alpha=.25 results
    # from the last loop iteration, matching the estimator's alpha.
    model = GraphicalLasso(alpha=.25).fit(X)
    model.score(X)
    for method in ('cd', 'lars'):
        assert_array_almost_equal(model.covariance_, covs[method], decimal=4)

    # For a centered matrix, assume_centered could be chosen True or False:
    # both choices must yield the same precision matrix.
    Z = X - X.mean(0)
    precisions = [
        GraphicalLasso(assume_centered=flag).fit(Z).precision_
        for flag in (False, True)
    ]
    assert_array_almost_equal(precisions[0], precisions[1])
def test_explained_variance(X_sparse, kind, n_components, solver):
    """Check explained_variance_ratio_ of TruncatedSVD.

    The model is fitted on ``X_sparse`` itself when ``kind == "sparse"`` and
    on its dense conversion otherwise. The ratios must be strictly positive,
    sum to less than 1, and match the ratios recomputed from the
    transformed data.
    """
    if kind == "sparse":
        data = X_sparse
    else:
        data = X_sparse.toarray()

    model = TruncatedSVD(n_components, algorithm=solver)
    projected = model.fit_transform(data)

    # All ratios are greater than 0
    assert_array_less(0.0, model.explained_variance_ratio_)

    # The total explained variance is less than 1
    assert_array_less(model.explained_variance_ratio_.sum(), 1.0)

    # Recompute the ratios: per-component variance of the transformed data
    # divided by the summed per-feature variance of the dense input.
    total_variance = np.var(X_sparse.toarray(), axis=0).sum()
    expected_ratio = np.var(projected, axis=0) / total_variance

    assert_allclose(model.explained_variance_ratio_, expected_ratio)
Example #7
0
def test_mem_vec_diff_clusters():
    """
    Verify membership vector produces as many clusters as requested
    """
    # Ignore user warnings in this function only. The filter is installed
    # inside catch_warnings() so the previous filter state is restored on
    # exit; a bare module-level filterwarnings() call would stay active for
    # every test that runs afterwards and could hide warnings they assert on.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)

        # Given a flat clustering trained for n_clusters picked by HDBSCAN,
        n_clusters_fit = None
        clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit)
        n_clusters_fitted = n_clusters_from_labels(clusterer.labels_)

        # When membership_vector_flat is called with new data for some
        # n_clusters,
        n_clusters_predict = n_clusters_fitted + 3
        memberships = membership_vector_flat(clusterer,
                                             X_test,
                                             n_clusters=n_clusters_predict)

        # Then the number of clusters in memberships should be as requested,
        assert memberships.shape[1] == n_clusters_predict
        # and the number of points should equal those in the test set
        assert len(memberships) == len(X_test)
        # and all probabilities are <= 1.
        assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14)

        # ========================================
        # Given a flat clustering for a specified n_clusters,
        n_clusters_fit = n_clusters_from_labels(clusterer.labels_) + 2
        clusterer = HDBSCAN_flat(X, n_clusters=n_clusters_fit)

        # When membership_vector_flat is called with new data for some
        # n_clusters,
        n_clusters_predict = n_clusters_fit + 3
        memberships = membership_vector_flat(clusterer,
                                             X_test,
                                             n_clusters=n_clusters_predict)

        # Then the number of clusters in memberships should be as requested,
        assert memberships.shape[1] == n_clusters_predict
        # and the number of points should equal those in the test set
        assert len(memberships) == len(X_test)
        # and all probabilities are <= 1.
        assert_array_less(memberships, np.ones(memberships.shape) + 1.e-14)
Example #8
0
def test_approx_predict_same_clusters():
    """
    Check that approximate_predict_flat, called with n_clusters=None,
    yields the same number of clusters the clusterer was trained for.
    """
    # Given a flat 'eom' clustering trained for a fixed number of clusters,
    expected_clusters = 5
    flat_clusterer = HDBSCAN_flat(
        X,
        cluster_selection_method='eom',
        n_clusters=expected_clusters,
    )

    # When predicting on the test set without specifying n_clusters,
    predicted_labels, predicted_proba = approximate_predict_flat(
        flat_clusterer, X_test, n_clusters=None)

    # Then the predicted labels contain exactly the trained cluster count
    assert n_clusters_from_labels(predicted_labels) == expected_clusters
    # and all probabilities are <= 1.
    ceiling = np.ones(len(predicted_proba)) + 1.e-14
    assert_array_less(predicted_proba, ceiling)
Example #9
0
def test_approx_predict_diff_clusters():
    """
    Verify that approximate_predict_flat produces as many clusters as asked
    """
    # Given a flat clustering trained for some n_clusters,
    n_clusters_fit = 5
    clusterer = HDBSCAN_flat(X,
                             cluster_selection_method='eom',
                             n_clusters=n_clusters_fit,
                             prediction_data=True)

    # When using approximate_predict_flat with specified n_clusters,
    n_clusters_predict = 3
    labels_flat, proba_flat = approximate_predict_flat(
        clusterer, X_test, n_clusters=n_clusters_predict)

    # Then, the requested number of clusters must be produced
    n_clusters_out = n_clusters_from_labels(labels_flat)
    assert n_clusters_out == n_clusters_predict
    # and all probabilities are <= 1.
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)

    # When using approximate_predict_flat with more clusters
    #   than 'eom' can handle,
    n_clusters_predict = 12
    with warnings.catch_warnings(record=True) as w:
        # Record every warning regardless of the filters already installed
        # (e.g. an "ignore" filter set elsewhere, or the default
        # once-per-location suppression); otherwise the list may stay empty
        # even though the warning was issued.
        warnings.simplefilter("always")
        labels_flat, proba_flat = approximate_predict_flat(
            clusterer, X_test, n_clusters=n_clusters_predict)
    # Then, a warning is raised saying 'eom' can't get this clustering.
    # All recorded warnings are searched rather than only the last one, so
    # an unrelated warning emitted afterwards does not break the check.
    assert len(w) > 0
    assert any(
        issubclass(caught.category, UserWarning)
        and "Cannot predict" in str(caught.message)
        for caught in w
    )
    # But the requested number of clusters must still be produced using 'leaf'
    n_clusters_out = n_clusters_from_labels(labels_flat)
    assert n_clusters_out == n_clusters_predict
    # and all probabilities are <= 1.
    assert_array_less(proba_flat, np.ones(len(proba_flat)) + 1.e-14)
Example #10
0
def test_explained_variance(setup):
    """Check explained_variance_ratio_ of randomized TruncatedSVD with 10 and
    20 components, fitted on the module-level ``X`` and on ``X.toarray()``.

    ``setup`` is not used in the body; presumably a fixture that prepares
    the execution environment -- confirm against the test module.
    """
    def _new_svd(n_components):
        return TruncatedSVD(n_components, algorithm="randomized",
                            random_state=42)

    # Fit on the sparse data
    sparse_10, sparse_20 = _new_svd(10), _new_svd(20)
    sparse_10_out = sparse_10.fit_transform(X)
    sparse_20_out = sparse_20.fit_transform(X)

    # Fit on the dense data
    dense_10, dense_20 = _new_svd(10), _new_svd(20)
    dense_10_out = dense_10.fit_transform(X.toarray())
    dense_20_out = dense_20.fit_transform(X.toarray())

    # Groupings used by the checks below
    fitted_with_output = [
        (sparse_10, sparse_10_out),
        (sparse_20, sparse_20_out),
        (dense_10, dense_10_out),
        (dense_20, dense_20_out),
    ]
    fitted_models = [model for model, _ in fitted_with_output]
    small_vs_large = [(sparse_10, sparse_20), (dense_10, dense_20)]
    sparse_vs_dense = [(sparse_10, dense_10), (sparse_20, dense_20)]

    # The first 10 ratios of the 20-component model must match the
    # 10-component model to 4 decimals
    for small, large in small_vs_large:
        assert_array_almost_equal(
            small.explained_variance_ratio_.to_numpy(),
            large.explained_variance_ratio_[:10].to_numpy(),
            decimal=4,
        )

    # 20 components must explain more variance than 10
    for small, large in small_vs_large:
        large_total = large.explained_variance_ratio_.sum().to_numpy()
        small_total = small.explained_variance_ratio_.sum().to_numpy()
        assert large_total > small_total

    # Every ratio is greater than 0
    for model in fitted_models:
        assert_array_less(0.0, model.explained_variance_ratio_.to_numpy())

    # The total explained variance is less than 1
    for model in fitted_models:
        assert_array_less(
            model.explained_variance_ratio_.sum().to_numpy(), 1.0)

    # Sparse and dense fits must agree
    for from_sparse, from_dense in sparse_vs_dense:
        assert_array_almost_equal(
            from_sparse.explained_variance_ratio_.to_numpy(),
            from_dense.explained_variance_ratio_.to_numpy())

    # Recompute the ratios from the transformed data: per-component variance
    # divided by the summed per-feature variance of the dense input
    for model, projected in fitted_with_output:
        total_variance = mt.var(X.toarray(), axis=0).sum().to_numpy()
        expected_ratio = mt.var(projected, axis=0) / total_variance

        assert_array_almost_equal(
            model.explained_variance_ratio_.to_numpy(),
            expected_ratio.to_numpy(),
        )