def test_sag_multiclass_computed_correctly(): """tests if the multiclass classifier is computed correctly""" alpha = .1 n_samples = 20 tol = .00001 max_iter = 40 fit_intercept = True X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1) step_size = get_step_size(X, alpha, fit_intercept, classification=True) classes = np.unique(y) clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples, max_iter=max_iter, tol=tol, random_state=77, fit_intercept=fit_intercept, multi_class='ovr') clf2 = clone(clf1) clf1.fit(X, y) clf2.fit(sp.csr_matrix(X), y) coef1 = [] intercept1 = [] coef2 = [] intercept2 = [] for cl in classes: y_encoded = np.ones(n_samples) y_encoded[y != cl] = -1 spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha, dloss=log_dloss, n_iter=max_iter, fit_intercept=fit_intercept) spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha, dloss=log_dloss, n_iter=max_iter, sparse=True, fit_intercept=fit_intercept) coef1.append(spweights1) intercept1.append(spintercept1) coef2.append(spweights2) intercept2.append(spintercept2) coef1 = np.vstack(coef1) intercept1 = np.array(intercept1) coef2 = np.vstack(coef2) intercept2 = np.array(intercept2) for i, cl in enumerate(classes): assert_array_almost_equal(clf1.coef_[i].ravel(), coef1[i].ravel(), decimal=2) assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1) assert_array_almost_equal(clf2.coef_[i].ravel(), coef2[i].ravel(), decimal=2) assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def test_non_consecutive_labels(): # regression tests for labels with gaps h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 2, 2, 2], [0, 1, 0, 1, 2, 2]) assert_almost_equal(h, 0.67, 2) assert_almost_equal(c, 0.42, 2) assert_almost_equal(v, 0.52, 2) h, c, v = homogeneity_completeness_v_measure([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) assert_almost_equal(h, 0.67, 2) assert_almost_equal(c, 0.42, 2) assert_almost_equal(v, 0.52, 2) ari_1 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) ari_2 = adjusted_rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) assert_almost_equal(ari_1, 0.24, 2) assert_almost_equal(ari_2, 0.24, 2) ri_1 = rand_score([0, 0, 0, 1, 1, 1], [0, 1, 0, 1, 2, 2]) ri_2 = rand_score([0, 0, 0, 1, 1, 1], [0, 4, 0, 4, 2, 2]) assert_almost_equal(ri_1, 0.66, 2) assert_almost_equal(ri_2, 0.66, 2)
def test_entropy(): ent = entropy([0, 0, 42.]) assert_almost_equal(ent, 0.6365141, 5) assert_almost_equal(entropy([]), 1)
def check_randomized_svd_low_rank(dtype): # Check that extmath.randomized_svd is consistent with linalg.svd n_samples = 100 n_features = 500 rank = 5 k = 10 decimal = 5 if dtype == np.float32 else 7 dtype = np.dtype(dtype) # generate a matrix X of approximate effective rank `rank` and no noise # component (very structured signal): X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.0, random_state=0).astype(dtype, copy=False) assert X.shape == (n_samples, n_features) # compute the singular values of X using the slow exact method U, s, Vt = linalg.svd(X, full_matrices=False) # Convert the singular values to the specific dtype U = U.astype(dtype, copy=False) s = s.astype(dtype, copy=False) Vt = Vt.astype(dtype, copy=False) for normalizer in ['auto', 'LU', 'QR']: # 'none' would not be stable # compute the singular values of X using the fast approximate method Ua, sa, Va = randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) # If the input dtype is float, then the output dtype is float of the # same bit size (f32 is not upcast to f64) # But if the input dtype is int, the output dtype is float64 if dtype.kind == 'f': assert Ua.dtype == dtype assert sa.dtype == dtype assert Va.dtype == dtype else: assert Ua.dtype == np.float64 assert sa.dtype == np.float64 assert Va.dtype == np.float64 assert Ua.shape == (n_samples, k) assert sa.shape == (k, ) assert Va.shape == (k, n_features) # ensure that the singular values of both methods are equal up to the # real rank of the matrix assert_almost_equal(s[:k], sa, decimal=decimal) # check the singular vectors too (while not checking the sign) assert_almost_equal(np.dot(U[:, :k], Vt[:k, :]), np.dot(Ua, Va), decimal=decimal) # check the sparse matrix representation X = sparse.csr_matrix(X) # compute the singular values of X using the fast approximate method Ua, sa, Va = \ randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) if dtype.kind == 'f': assert Ua.dtype == dtype assert sa.dtype == dtype assert Va.dtype == dtype else: assert Ua.dtype.kind == 'f' assert sa.dtype.kind == 'f' assert Va.dtype.kind == 'f' assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)
def test_lml_precomputed(kernel): # Test that lml of optimized kernel is stored correctly. gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y) assert_almost_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta), gpc.log_marginal_likelihood(), 7)
def test_ledoit_wolf(): # Tests LedoitWolf module on a simple dataset. # test shrinkage coeff on a simple data set X_centered = X - X.mean(axis=0) lw = LedoitWolf(assume_centered=True) lw.fit(X_centered) shrinkage_ = lw.shrinkage_ score_ = lw.score(X_centered) assert_almost_equal( ledoit_wolf_shrinkage(X_centered, assume_centered=True), shrinkage_ ) assert_almost_equal( ledoit_wolf_shrinkage(X_centered, assume_centered=True, block_size=6), shrinkage_, ) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf( X_centered, assume_centered=True ) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True) scov.fit(X_centered) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf(assume_centered=True) lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d, assume_centered=True) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False, assume_centered=True) lw.fit(X_centered) assert_almost_equal(lw.score(X_centered), score_, 4) assert lw.precision_ is None # Same tests without assuming centered data # test shrinkage coeff on a simple data set lw = LedoitWolf() lw.fit(X) assert_almost_equal(lw.shrinkage_, shrinkage_, 4) assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X)) assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1]) assert_almost_equal(lw.score(X), score_, 4) # compare shrunk covariance obtained from data and from MLE estimate lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) # compare estimates given by LW and ShrunkCovariance scov = ShrunkCovariance(shrinkage=lw.shrinkage_) scov.fit(X) assert_array_almost_equal(scov.covariance_, lw.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) lw = LedoitWolf() lw.fit(X_1d) lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d) assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4) assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_) assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4) # test with one sample # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) lw = LedoitWolf() warn_msg = "Only one sample available. You may want to reshape your data array" with pytest.warns(UserWarning, match=warn_msg): lw.fit(X_1sample) assert_array_almost_equal(lw.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) # test shrinkage coeff on a simple data set (without saving precision) lw = LedoitWolf(store_precision=False) lw.fit(X) assert_almost_equal(lw.score(X), score_, 4) assert lw.precision_ is None
def test_covariance(): # Tests Covariance module on a simple dataset. # test covariance fit from data cov = EmpiricalCovariance() cov.fit(X) emp_cov = empirical_covariance(X) assert_array_almost_equal(emp_cov, cov.covariance_, 4) assert_almost_equal(cov.error_norm(emp_cov), 0) assert_almost_equal(cov.error_norm(emp_cov, norm="spectral"), 0) assert_almost_equal(cov.error_norm(emp_cov, norm="frobenius"), 0) assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0) assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0) with pytest.raises(NotImplementedError): cov.error_norm(emp_cov, norm="foo") # Mahalanobis distances computation test mahal_dist = cov.mahalanobis(X) assert np.amin(mahal_dist) > 0 # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) cov = EmpiricalCovariance() cov.fit(X_1d) assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4) assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0) assert_almost_equal(cov.error_norm(empirical_covariance(X_1d), norm="spectral"), 0) # test with one sample # Create X with 1 sample and 5 features X_1sample = np.arange(5).reshape(1, 5) cov = EmpiricalCovariance() warn_msg = "Only one sample available. You may want to reshape your data array" with pytest.warns(UserWarning, match=warn_msg): cov.fit(X_1sample) assert_array_almost_equal(cov.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) # test integer type X_integer = np.asarray([[0, 1], [1, 0]]) result = np.asarray([[0.25, -0.25], [-0.25, 0.25]]) assert_array_almost_equal(empirical_covariance(X_integer), result) # test centered case cov = EmpiricalCovariance(assume_centered=True) cov.fit(X) assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def test_calibration_curve(): """Check calibration_curve function""" y_true = np.array([0, 0, 0, 1, 1, 1]) y_pred = np.array([0., 0.1, 0.2, 0.8, 0.9, 1.]) prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=2) prob_true_unnormalized, prob_pred_unnormalized = \ calibration_curve(y_true, y_pred * 2, n_bins=2, normalize=True) assert len(prob_true) == len(prob_pred) assert len(prob_true) == 2 assert_almost_equal(prob_true, [0, 1]) assert_almost_equal(prob_pred, [0.1, 0.9]) assert_almost_equal(prob_true, prob_true_unnormalized) assert_almost_equal(prob_pred, prob_pred_unnormalized) # probabilities outside [0, 1] should not be accepted when normalize # is set to False assert_raises(ValueError, calibration_curve, [1.1], [-0.1], normalize=False) # test that quantiles work as expected y_true2 = np.array([0, 0, 0, 0, 1, 1]) y_pred2 = np.array([0., 0.1, 0.2, 0.5, 0.9, 1.]) prob_true_quantile, prob_pred_quantile = calibration_curve( y_true2, y_pred2, n_bins=2, strategy='quantile') assert len(prob_true_quantile) == len(prob_pred_quantile) assert len(prob_true_quantile) == 2 assert_almost_equal(prob_true_quantile, [0, 2 / 3]) assert_almost_equal(prob_pred_quantile, [0.1, 0.8]) # Check that error is raised when invalid strategy is selected assert_raises(ValueError, calibration_curve, y_true2, y_pred2, strategy='percentile')
def test_sparse_random_matrix(): # Check some statical properties of sparse random matrix n_components = 100 n_features = 500 for density in [0.3, 1.]: s = 1 / density A = _sparse_random_matrix(n_components, n_features, density=density, random_state=0) A = densify(A) # Check possible values values = np.unique(A) assert np.sqrt(s) / np.sqrt(n_components) in values assert -np.sqrt(s) / np.sqrt(n_components) in values if density == 1.0: assert np.size(values) == 2 else: assert 0. in values assert np.size(values) == 3 # Check that the random matrix follow the proper distribution. # Let's say that each element of a_{ij} of A is taken from # # - -sqrt(s) / sqrt(n_components) with probability 1 / 2s # - 0 with probability 1 - 1 / s # - +sqrt(s) / sqrt(n_components) with probability 1 / 2s # assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2) assert_almost_equal(np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2) assert_almost_equal(np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2) assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2) assert_almost_equal(np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2) assert_almost_equal(np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1), (1 - 1 / (2 * s)) * 1 / (2 * s), decimal=2)
def test_kernel_diag(kernel): # Test that diag method of kernel returns consistent results. K_call_diag = np.diag(kernel(X)) K_diag = kernel.diag(X) assert_almost_equal(K_call_diag, K_diag, 5)
def test_kernel_stationary(kernel): # Test stationarity of kernels. K = kernel(X, X + 1) assert_almost_equal(K[0, 0], np.diag(K))
def test_auto_vs_cross(kernel): # Auto-correlation and cross-correlation should be consistent. K_auto = kernel(X) K_cross = kernel(X, X) assert_almost_equal(K_auto, K_cross, 5)
def test_warm_start(seed): random_state = seed rng = np.random.RandomState(random_state) n_samples, n_features, n_components = 500, 2, 2 X = rng.rand(n_samples, n_features) # Assert the warm_start give the same result for the same number of iter g = GaussianMixture( n_components=n_components, n_init=1, max_iter=2, reg_covar=0, random_state=random_state, warm_start=False, ) h = GaussianMixture( n_components=n_components, n_init=1, max_iter=1, reg_covar=0, random_state=random_state, warm_start=True, ) g.fit(X) score1 = h.fit(X).score(X) score2 = h.fit(X).score(X) assert_almost_equal(g.weights_, h.weights_) assert_almost_equal(g.means_, h.means_) assert_almost_equal(g.precisions_, h.precisions_) assert score2 > score1 # Assert that by using warm_start we can converge to a good solution g = GaussianMixture( n_components=n_components, n_init=1, max_iter=5, reg_covar=0, random_state=random_state, warm_start=False, tol=1e-6, ) h = GaussianMixture( n_components=n_components, n_init=1, max_iter=5, reg_covar=0, random_state=random_state, warm_start=True, tol=1e-6, ) g.fit(X) assert not g.converged_ h.fit(X) # depending on the data there is large variability in the number of # refit necessary to converge due to the complete randomness of the # data for _ in range(1000): h.fit(X) if h.converged_: break assert h.converged_
def test_multiclass_classifier_class_weight(): """tests multiclass with classweights for each class""" alpha = .1 n_samples = 20 tol = .00001 max_iter = 50 class_weight = {0: .45, 1: .55, 2: .75} fit_intercept = True X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0, cluster_std=0.1) step_size = get_step_size(X, alpha, fit_intercept, classification=True) classes = np.unique(y) clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples, max_iter=max_iter, tol=tol, random_state=77, fit_intercept=fit_intercept, multi_class='ovr', class_weight=class_weight) clf2 = clone(clf1) clf1.fit(X, y) clf2.fit(sp.csr_matrix(X), y) le = LabelEncoder() class_weight_ = compute_class_weight(class_weight, np.unique(y), y) sample_weight = class_weight_[le.fit_transform(y)] coef1 = [] intercept1 = [] coef2 = [] intercept2 = [] for cl in classes: y_encoded = np.ones(n_samples) y_encoded[y != cl] = -1 spweights1, spintercept1 = sag_sparse(X, y_encoded, step_size, alpha, n_iter=max_iter, dloss=log_dloss, sample_weight=sample_weight) spweights2, spintercept2 = sag_sparse(X, y_encoded, step_size, alpha, n_iter=max_iter, dloss=log_dloss, sample_weight=sample_weight, sparse=True) coef1.append(spweights1) intercept1.append(spintercept1) coef2.append(spweights2) intercept2.append(spintercept2) coef1 = np.vstack(coef1) intercept1 = np.array(intercept1) coef2 = np.vstack(coef2) intercept2 = np.array(intercept2) for i, cl in enumerate(classes): assert_array_almost_equal(clf1.coef_[i].ravel(), coef1[i].ravel(), decimal=2) assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1) assert_array_almost_equal(clf2.coef_[i].ravel(), coef2[i].ravel(), decimal=2) assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
def test_fastica_simple(add_noise, seed): # Test the FastICA algorithm on very simple data. rng = np.random.RandomState(seed) # scipy.stats uses the global RNG: n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) # function as fun arg def g_test(x): return x**3, (3 * x**2).mean(axis=-1) algos = ["parallel", "deflation"] nls = ["logcosh", "exp", "cube", g_test] whitening = [True, False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo, random_state=rng) with pytest.raises(ValueError): fastica(m.T, fun=np.tanh, algorithm=algo) else: pca = PCA(n_components=2, whiten=True, random_state=rng) X = pca.fit_transform(m.T) k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False, random_state=rng) with pytest.raises(ValueError): fastica(X, fun=np.tanh, algorithm=algo) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m)) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if not add_noise: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2) else: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed) ica = FastICA(fun=nl, algorithm=algo, random_state=seed) sources = ica.fit_transform(m.T) assert ica.components_.shape == (2, 2) assert sources.shape == (1000, 2) assert_array_almost_equal(sources_fun, sources) assert_array_almost_equal(sources, ica.transform(m.T)) assert ica.mixing_.shape == (2, 2) for fn in [np.tanh, "exp(-.5(x^2))"]: ica = FastICA(fun=fn, algorithm=algo) with pytest.raises(ValueError): ica.fit(m.T) with pytest.raises(TypeError): FastICA(fun=range(10)).fit(m.T)
def test_regression_metrics_at_limits(): assert_almost_equal(mean_squared_error([0.], [0.]), 0.0) assert_almost_equal(mean_squared_error([0.], [0.], squared=False), 0.0) assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.0) assert_almost_equal(mean_absolute_error([0.], [0.]), 0.0) assert_almost_equal(mean_pinball_loss([0.], [0.]), 0.0) assert_almost_equal(mean_absolute_percentage_error([0.], [0.]), 0.0) assert_almost_equal(median_absolute_error([0.], [0.]), 0.0) assert_almost_equal(max_error([0.], [0.]), 0.0) assert_almost_equal(explained_variance_score([0.], [0.]), 1.0) assert_almost_equal(r2_score([0., 1], [0., 1]), 1.0) err_msg = ("Mean Squared Logarithmic Error cannot be used when targets " "contain negative values.") with pytest.raises(ValueError, match=err_msg): mean_squared_log_error([-1.], [-1.]) err_msg = ("Mean Squared Logarithmic Error cannot be used when targets " "contain negative values.") with pytest.raises(ValueError, match=err_msg): mean_squared_log_error([1., 2., 3.], [1., -2., 3.]) err_msg = ("Mean Squared Logarithmic Error cannot be used when targets " "contain negative values.") with pytest.raises(ValueError, match=err_msg): mean_squared_log_error([1., -2., 3.], [1., 2., 3.]) # Tweedie deviance error power = -1.2 assert_allclose(mean_tweedie_deviance([0], [1.], power=power), 2 / (2 - power), rtol=1e-3) with pytest.raises(ValueError, match="can only be used on strictly positive y_pred."): mean_tweedie_deviance([0.], [0.], power=power) assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=1.0) power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): mean_tweedie_deviance([0.], [0.], power=0.5)
def _do_bistochastic_test(scaled): """Check that rows and columns sum to the same constant.""" _do_scale_test(scaled) assert_almost_equal(scaled.sum(axis=0).mean(), scaled.sum(axis=1).mean(), decimal=1)
def test_regression_custom_weights(): y_true = [[1, 2], [2.5, -1], [4.5, 3], [5, 7]] y_pred = [[1, 1], [2, -1], [5, 4], [5, 6.5]] msew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6]) rmsew = mean_squared_error(y_true, y_pred, multioutput=[0.4, 0.6], squared=False) maew = mean_absolute_error(y_true, y_pred, multioutput=[0.4, 0.6]) mapew = mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.4, 0.6]) rw = r2_score(y_true, y_pred, multioutput=[0.4, 0.6]) evsw = explained_variance_score(y_true, y_pred, multioutput=[0.4, 0.6]) assert_almost_equal(msew, 0.39, decimal=2) assert_almost_equal(rmsew, 0.59, decimal=2) assert_almost_equal(maew, 0.475, decimal=3) assert_almost_equal(mapew, 0.1668, decimal=2) assert_almost_equal(rw, 0.94, decimal=2) assert_almost_equal(evsw, 0.94, decimal=2) # Handling msle separately as it does not accept negative inputs. y_true = np.array([[0.5, 1], [1, 2], [7, 6]]) y_pred = np.array([[0.5, 2], [1, 2.5], [8, 8]]) msle = mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7]) msle2 = mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred), multioutput=[0.3, 0.7]) assert_almost_equal(msle, msle2, decimal=2)
def test_oas(): # Tests OAS module on a simple dataset. # test shrinkage coeff on a simple data set X_centered = X - X.mean(axis=0) oa = OAS(assume_centered=True) oa.fit(X_centered) shrinkage_ = oa.shrinkage_ score_ = oa.score(X_centered) # compare shrunk covariance obtained from data and from MLE estimate oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_centered, assume_centered=True) assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) # compare estimates given by OAS and ShrunkCovariance scov = ShrunkCovariance(shrinkage=oa.shrinkage_, assume_centered=True) scov.fit(X_centered) assert_array_almost_equal(scov.covariance_, oa.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0:1] oa = OAS(assume_centered=True) oa.fit(X_1d) oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True) assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) assert_array_almost_equal((X_1d ** 2).sum() / n_samples, oa.covariance_, 4) # test shrinkage coeff on a simple data set (without saving precision) oa = OAS(store_precision=False, assume_centered=True) oa.fit(X_centered) assert_almost_equal(oa.score(X_centered), score_, 4) assert oa.precision_ is None # Same tests without assuming centered data-------------------------------- # test shrinkage coeff on a simple data set oa = OAS() oa.fit(X) assert_almost_equal(oa.shrinkage_, shrinkage_, 4) assert_almost_equal(oa.score(X), score_, 4) # compare shrunk covariance obtained from data and from MLE estimate oa_cov_from_mle, oa_shrinkage_from_mle = oas(X) assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) # compare estimates given by OAS and ShrunkCovariance scov = ShrunkCovariance(shrinkage=oa.shrinkage_) scov.fit(X) assert_array_almost_equal(scov.covariance_, oa.covariance_, 4) # test with n_features = 1 X_1d = X[:, 0].reshape((-1, 1)) oa = OAS() oa.fit(X_1d) oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d) assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4) assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_) assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4) # test with one sample # warning should be raised when using only 1 sample X_1sample = np.arange(5).reshape(1, 5) oa = OAS() warn_msg = "Only one sample available. You may want to reshape your data array" with pytest.warns(UserWarning, match=warn_msg): oa.fit(X_1sample) assert_array_almost_equal(oa.covariance_, np.zeros(shape=(5, 5), dtype=np.float64)) # test shrinkage coeff on a simple data set (without saving precision) oa = OAS(store_precision=False) oa.fit(X) assert_almost_equal(oa.score(X), score_, 4) assert oa.precision_ is None
def test_regression_metrics(n_samples=50): y_true = np.arange(n_samples) y_pred = y_true + 1 y_pred_2 = y_true - 1 assert_almost_equal(mean_squared_error(y_true, y_pred), 1.) assert_almost_equal( mean_squared_log_error(y_true, y_pred), mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred))) assert_almost_equal(mean_absolute_error(y_true, y_pred), 1.) assert_almost_equal(mean_pinball_loss(y_true, y_pred), 0.5) assert_almost_equal(mean_pinball_loss(y_true, y_pred_2), 0.5) assert_almost_equal(mean_pinball_loss(y_true, y_pred, alpha=0.4), 0.6) assert_almost_equal(mean_pinball_loss(y_true, y_pred_2, alpha=0.4), 0.4) assert_almost_equal(median_absolute_error(y_true, y_pred), 1.) mape = mean_absolute_percentage_error(y_true, y_pred) assert np.isfinite(mape) assert mape > 1e6 assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=0), mean_squared_error(y_true, y_pred)) # Tweedie deviance needs positive y_pred, except for p=0, # p>=2 needs positive y_true # results evaluated by sympy y_true = np.arange(1, 1 + n_samples) y_pred = 2 * y_true n = n_samples assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=-1), 5 / 12 * n * (n**2 + 2 * n + 1)) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=1), (n + 1) * (1 - np.log(2))) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=2), 2 * np.log(2) - 1) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3 / 2), ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum()) assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n))
def test_randomized_svd_transpose_consistency(): # Check that transposing the design matrix has limited impact n_samples = 100 n_features = 500 rank = 4 k = 10 X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert X.shape == (n_samples, n_features) U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0) U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0) U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto', random_state=0) U4, s4, V4 = linalg.svd(X, full_matrices=False) assert_almost_equal(s1, s4[:k], decimal=3) assert_almost_equal(s2, s4[:k], decimal=3) assert_almost_equal(s3, s4[:k], decimal=3) assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]), decimal=2) assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]), decimal=2) # in this case 'auto' is equivalent to transpose assert_almost_equal(s2, s3)
def test_multioutput_regression(): y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) y_pred = np.array([[0, 0, 0, 1], [1, 0, 1, 1], [0, 0, 0, 1]]) error = mean_squared_error(y_true, y_pred) assert_almost_equal(error, (1. / 3 + 2. / 3 + 2. / 3) / 4.) error = mean_squared_error(y_true, y_pred, squared=False) assert_almost_equal(error, 0.454, decimal=2) error = mean_squared_log_error(y_true, y_pred) assert_almost_equal(error, 0.200, decimal=2) # mean_absolute_error and mean_squared_error are equal because # it is a binary problem. error = mean_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 4.) error = mean_pinball_loss(y_true, y_pred) assert_almost_equal(error, (1. + 2. / 3) / 8.) error = np.around(mean_absolute_percentage_error(y_true, y_pred), decimals=2) assert np.isfinite(error) assert error > 1e6 error = median_absolute_error(y_true, y_pred) assert_almost_equal(error, (1. + 1.) / 4.) error = r2_score(y_true, y_pred, multioutput='variance_weighted') assert_almost_equal(error, 1. - 5. / 2) error = r2_score(y_true, y_pred, multioutput='uniform_average') assert_almost_equal(error, -.875)
def test_minibatch_update_consistency(): # Check that dense and sparse minibatch update give the same results rng = np.random.RandomState(42) old_centers = centers + rng.normal(size=centers.shape) new_centers = old_centers.copy() new_centers_csr = old_centers.copy() weight_sums = np.zeros(new_centers.shape[0], dtype=np.double) weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double) x_squared_norms = (X**2).sum(axis=1) x_squared_norms_csr = row_norms(X_csr, squared=True) buffer = np.zeros(centers.shape[1], dtype=np.double) buffer_csr = np.zeros(centers.shape[1], dtype=np.double) # extract a small minibatch X_mb = X[:10] X_mb_csr = X_csr[:10] x_mb_squared_norms = x_squared_norms[:10] x_mb_squared_norms_csr = x_squared_norms_csr[:10] sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double) # step 1: compute the dense minibatch update old_inertia, incremental_diff = _mini_batch_step(X_mb, sample_weight_mb, x_mb_squared_norms, new_centers, weight_sums, buffer, 1, None, random_reassign=False) assert old_inertia > 0.0 # compute the new inertia on the same batch to check that it decreased labels, new_inertia = _labels_inertia(X_mb, sample_weight_mb, x_mb_squared_norms, new_centers) assert new_inertia > 0.0 assert new_inertia < old_inertia # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers - old_centers)**2) assert_almost_equal(incremental_diff, effective_diff) # step 2: compute the sparse minibatch update old_inertia_csr, incremental_diff_csr = _mini_batch_step( X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr, weight_sums_csr, buffer_csr, 1, None, random_reassign=False) assert old_inertia_csr > 0.0 # compute the new inertia on the same batch to check that it decreased labels_csr, new_inertia_csr = _labels_inertia(X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr) assert new_inertia_csr > 0.0 assert new_inertia_csr < old_inertia_csr # check that the incremental difference computation is matching the # final observed value effective_diff = np.sum((new_centers_csr - old_centers)**2) assert_almost_equal(incremental_diff_csr, effective_diff) # step 3: check that sparse and dense updates lead to the same results assert_array_equal(labels, labels_csr) assert_array_almost_equal(new_centers, new_centers_csr) assert_almost_equal(incremental_diff, incremental_diff_csr) assert_almost_equal(old_inertia, old_inertia_csr) assert_almost_equal(new_inertia, new_inertia_csr)
def test_bayesian_mixture_precisions_prior_initialisation(): rng = np.random.RandomState(0) n_samples, n_features = 10, 2 X = rng.rand(n_samples, n_features) # Check raise message for a bad value of degrees_of_freedom_prior bad_degrees_of_freedom_prior_ = n_features - 1. bgmm = BayesianGaussianMixture( degrees_of_freedom_prior=bad_degrees_of_freedom_prior_, random_state=rng) assert_raise_message(ValueError, "The parameter 'degrees_of_freedom_prior' should be " "greater than %d, but got %.3f." % (n_features - 1, bad_degrees_of_freedom_prior_), bgmm.fit, X) # Check correct init for a given value of degrees_of_freedom_prior degrees_of_freedom_prior = rng.rand() + n_features - 1. bgmm = BayesianGaussianMixture( degrees_of_freedom_prior=degrees_of_freedom_prior, random_state=rng).fit(X) assert_almost_equal(degrees_of_freedom_prior, bgmm.degrees_of_freedom_prior_) # Check correct init for the default value of degrees_of_freedom_prior degrees_of_freedom_prior_default = n_features bgmm = BayesianGaussianMixture( degrees_of_freedom_prior=degrees_of_freedom_prior_default, random_state=rng).fit(X) assert_almost_equal(degrees_of_freedom_prior_default, bgmm.degrees_of_freedom_prior_) # Check correct init for a given value of covariance_prior covariance_prior = { 'full': np.cov(X.T, bias=1) + 10, 'tied': np.cov(X.T, bias=1) + 5, 'diag': np.diag(np.atleast_2d(np.cov(X.T, bias=1))) + 3, 'spherical': rng.rand()} bgmm = BayesianGaussianMixture(random_state=rng) for cov_type in ['full', 'tied', 'diag', 'spherical']: bgmm.covariance_type = cov_type bgmm.covariance_prior = covariance_prior[cov_type] bgmm.fit(X) assert_almost_equal(covariance_prior[cov_type], bgmm.covariance_prior_) # Check raise message for a bad spherical value of covariance_prior bad_covariance_prior_ = -1. bgmm = BayesianGaussianMixture(covariance_type='spherical', covariance_prior=bad_covariance_prior_, random_state=rng) assert_raise_message(ValueError, "The parameter 'spherical covariance_prior' " "should be greater than 0., but got %.3f." % bad_covariance_prior_, bgmm.fit, X) # Check correct init for the default value of covariance_prior covariance_prior_default = { 'full': np.atleast_2d(np.cov(X.T)), 'tied': np.atleast_2d(np.cov(X.T)), 'diag': np.var(X, axis=0, ddof=1), 'spherical': np.var(X, axis=0, ddof=1).mean()} bgmm = BayesianGaussianMixture(random_state=0) for cov_type in ['full', 'tied', 'diag', 'spherical']: bgmm.covariance_type = cov_type bgmm.fit(X) assert_almost_equal(covariance_prior_default[cov_type], bgmm.covariance_prior_)
def test_agglomerative_clustering(): # Check that we obtain the correct number of clusters with # agglomerative clustering. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average", "single"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert np.size(np.unique(labels)) == 10 finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert np.size(np.unique(clustering.labels_)) == 10 # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix( connectivity.toarray()[:10, :10]), linkage=linkage) with pytest.raises(ValueError): clustering.fit(X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") with pytest.raises(ValueError): clustering.fit(X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering( n_clusters=10, connectivity=np.ones((n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering( n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)
def test_y_multioutput(): # Test that GPR can deal with multi-dimensional target values y_2d = np.vstack((y, y * 2)).T # Test for fixed kernel that first dimension of 2d GP equals the output # of 1d GP and that second dimension is twice as large kernel = RBF(length_scale=1.0) gpr = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False) gpr.fit(X, y) gpr_2d = GaussianProcessRegressor(kernel=kernel, optimizer=None, normalize_y=False) gpr_2d.fit(X, y_2d) y_pred_1d, y_std_1d = gpr.predict(X2, return_std=True) y_pred_2d, y_std_2d = gpr_2d.predict(X2, return_std=True) _, y_cov_1d = gpr.predict(X2, return_cov=True) _, y_cov_2d = gpr_2d.predict(X2, return_cov=True) assert_almost_equal(y_pred_1d, y_pred_2d[:, 0]) assert_almost_equal(y_pred_1d, y_pred_2d[:, 1] / 2) # Standard deviation and covariance do not depend on output assert_almost_equal(y_std_1d, y_std_2d) assert_almost_equal(y_cov_1d, y_cov_2d) y_sample_1d = gpr.sample_y(X2, n_samples=10) y_sample_2d = gpr_2d.sample_y(X2, n_samples=10) assert_almost_equal(y_sample_1d, y_sample_2d[:, 0]) # Test hyperparameter optimization for kernel in kernels: gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True) gpr.fit(X, y) gpr_2d = GaussianProcessRegressor(kernel=kernel, normalize_y=True) gpr_2d.fit(X, np.vstack((y, y)).T) assert_almost_equal(gpr.kernel_.theta, gpr_2d.kernel_.theta, 4)
def test_adjusted_mutual_info_score(): # Compute the Adjusted Mutual Information and test against known values labels_a = np.array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]) labels_b = np.array([1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 3, 1, 3, 3, 3, 2, 2]) # Mutual information mi = mutual_info_score(labels_a, labels_b) assert_almost_equal(mi, 0.41022, 5) # with provided sparse contingency C = contingency_matrix(labels_a, labels_b, sparse=True) mi = mutual_info_score(labels_a, labels_b, contingency=C) assert_almost_equal(mi, 0.41022, 5) # with provided dense contingency C = contingency_matrix(labels_a, labels_b) mi = mutual_info_score(labels_a, labels_b, contingency=C) assert_almost_equal(mi, 0.41022, 5) # Expected mutual information n_samples = C.sum() emi = expected_mutual_information(C, n_samples) assert_almost_equal(emi, 0.15042, 5) # Adjusted mutual information ami = adjusted_mutual_info_score(labels_a, labels_b) assert_almost_equal(ami, 0.27821, 5) ami = adjusted_mutual_info_score([1, 1, 2, 2], [2, 2, 3, 3]) assert ami == pytest.approx(1.0) # Test with a very large array a110 = np.array([list(labels_a) * 110]).flatten() b110 = np.array([list(labels_b) * 110]).flatten() ami = adjusted_mutual_info_score(a110, b110) assert_almost_equal(ami, 0.38, 2)
def test_enet_path(): # We use a large number of samples and of informative features so that # the l1_ratio selected is more toward ridge than lasso X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100, n_informative_features=100) max_iter = 150 # Here we have a small number of iterations, and thus the # ElasticNet might not converge. This is to speed up tests clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3, l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter) ignore_warnings(clf.fit)(X, y) # Well-conditioned settings, we should have selected our # smallest penalty assert_almost_equal(clf.alpha_, min(clf.alphas_)) # Non-sparse ground truth: we should have selected an elastic-net # that is closer to ridge than to lasso assert clf.l1_ratio_ == min(clf.l1_ratio) clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3, l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter, precompute=True) ignore_warnings(clf.fit)(X, y) # Well-conditioned settings, we should have selected our # smallest penalty assert_almost_equal(clf.alpha_, min(clf.alphas_)) # Non-sparse ground truth: we should have selected an elastic-net # that is closer to ridge than to lasso assert clf.l1_ratio_ == min(clf.l1_ratio) # We are in well-conditioned settings with low noise: we should # have a good test-set performance assert clf.score(X_test, y_test) > 0.99 # Multi-output/target case X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3) clf = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter) ignore_warnings(clf.fit)(X, y) # We are in well-conditioned settings with low noise: we should # have a good test-set performance assert clf.score(X_test, y_test) > 0.99 assert clf.coef_.shape == (3, 10) # Mono-output should have same cross-validated alpha_ and l1_ratio_ # in both cases. X, y, _, _ = build_dataset(n_features=10) clf1 = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7]) clf1.fit(X, y) clf2 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7]) clf2.fit(X, y[:, np.newaxis]) assert_almost_equal(clf1.l1_ratio_, clf2.l1_ratio_) assert_almost_equal(clf1.alpha_, clf2.alpha_)
def test_importances_asymptotic(): # Check whether variable importances of totally randomized trees # converge towards their theoretical values (See Louppe et al, # Understanding variable importances in forests of randomized trees, 2013). def binomial(k, n): return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True) def entropy(samples): n_samples = len(samples) entropy = 0. for count in np.bincount(samples): p = 1. * count / n_samples if p > 0: entropy -= p * np.log2(p) return entropy def mdi_importance(X_m, X, y): n_samples, n_features = X.shape features = list(range(n_features)) features.pop(X_m) values = [np.unique(X[:, i]) for i in range(n_features)] imp = 0. for k in range(n_features): # Weight of each B of size k coef = 1. / (binomial(k, n_features) * (n_features - k)) # For all B of size k for B in combinations(features, k): # For all values B=b for b in product(*[values[B[j]] for j in range(k)]): mask_b = np.ones(n_samples, dtype=np.bool) for j in range(k): mask_b &= X[:, B[j]] == b[j] X_, y_ = X[mask_b, :], y[mask_b] n_samples_b = len(X_) if n_samples_b > 0: children = [] for xi in values[X_m]: mask_xi = X_[:, X_m] == xi children.append(y_[mask_xi]) imp += ( coef * (1. * n_samples_b / n_samples) # P(B=b) * (entropy(y_) - sum([ entropy(c) * len(c) / n_samples_b for c in children ]))) return imp data = np.array([[0, 0, 1, 0, 0, 1, 0, 1], [1, 0, 1, 1, 1, 0, 1, 2], [1, 0, 1, 1, 0, 1, 1, 3], [0, 1, 1, 1, 0, 1, 0, 4], [1, 1, 0, 1, 0, 1, 1, 5], [1, 1, 0, 1, 1, 1, 1, 6], [1, 0, 1, 0, 0, 1, 0, 7], [1, 1, 1, 1, 1, 1, 1, 8], [1, 1, 1, 1, 0, 1, 1, 9], [1, 1, 1, 0, 1, 1, 1, 0]]) X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7] n_features = X.shape[1] # Compute true importances true_importances = np.zeros(n_features) for i in range(n_features): true_importances[i] = mdi_importance(i, X, y) # Estimate importances with totally randomized trees clf = ExtraTreesClassifier(n_estimators=500, max_features=1, criterion="entropy", random_state=0).fit(X, y) importances = sum( tree.tree_.compute_feature_importances(normalize=False) for tree in clf.estimators_) / clf.n_estimators # Check correctness assert_almost_equal(entropy(y), sum(importances)) assert np.abs(true_importances - importances).mean() < 0.01
def test_compare_covar_type(): # We can compare the 'full' precision with the other cov_type if we apply # 1 iter of the M-step (done during _initialize_parameters). rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) X = rand_data.X['full'] n_components = rand_data.n_components for prior_type in PRIOR_TYPE: # Computation of the full_covariance bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='full', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) full_covariances = ( bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis, np.newaxis]) # Check tied_covariance = mean(full_covariances, 0) bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='tied', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) tied_covariance = bgmm.covariances_ * bgmm.degrees_of_freedom_ assert_almost_equal(tied_covariance, np.mean(full_covariances, 0)) # Check diag_covariance = diag(full_covariances) bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='diag', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) diag_covariances = (bgmm.covariances_ * bgmm.degrees_of_freedom_[:, np.newaxis]) assert_almost_equal( diag_covariances, np.array([np.diag(cov) for cov in full_covariances])) # Check spherical_covariance = np.mean(diag_covariances, 0) bgmm = BayesianGaussianMixture( weight_concentration_prior_type=prior_type, n_components=2 * n_components, covariance_type='spherical', max_iter=1, random_state=0, tol=1e-7) bgmm._check_initial_parameters(X) bgmm._initialize_parameters(X, np.random.RandomState(0)) spherical_covariances = bgmm.covariances_ * bgmm.degrees_of_freedom_ assert_almost_equal(spherical_covariances, np.mean(diag_covariances, 1))