def test_zero_estimator_reg(): # Test if ZeroEstimator works for regression. est = GradientBoostingRegressor(n_estimators=20, max_depth=1, random_state=1, init=ZeroEstimator()) est.fit(boston.data, boston.target) y_pred = est.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_almost_equal(mse, 33.0, decimal=0) est = GradientBoostingRegressor(n_estimators=20, max_depth=1, random_state=1, init='zero') est.fit(boston.data, boston.target) y_pred = est.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_almost_equal(mse, 33.0, decimal=0) est = GradientBoostingRegressor(n_estimators=20, max_depth=1, random_state=1, init='foobar') assert_raises(ValueError, est.fit, boston.data, boston.target)
def test_incremental_variance_ddof(): # Test that degrees of freedom parameter for calculations are correct. rng = np.random.RandomState(1999) X = rng.randn(50, 10) n_samples, n_features = X.shape for batch_size in [11, 20, 37]: steps = np.arange(0, X.shape[0], batch_size) if steps[-1] != X.shape[0]: steps = np.hstack([steps, n_samples]) for i, j in zip(steps[:-1], steps[1:]): batch = X[i:j, :] if i == 0: incremental_means = batch.mean(axis=0) incremental_variances = batch.var(axis=0) # Assign this twice so that the test logic is consistent incremental_count = batch.shape[0] sample_count = batch.shape[0] else: result = _incremental_mean_and_var(batch, incremental_means, incremental_variances, sample_count) (incremental_means, incremental_variances, incremental_count) = result sample_count += batch.shape[0] calculated_means = np.mean(X[:j], axis=0) calculated_variances = np.var(X[:j], axis=0) assert_almost_equal(incremental_means, calculated_means, 6) assert_almost_equal(incremental_variances, calculated_variances, 6) assert_equal(incremental_count, sample_count)
def test_pinvh_nonpositive(): a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.float64) a = np.dot(a, a.T) u, s, vt = np.linalg.svd(a) s[0] *= -1 a = np.dot(u * s, vt) # a is now symmetric non-positive and singular a_pinv = pinv2(a) a_pinvh = pinvh(a) assert_almost_equal(a_pinv, a_pinvh)
def test_logsumexp(): # Try to add some smallish numbers in logspace x = np.array([1e-40] * 1000000) logx = np.log(x) assert_almost_equal(np.exp(logsumexp(logx)), x.sum()) X = np.vstack([x, x]) logX = np.vstack([logx, logx]) assert_array_almost_equal(np.exp(logsumexp(logX, axis=0)), X.sum(axis=0)) assert_array_almost_equal(np.exp(logsumexp(logX, axis=1)), X.sum(axis=1))
def test_randomized_svd_sign_flip(): a = np.array([[2.0, 0.0], [0.0, 1.0]]) u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41) for seed in range(10): u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed) assert_almost_equal(u1, u2) assert_almost_equal(v1, v2) assert_almost_equal(np.dot(u2 * s2, v2), a) assert_almost_equal(np.dot(u2.T, u2), np.eye(2)) assert_almost_equal(np.dot(v2.T, v2), np.eye(2))
def check_class_weights(name): # Check class_weights resemble sample_weights behavior. ForestClassifier = FOREST_CLASSIFIERS[name] # Iris is balanced, so no effect expected for using 'balanced' weights clf1 = ForestClassifier(random_state=0) clf1.fit(iris.data, iris.target) clf2 = ForestClassifier(class_weight='balanced', random_state=0) clf2.fit(iris.data, iris.target) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) # Make a multi-output problem with three copies of Iris iris_multi = np.vstack((iris.target, iris.target, iris.target)).T # Create user-defined weights that should balance over the outputs clf3 = ForestClassifier(class_weight=[{ 0: 2., 1: 2., 2: 1. }, { 0: 2., 1: 1., 2: 2. }, { 0: 1., 1: 2., 2: 2. }], random_state=0) clf3.fit(iris.data, iris_multi) assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) # Check against multi-output "balanced" which should also have no effect clf4 = ForestClassifier(class_weight='balanced', random_state=0) clf4.fit(iris.data, iris_multi) assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) # Inflate importance of class 1, check against user-defined weights sample_weight = np.ones(iris.target.shape) sample_weight[iris.target == 1] *= 100 class_weight = {0: 1., 1: 100., 2: 1.} clf1 = ForestClassifier(random_state=0) clf1.fit(iris.data, iris.target, sample_weight) clf2 = ForestClassifier(class_weight=class_weight, random_state=0) clf2.fit(iris.data, iris.target) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) # Check that sample_weight and class_weight are multiplicative clf1 = ForestClassifier(random_state=0) clf1.fit(iris.data, iris.target, sample_weight**2) clf2 = ForestClassifier(class_weight=class_weight, random_state=0) clf2.fit(iris.data, iris.target, sample_weight) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
def test_compute_class_weight(): # Test (and demo) compute_class_weight. y = np.asarray([2, 2, 2, 3, 3, 4]) classes = np.unique(y) cw = assert_warns(DeprecationWarning, compute_class_weight, "auto", classes, y) assert_almost_equal(cw.sum(), classes.shape) assert_true(cw[0] < cw[1] < cw[2]) cw = compute_class_weight("balanced", classes, y) # total effect of samples is preserved class_counts = np.bincount(y)[2:] assert_almost_equal(np.dot(cw, class_counts), y.shape[0]) assert_true(cw[0] < cw[1] < cw[2])
def test_compute_class_weight_auto_unordered(): # Test compute_class_weight when classes are unordered classes = np.array([1, 0, 3]) y = np.asarray([1, 0, 0, 3, 3, 3]) cw = assert_warns(DeprecationWarning, compute_class_weight, "auto", classes, y) assert_almost_equal(cw.sum(), classes.shape) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([1.636, 0.818, 0.545]), decimal=3) cw = compute_class_weight("balanced", classes, y) class_counts = np.bincount(y)[classes] assert_almost_equal(np.dot(cw, class_counts), y.shape[0]) assert_array_almost_equal(cw, [2., 1., 2. / 3])
def test_compute_class_weight_auto_negative(): # Test compute_class_weight when labels are negative # Test with balanced class labels. classes = np.array([-2, -1, 0]) y = np.asarray([-1, -1, 0, 0, -2, -2]) cw = assert_warns(DeprecationWarning, compute_class_weight, "auto", classes, y) assert_almost_equal(cw.sum(), classes.shape) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([1., 1., 1.])) cw = compute_class_weight("balanced", classes, y) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([1., 1., 1.])) # Test with unbalanced class labels. y = np.asarray([-1, 0, 0, -2, -2, -2]) cw = assert_warns(DeprecationWarning, compute_class_weight, "auto", classes, y) assert_almost_equal(cw.sum(), classes.shape) assert_equal(len(cw), len(classes)) assert_array_almost_equal(cw, np.array([0.545, 1.636, 0.818]), decimal=3) cw = compute_class_weight("balanced", classes, y) assert_equal(len(cw), len(classes)) class_counts = np.bincount(y + 2) assert_almost_equal(np.dot(cw, class_counts), y.shape[0]) assert_array_almost_equal(cw, [2. / 3, 2., 1.])
def test_randomized_svd_low_rank_with_noise(): # Check that extmath.randomized_svd can handle noisy matrices n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X wity structure approximate rank `rank` and an # important noisy component X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.1, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) for normalizer in ['auto', 'none', 'LU', 'QR']: # compute the singular values of X using the fast approximate # method without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0, power_iteration_normalizer=normalizer, random_state=0) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.01) # compute the singular values of X using the fast approximate # method with iterated power method _, sap, _ = randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) # the iterated power method is helping getting rid of the noise: assert_almost_equal(s[:k], sap, decimal=3)
def test_randomized_svd_infinite_rank(): # Check that extmath.randomized_svd can handle noisy matrices n_samples = 100 n_features = 500 rank = 5 k = 10 # let us try again without 'low_rank component': just regularly but slowly # decreasing singular values: the rank of the data matrix is infinite X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=1.0, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method _, s, _ = linalg.svd(X, full_matrices=False) for normalizer in ['auto', 'none', 'LU', 'QR']: # compute the singular values of X using the fast approximate method # without the iterated power method _, sa, _ = randomized_svd(X, k, n_iter=0, power_iteration_normalizer=normalizer) # the approximation does not tolerate the noise: assert_greater(np.abs(s[:k] - sa).max(), 0.1) # compute the singular values of X using the fast approximate method # with iterated power method _, sap, _ = randomized_svd(X, k, n_iter=5, power_iteration_normalizer=normalizer) # the iterated power method is still managing to get most of the # structure at the requested rank assert_almost_equal(s[:k], sap, decimal=3)
def test_norm_squared_norm(): X = np.random.RandomState(42).randn(50, 63) X *= 100 # check stability X += 200 assert_almost_equal(np.linalg.norm(X.ravel()), norm(X)) assert_almost_equal(norm(X)**2, squared_norm(X), decimal=6) assert_almost_equal(np.linalg.norm(X), np.sqrt(squared_norm(X)), decimal=6)
def test_svd_flip(): # Check that svd_flip works in both situations, and reconstructs input. rs = np.random.RandomState(1999) n_samples = 20 n_features = 10 X = rs.randn(n_samples, n_features) # Check matrix reconstruction U, S, V = linalg.svd(X, full_matrices=False) U1, V1 = svd_flip(U, V, u_based_decision=False) assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6) # Check transposed matrix reconstruction XT = X.T U, S, V = linalg.svd(XT, full_matrices=False) U2, V2 = svd_flip(U, V, u_based_decision=True) assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6) # Check that different flip methods are equivalent under reconstruction U_flip1, V_flip1 = svd_flip(U, V, u_based_decision=True) assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6) U_flip2, V_flip2 = svd_flip(U, V, u_based_decision=False) assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)
def test_incremental_variance_update_formulas(): # Test Youngs and Cramer incremental variance formulas. # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html A = np.array([[600, 470, 170, 430, 300], [600, 470, 170, 430, 300], [600, 470, 170, 430, 300], [600, 470, 170, 430, 300]]).T idx = 2 X1 = A[:idx, :] X2 = A[idx:, :] old_means = X1.mean(axis=0) old_variances = X1.var(axis=0) old_sample_count = X1.shape[0] final_means, final_variances, final_count = \ _incremental_mean_and_var(X2, old_means, old_variances, old_sample_count) assert_almost_equal(final_means, A.mean(axis=0), 6) assert_almost_equal(final_variances, A.var(axis=0), 6) assert_almost_equal(final_count, A.shape[0])
def test_randomized_svd_low_rank(): # Check that extmath.randomized_svd is consistent with linalg.svd n_samples = 100 n_features = 500 rank = 5 k = 10 # generate a matrix X of approximate effective rank `rank` and no noise # component (very structured signal): X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.0, random_state=0) assert_equal(X.shape, (n_samples, n_features)) # compute the singular values of X using the slow exact method U, s, V = linalg.svd(X, full_matrices=False) for normalizer in ['auto', 'LU', 'QR']: # 'none' would not be stable # compute the singular values of X using the fast approximate method Ua, sa, Va = \ randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) assert_equal(Ua.shape, (n_samples, k)) assert_equal(sa.shape, (k, )) assert_equal(Va.shape, (k, n_features)) # ensure that the singular values of both methods are equal up to the # real rank of the matrix assert_almost_equal(s[:k], sa) # check the singular vectors too (while not checking the sign) assert_almost_equal(np.dot(U[:, :k], V[:k, :]), np.dot(Ua, Va)) # check the sparse matrix representation X = sparse.csr_matrix(X) # compute the singular values of X using the fast approximate method Ua, sa, Va = \ randomized_svd(X, k, power_iteration_normalizer=normalizer, random_state=0) assert_almost_equal(s[:rank], sa[:rank])
def test_pinvh_simple_complex(): a = (np.array([[1, 2, 3], [4, 5, 6], [7, 8, 10]]) + 1j * np.array([[10, 8, 7], [6, 5, 4], [3, 2, 1]])) a = np.dot(a, a.conj().T) a_pinv = pinvh(a) assert_almost_equal(np.dot(a, a_pinv), np.eye(3))
def test_randomized_svd_transpose_consistency(): # Check that transposing the design matrix has limited impact n_samples = 100 n_features = 500 rank = 4 k = 10 X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert_equal(X.shape, (n_samples, n_features)) U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0) U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0) U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto', random_state=0) U4, s4, V4 = linalg.svd(X, full_matrices=False) assert_almost_equal(s1, s4[:k], decimal=3) assert_almost_equal(s2, s4[:k], decimal=3) assert_almost_equal(s3, s4[:k], decimal=3) assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]), decimal=2) assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]), decimal=2) # in this case 'auto' is equivalent to transpose assert_almost_equal(s2, s3)
def test_fast_dot(): # Check fast dot blas wrapper function if fast_dot is np.dot: return rng = np.random.RandomState(42) A = rng.random_sample([2, 10]) B = rng.random_sample([2, 10]) try: linalg.get_blas_funcs(['gemm'])[0] has_blas = True except (AttributeError, ValueError): has_blas = False if has_blas: # Test _fast_dot for invalid input. # Maltyped data. for dt1, dt2 in [['f8', 'f4'], ['i4', 'i4']]: assert_raises(ValueError, _fast_dot, A.astype(dt1), B.astype(dt2).T) # Malformed data. # ndim == 0 E = np.empty(0) assert_raises(ValueError, _fast_dot, E, E) # ndim == 1 assert_raises(ValueError, _fast_dot, A, A[0]) # ndim > 2 assert_raises(ValueError, _fast_dot, A.T, np.array([A, A])) # min(shape) == 1 assert_raises(ValueError, _fast_dot, A, A[0, :][None, :]) # test for matrix mismatch error assert_raises(ValueError, _fast_dot, A, A) # Test cov-like use case + dtypes. for dtype in ['f8', 'f4']: A = A.astype(dtype) B = B.astype(dtype) # col < row C = np.dot(A.T, A) C_ = fast_dot(A.T, A) assert_almost_equal(C, C_, decimal=5) C = np.dot(A.T, B) C_ = fast_dot(A.T, B) assert_almost_equal(C, C_, decimal=5) C = np.dot(A, B.T) C_ = fast_dot(A, B.T) assert_almost_equal(C, C_, decimal=5) # Test square matrix * rectangular use case. A = rng.random_sample([2, 2]) for dtype in ['f8', 'f4']: A = A.astype(dtype) B = B.astype(dtype) C = np.dot(A, B) C_ = fast_dot(A, B) assert_almost_equal(C, C_, decimal=5) C = np.dot(A.T, B) C_ = fast_dot(A.T, B) assert_almost_equal(C, C_, decimal=5) if has_blas: for x in [np.array([[d] * 10] * 2) for d in [np.inf, np.nan]]: assert_raises(ValueError, _fast_dot, x, x.T)
def test_pinvh_simple_real(): a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 10]], dtype=np.float64) a = np.dot(a, a.T) a_pinv = pinvh(a) assert_almost_equal(np.dot(a, a_pinv), np.eye(3))
def test_importances_asymptotic(): # Check whether variable importances of totally randomized trees # converge towards their theoretical values (See Louppe et al, # Understanding variable importances in forests of randomized trees, 2013). def binomial(k, n): return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True) def entropy(samples): n_samples = len(samples) entropy = 0. for count in bincount(samples): p = 1. * count / n_samples if p > 0: entropy -= p * np.log2(p) return entropy def mdi_importance(X_m, X, y): n_samples, n_features = X.shape features = list(range(n_features)) features.pop(X_m) values = [np.unique(X[:, i]) for i in range(n_features)] imp = 0. for k in range(n_features): # Weight of each B of size k coef = 1. / (binomial(k, n_features) * (n_features - k)) # For all B of size k for B in combinations(features, k): # For all values B=b for b in product(*[values[B[j]] for j in range(k)]): mask_b = np.ones(n_samples, dtype=np.bool) for j in range(k): mask_b &= X[:, B[j]] == b[j] X_, y_ = X[mask_b, :], y[mask_b] n_samples_b = len(X_) if n_samples_b > 0: children = [] for xi in values[X_m]: mask_xi = X_[:, X_m] == xi children.append(y_[mask_xi]) imp += ( coef * (1. * n_samples_b / n_samples) # P(B=b) * (entropy(y_) - sum([ entropy(c) * len(c) / n_samples_b for c in children ]))) return imp data = np.array([[0, 0, 1, 0, 0, 1, 0, 1], [1, 0, 1, 1, 1, 0, 1, 2], [1, 0, 1, 1, 0, 1, 1, 3], [0, 1, 1, 1, 0, 1, 0, 4], [1, 1, 0, 1, 0, 1, 1, 5], [1, 1, 0, 1, 1, 1, 1, 6], [1, 0, 1, 0, 0, 1, 0, 7], [1, 1, 1, 1, 1, 1, 1, 8], [1, 1, 1, 1, 0, 1, 1, 9], [1, 1, 1, 0, 1, 1, 1, 0]]) X, y = np.array(data[:, :7], dtype=np.bool), data[:, 7] n_features = X.shape[1] # Compute true importances true_importances = np.zeros(n_features) for i in range(n_features): true_importances[i] = mdi_importance(i, X, y) # Estimate importances with totally randomized trees clf = ExtraTreesClassifier(n_estimators=500, max_features=1, criterion="entropy", random_state=0).fit(X, y) importances = sum( tree.tree_.compute_feature_importances(normalize=False) for tree in clf.estimators_) / clf.n_estimators # Check correctness assert_almost_equal(entropy(y), sum(importances)) assert_less(np.abs(true_importances - importances).mean(), 0.01)