def check_importances(name, X, y): """Check variable importances.""" ForestClassifier = FOREST_CLASSIFIERS[name] for n_jobs in [1, 2]: clf = ForestClassifier(n_estimators=10, n_jobs=n_jobs) clf.fit(X, y) importances = clf.feature_importances_ n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10) assert_equal(n_important, 3) X_new = clf.transform(X, threshold="mean") assert_less(0 < X_new.shape[1], X.shape[1]) # Check with sample weights sample_weight = np.ones(y.shape) sample_weight[y == 1] *= 100 clf = ForestClassifier(n_estimators=50, n_jobs=n_jobs, random_state=0) clf.fit(X, y, sample_weight=sample_weight) importances = clf.feature_importances_ assert_true(np.all(importances >= 0.0)) clf = ForestClassifier(n_estimators=50, n_jobs=n_jobs, random_state=0) clf.fit(X, y, sample_weight=3 * sample_weight) importances_bis = clf.feature_importances_ assert_almost_equal(importances, importances_bis)
def test_sparse_enet_not_as_toy_dataset(): n_samples, n_features, max_iter = 100, 100, 1000 n_informative = 10 X, y = make_sparse_data(n_samples, n_features, n_informative) X_train, X_test = X[n_samples / 2:], X[:n_samples / 2] y_train, y_test = y[n_samples / 2:], y[:n_samples / 2] s_clf = SparseENet(alpha=0.1, rho=0.8, fit_intercept=False, max_iter=max_iter, tol=1e-7) s_clf.fit(X_train, y_train) assert_almost_equal(s_clf.dual_gap_, 0, 4) assert_greater(s_clf.score(X_test, y_test), 0.85) # check the convergence is the same as the dense version d_clf = DenseENet(alpha=0.1, rho=0.8, fit_intercept=False, max_iter=max_iter, tol=1e-7) d_clf.fit(X_train, y_train) assert_almost_equal(d_clf.dual_gap_, 0, 4) assert_greater(d_clf.score(X_test, y_test), 0.85) assert_almost_equal(s_clf.coef_, d_clf.coef_, 5) # check that the coefs are sparse assert_less(np.sum(s_clf.coef_ != 0.0), 2 * n_informative)
def test_feature_importances_2d_coef(): X, y = datasets.make_classification( n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0, n_classes=4, ) est = LogisticRegression() for threshold, func in zip(["mean", "median"], [np.mean, np.median]): for order in [1, 2, np.inf]: # Fit SelectFromModel a multi-class problem transformer = SelectFromModel(estimator=LogisticRegression(), threshold=threshold, norm_order=order) transformer.fit(X, y) assert_true(hasattr(transformer.estimator_, "coef_")) X_new = transformer.transform(X) assert_less(X_new.shape[1], X.shape[1]) # Manually check that the norm is correctly performed est.fit(X, y) importances = norm(est.coef_, axis=0, ord=order) feature_mask = importances > func(importances) assert_array_equal(X_new, X[:, feature_mask])
def check_boston(presort, loss, subsample): # Check consistency on dataset boston house prices with least squares # and least absolute deviation. ones = np.ones(len(boston.target)) last_y_pred = None for sample_weight in None, ones, 2 * ones: clf = GradientBoostingRegressor(n_estimators=100, loss=loss, max_depth=4, subsample=subsample, min_samples_split=2, random_state=1, presort=presort) assert_raises(ValueError, clf.predict, boston.data) clf.fit(boston.data, boston.target, sample_weight=sample_weight) leaves = clf.apply(boston.data) assert_equal(leaves.shape, (506, 100)) y_pred = clf.predict(boston.data) mse = mean_squared_error(boston.target, y_pred) assert_less(mse, 6.0) if last_y_pred is not None: assert_array_almost_equal(last_y_pred, y_pred) last_y_pred = y_pred
def test_lars_drop_for_good(): # Create an ill-conditioned situation in which the LARS has to go # far in the path to converge, and check that LARS and coordinate # descent give the same answers X = [[1e20, 1e20, 0], [-1e-32, 0, 0], [1, 1, 1]] y = [10, 10, 1] alpha = .0001 def objective_function(coef): return (1. / (2. * len(X)) * linalg.norm(y - np.dot(X, coef)) ** 2 + alpha * linalg.norm(coef, 1)) lars = linear_model.LassoLars(alpha=alpha, normalize=False) assert_warns(ConvergenceWarning, lars.fit, X, y) lars_coef_ = lars.coef_ lars_obj = objective_function(lars_coef_) coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-10, normalize=False) with ignore_warnings(): cd_coef_ = coord_descent.fit(X, y).coef_ cd_obj = objective_function(cd_coef_) assert_less(lars_obj, cd_obj * (1. + 1e-8))
def test_oob_score_classification(): # Check that oob prediction is a good estimation of the generalization # error. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=rng) for base_estimator in [DecisionTreeClassifier(), SVC()]: clf = BaggingClassifier(base_estimator=base_estimator, n_estimators=100, bootstrap=True, oob_score=True, random_state=rng).fit(X_train, y_train) test_score = clf.score(X_test, y_test) assert_less(abs(test_score - clf.oob_score_), 0.1) # Test with few estimators assert_warns(UserWarning, BaggingClassifier(base_estimator=base_estimator, n_estimators=1, bootstrap=True, oob_score=True, random_state=rng).fit, X_train, y_train)
def test_dense_liblinear_intercept_handling(classifier=svm.LinearSVC): # Test that dense liblinear honours intercept_scaling param X = [[2, 1], [3, 1], [1, 3], [2, 3]] y = [0, 0, 1, 1] clf = classifier(fit_intercept=True, penalty='l1', loss='squared_hinge', dual=False, C=4, tol=1e-7, random_state=0) assert_true(clf.intercept_scaling == 1, clf.intercept_scaling) assert_true(clf.fit_intercept) # when intercept_scaling is low the intercept value is highly "penalized" # by regularization clf.intercept_scaling = 1 clf.fit(X, y) assert_almost_equal(clf.intercept_, 0, decimal=5) # when intercept_scaling is sufficiently high, the intercept value # is not affected by regularization clf.intercept_scaling = 100 clf.fit(X, y) intercept1 = clf.intercept_ assert_less(intercept1, -1) # when intercept_scaling is sufficiently high, the intercept value # doesn't depend on intercept_scaling value clf.intercept_scaling = 1000 clf.fit(X, y) intercept2 = clf.intercept_ assert_array_almost_equal(intercept1, intercept2, decimal=2)
def test_fitted_model(self): # non centered, sparse centers to check the centers = np.array([ [0.0, 5.0, 0.0, 0.0, 0.0], [1.0, 1.0, 4.0, 0.0, 0.0], [1.0, 0.0, 0.0, 5.0, 1.0], ]) n_samples = 100 n_clusters, n_features = centers.shape X, true_labels = make_blobs(n_samples=n_samples, centers=centers, cluster_std=1., random_state=42) cbook = CoodeBook(n_words=3) cbook = cbook.fit(X) # TODO: Is it neaded to reasign? or it can be just cbook.fit(X) # check that the number of clusters centers and distinct labels match # the expectation centers = cbook.get_dictionary() assert_equal(centers.shape, (n_clusters, n_features)) labels = cbook.predict(X) assert_equal(np.unique(labels).shape[0], n_clusters) # check that the labels assignment are perfect (up to a permutation) assert_equal(v_measure_score(true_labels, labels), 1.0) assert_greater(cbook.cluster_core.inertia_, 0.0) # check that the descriptor looks like the homogenous PDF used # to create the original samples cbook_hist = cbook.get_BoF_descriptor(X) expected_value = float(1)/cbook.n_words for bin_value in cbook_hist[0]: assert_less(round(bin_value-expected_value,3), 0.01)
def test_oob_score_regression(): # Check that oob prediction is a good estimation of the generalization # error. rng = check_random_state(0) X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng) clf = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=50, bootstrap=True, oob_score=True, random_state=rng).fit(X_train, y_train) test_score = clf.score(X_test, y_test) assert_less(abs(test_score - clf.oob_score_), 0.1) # Test with few estimators assert_warns(UserWarning, BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=1, bootstrap=True, oob_score=True, random_state=rng).fit, X_train, y_train)
def check_minimize(func, y_opt, bounds, acq_optimizer, acq_func, margin, n_calls, n_random_starts=10): r = gp_minimize(func, bounds, acq_optimizer=acq_optimizer, acq_func=acq_func, n_random_starts=n_random_starts, n_calls=n_calls, random_state=1, noise=1e-10) assert_less(r.fun, y_opt + margin)
def test_importances(): """Check variable importances.""" X, y = datasets.make_classification(n_samples=2000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) for name, Tree in CLF_TREES.items(): clf = Tree(random_state=0) clf.fit(X, y) importances = clf.feature_importances_ n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10, "Failed with {0}".format(name)) assert_equal(n_important, 3, "Failed with {0}".format(name)) X_new = clf.transform(X, threshold="mean") assert_less(0, X_new.shape[1], "Failed with {0}".format(name)) assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name)) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) clf.fit(iris.data, iris.target) clf2 = DecisionTreeClassifier(random_state=0, max_leaf_nodes=len(iris.data)) clf2.fit(iris.data, iris.target) assert_array_equal(clf.feature_importances_, clf2.feature_importances_)
def test_lasso_lars_vs_lasso_cd_ill_conditioned2(): # Create an ill-conditioned situation in which the LARS has to go # far in the path to converge, and check that LARS and coordinate # descent give the same answers # Note it used to be the case that Lars had to use the drop for good # strategy for this but this is no longer the case with the # equality_tolerance checks X = [[1e20, 1e20, 0], [-1e-32, 0, 0], [1, 1, 1]] y = [10, 10, 1] alpha = .0001 def objective_function(coef): return (1. / (2. * len(X)) * linalg.norm(y - np.dot(X, coef)) ** 2 + alpha * linalg.norm(coef, 1)) lars = linear_model.LassoLars(alpha=alpha, normalize=False) assert_warns(ConvergenceWarning, lars.fit, X, y) lars_coef_ = lars.coef_ lars_obj = objective_function(lars_coef_) coord_descent = linear_model.Lasso(alpha=alpha, tol=1e-4, normalize=False) cd_coef_ = coord_descent.fit(X, y).coef_ cd_obj = objective_function(cd_coef_) assert_less(lars_obj, cd_obj * (1. + 1e-8))
def test_rank_deficient_design(): # consistency test that checks that LARS Lasso is handling rank # deficient input data (with n_features < rank) in the same way # as coordinate descent Lasso y = [5, 0, 5] for X in ([[5, 0], [0, 5], [10, 10]], [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]], ): # To be able to use the coefs to compute the objective function, # we need to turn off normalization lars = linear_model.LassoLars(.1, normalize=False) coef_lars_ = lars.fit(X, y).coef_ obj_lars = (1. / (2. * 3.) * linalg.norm(y - np.dot(X, coef_lars_)) ** 2 + .1 * linalg.norm(coef_lars_, 1)) coord_descent = linear_model.Lasso(.1, tol=1e-6, normalize=False) coef_cd_ = coord_descent.fit(X, y).coef_ obj_cd = ((1. / (2. * 3.)) * linalg.norm(y - np.dot(X, coef_cd_)) ** 2 + .1 * linalg.norm(coef_cd_, 1)) assert_less(obj_lars, obj_cd * (1. + 1e-8))
def _test_sparse_enet_not_as_toy_dataset(alpha, fit_intercept, positive): n_samples, n_features, max_iter = 100, 100, 1000 n_informative = 10 X, y = make_sparse_data(n_samples, n_features, n_informative, positive=positive) X_train, X_test = X[n_samples / 2:], X[:n_samples / 2] y_train, y_test = y[n_samples / 2:], y[:n_samples / 2] s_clf = ElasticNet(alpha=alpha, l1_ratio=0.8, fit_intercept=fit_intercept, max_iter=max_iter, tol=1e-7, positive=positive, warm_start=True) s_clf.fit(X_train, y_train) assert_almost_equal(s_clf.dual_gap_, 0, 4) assert_greater(s_clf.score(X_test, y_test), 0.85) # check the convergence is the same as the dense version d_clf = ElasticNet(alpha=alpha, l1_ratio=0.8, fit_intercept=fit_intercept, max_iter=max_iter, tol=1e-7, positive=positive, warm_start=True) d_clf.fit(X_train.todense(), y_train) assert_almost_equal(d_clf.dual_gap_, 0, 4) assert_greater(d_clf.score(X_test, y_test), 0.85) assert_almost_equal(s_clf.coef_, d_clf.coef_, 5) assert_almost_equal(s_clf.intercept_, d_clf.intercept_, 5) # check that the coefs are sparse assert_less(np.sum(s_clf.coef_ != 0.0), 2 * n_informative)
def test_nmf_fit_close(solver): rng = np.random.mtrand.RandomState(42) # Test that the fit is not too far away pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, max_iter=600) X = np.abs(rng.randn(6, 5)) assert_less(pnmf.fit(X).reconstruction_err_, 0.1)
def test_importances(): """Check variable importances.""" X, y = datasets.make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) clf = RandomForestClassifier(n_estimators=10) clf.fit(X, y) importances = clf.feature_importances_ n_important = sum(importances > 0.1) assert_equal(importances.shape[0], 10) assert_equal(n_important, 3) X_new = clf.transform(X, threshold="mean") assert_less(0 < X_new.shape[1], X.shape[1]) # Check with sample weights sample_weight = np.ones(y.shape) sample_weight[y == 1] *= 100 clf = RandomForestClassifier(n_estimators=50, random_state=0) clf.fit(X, y, sample_weight=sample_weight) importances = clf.feature_importances_ assert np.all(importances >= 0.0) clf = RandomForestClassifier(n_estimators=50, random_state=0) clf.fit(X, y, sample_weight=3*sample_weight) importances_bis = clf.feature_importances_ assert_almost_equal(importances, importances_bis)
def test_lasso_lars_vs_lasso_cd_early_stopping(verbose=False): # Test that LassoLars and Lasso using coordinate descent give the # same results when early stopping is used. # (test : before, in the middle, and in the last part of the path) alphas_min = [10, 0.9, 1e-4] for alpha_min in alphas_min: alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso', alpha_min=alpha_min) lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) lasso_cd.alpha = alphas[-1] lasso_cd.fit(X, y) error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) assert_less(error, 0.01) # same test, with normalization for alpha_min in alphas_min: alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso', alpha_min=alpha_min) lasso_cd = linear_model.Lasso(fit_intercept=True, normalize=True, tol=1e-8) lasso_cd.alpha = alphas[-1] lasso_cd.fit(X, y) error = linalg.norm(lasso_path[:, -1] - lasso_cd.coef_) assert_less(error, 0.01)
def check_minimize(minimizer, func, y_opt, dimensions, margin, n_calls, n_random_starts=10, x0=None): for n in range(3): r = minimizer( func, dimensions, n_calls=n_calls, random_state=n, n_random_starts=n_random_starts, x0=x0) assert_less(r.fun, y_opt + margin)
def test_sparse_encode_error(): n_atoms = 12 V = rng.randn(n_atoms, n_features) # random init V /= np.sum(V ** 2, axis=1)[:, np.newaxis] code = sparse_encode(X, V, alpha=0.001) assert_true(not np.all(code == 0)) assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1)
def test_factor_analysis(): """Test FactorAnalysis ability to recover the data covariance structure """ rng = np.random.RandomState(0) n_samples, n_features, n_components = 20, 5, 3 # Some random settings for the generative model W = rng.randn(n_components, n_features) # latent variable of dim 3, 20 of it h = rng.randn(n_samples, n_components) # using gamma to model different noise variance # per component noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features) # generate observations # wlog, mean is 0 X = np.dot(h, W) + noise assert_raises(ValueError, FactorAnalysis, svd_method='foo') fa_fail = FactorAnalysis() fa_fail.svd_method = 'foo' assert_raises(ValueError, fa_fail.fit, X) fas = [] for method in ['randomized', 'lapack']: fa = FactorAnalysis(n_components=n_components, svd_method=method) fa.fit(X) fas.append(fa) X_t = fa.transform(X) assert_equal(X_t.shape, (n_samples, n_components)) assert_almost_equal(fa.loglike_[-1], fa.score(X).sum()) diff = np.all(np.diff(fa.loglike_)) assert_greater(diff, 0., 'Log likelihood dif not increase') # Sample Covariance scov = np.cov(X, rowvar=0., bias=1.) # Model Covariance mcov = fa.get_covariance() diff = np.sum(np.abs(scov - mcov)) / W.size assert_less(diff, 0.1, "Mean absolute difference is %f" % diff) fa = FactorAnalysis(n_components=n_components, noise_variance_init=np.ones(n_features)) assert_raises(ValueError, fa.fit, X[:, :2]) f = lambda x, y: np.abs(getattr(x, y)) # sign will not be equal fa1, fa2 = fas for attr in ['loglike_', 'components_', 'noise_variance_']: assert_almost_equal(f(fa1, attr), f(fa2, attr)) with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always', ConvergenceWarning) fa1.max_iter = 1 fa1.verbose = True fa1.fit(X) assert_true(w[-1].category == ConvergenceWarning) warnings.simplefilter('always', DeprecationWarning) FactorAnalysis(verbose=1) assert_true(w[-1].category == DeprecationWarning)
def test_random_projection_embedding_quality(): data, _ = make_sparse_random_data(8, 5000, 15000) eps = 0.2 original_distances = euclidean_distances(data, squared=True) original_distances = original_distances.ravel() non_identical = original_distances != 0.0 # remove 0 distances to avoid division by 0 original_distances = original_distances[non_identical] for RandomProjection in all_RandomProjection: rp = RandomProjection(n_components='auto', eps=eps, random_state=0) projected = rp.fit_transform(data) projected_distances = euclidean_distances(projected, squared=True) projected_distances = projected_distances.ravel() # remove 0 distances to avoid division by 0 projected_distances = projected_distances[non_identical] distances_ratio = projected_distances / original_distances # check that the automatically tuned values for the density respect the # contract for eps: pairwise distances are preserved according to the # Johnson-Lindenstrauss lemma assert_less(distances_ratio.max(), 1 + eps) assert_less(1 - eps, distances_ratio.min())
def check_importances(X, y, name, criterion): ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(n_estimators=20, criterion=criterion, random_state=0) est.fit(X, y) importances = est.feature_importances_ n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10) assert_equal(n_important, 3) # XXX: Remove this test in 0.19 after transform support to estimators # is removed. X_new = assert_warns( DeprecationWarning, est.transform, X, threshold="mean") assert_less(0 < X_new.shape[1], X.shape[1]) # Check with parallel importances = est.feature_importances_ est.set_params(n_jobs=2) importances_parrallel = est.feature_importances_ assert_array_almost_equal(importances, importances_parrallel) # Check with sample weights sample_weight = check_random_state(0).randint(1, 10, len(X)) est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=sample_weight) importances = est.feature_importances_ assert_true(np.all(importances >= 0.0)) for scale in [0.5, 10, 100]: est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=scale * sample_weight) importances_bis = est.feature_importances_ assert_less(np.abs(importances - importances_bis).mean(), 0.001)
def check_importances(name, criterion, X, y): ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(n_estimators=20, criterion=criterion, random_state=0) est.fit(X, y) importances = est.feature_importances_ n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10) assert_equal(n_important, 3) # Check with parallel importances = est.feature_importances_ est.set_params(n_jobs=2) importances_parrallel = est.feature_importances_ assert_array_almost_equal(importances, importances_parrallel) # Check with sample weights sample_weight = check_random_state(0).randint(1, 10, len(X)) est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=sample_weight) importances = est.feature_importances_ assert_true(np.all(importances >= 0.0)) for scale in [0.5, 10, 100]: est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=scale * sample_weight) importances_bis = est.feature_importances_ assert_less(np.abs(importances - importances_bis).mean(), 0.001)
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('DecisionTreeRegressor'): expected_feature_importances = ([0.37331461, 0.08572699, 0.2543484, 0.1841172, 0.1024928] if use_feature_hashing else [0.08931994, 0.15545093, 0.75522913]) expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0] else: if use_feature_hashing: expected_feature_importances = [0.40195655, 0.06702161, 0.25814858, 0.18183947, 0.09103379] else: expected_feature_importances = [0.07975691, 0.16122862, 0.75901447] expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, rtol=1e-2) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('DecisionTreeRegressor'): expected_feature_importances = ([0.37483895, 0.08816508, 0.25379838, 0.18337128, 0.09982631] if use_feature_hashing else [0.08926899, 0.15585068, 0.75488033]) expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0] else: expected_feature_importances = ([0.40195798, 0.06702903, 0.25816559, 0.18185518, 0.09099222] if use_feature_hashing else [0.07974267, 0.16121895, 0.75903838]) expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, atol=1e-2, rtol=0) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
def test_oob_score_classification(): """Check that oob prediction is a good estimation of the generalization error.""" clf = RandomForestClassifier(oob_score=True, random_state=rng) n_samples = iris.data.shape[0] clf.fit(iris.data[: n_samples / 2, :], iris.target[: n_samples / 2]) test_score = clf.score(iris.data[n_samples / 2 :, :], iris.target[n_samples / 2 :]) assert_less(abs(test_score - clf.oob_score_), 0.05)
def test_sparse_coder_estimator(): n_atoms = 12 V = rng.randn(n_atoms, n_features) # random init V /= np.sum(V ** 2, axis=1)[:, np.newaxis] code = SparseCoder(dictionary=V, transform_algorithm='lasso_lars', transform_alpha=0.001).transform(X) assert_true(not np.all(code == 0)) assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1)
def test_sparse_encode_error(): n_components = 12 rng = np.random.RandomState(0) V = rng.randn(n_components, n_features) # random init V /= np.sum(V ** 2, axis=1)[:, np.newaxis] code = sparse_encode(X, V, alpha=0.001) assert not np.all(code == 0) assert_less(np.sqrt(np.sum((np.dot(code, V) - X) ** 2)), 0.1)
def check_ensemble_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('AdaBoostRegressor'): if use_feature_hashing: expected_feature_importances = [0.33718443, 0.07810721, 0.25621769, 0.19489766, 0.13359301] else: expected_feature_importances = [0.10266744, 0.18681777, 0.71051479] else: expected_feature_importances = ([0.204, 0.172, 0.178, 0.212, 0.234] if use_feature_hashing else [0.262, 0.288, 0.45]) feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, atol=1e-2, rtol=0) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
def test_sublinear_tf(): X = [[1], [2], [3]] tr = TfidfTransformer(sublinear_tf=True, use_idf=False, norm=None) tfidf = tr.fit_transform(X).toarray() assert_equal(tfidf[0], 1) assert_greater(tfidf[1], tfidf[0]) assert_greater(tfidf[2], tfidf[1]) assert_less(tfidf[1], 2) assert_less(tfidf[2], 3)
def test_regressor_mse(): y_bin = y.copy() y_bin[y != 1] = -1 for data in (X, X_csr): for fit_intercept in (True, False): for average in (False, True): reg = PassiveAggressiveRegressor(C=1.0, fit_intercept=fit_intercept, random_state=0, average=average, max_iter=5) reg.fit(data, y_bin) pred = reg.predict(data) assert_less(np.mean((pred - y_bin)**2), 1.7) if average: assert_true(hasattr(reg, 'average_coef_')) assert_true(hasattr(reg, 'average_intercept_')) assert_true(hasattr(reg, 'standard_intercept_')) assert_true(hasattr(reg, 'standard_coef_'))
def test_select_fwe_regression(): # Test whether the relative univariate feature selection # gets the correct items in a simple regression problem # with the fwe heuristic X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0) univariate_filter = SelectFwe(f_regression, alpha=0.01) X_r = univariate_filter.fit(X, y).transform(X) X_r2 = GenericUnivariateSelect(f_regression, mode='fwe', param=0.01).fit(X, y).transform(X) assert_array_equal(X_r, X_r2) support = univariate_filter.get_support() gtruth = np.zeros(20) gtruth[:5] = 1 assert_array_equal(support[:5], np.ones((5, ), dtype=np.bool)) assert_less(np.sum(support[5:] == 1), 2)
def test_regressor_partial_fit(): y_bin = y.copy() y_bin[y != 1] = -1 for data in (X, X_csr): for average in (False, True): reg = PassiveAggressiveRegressor(C=1.0, fit_intercept=True, random_state=0, average=average, max_iter=100) for t in range(50): reg.partial_fit(data, y_bin) pred = reg.predict(data) assert_less(np.mean((pred - y_bin)**2), 1.7) if average: assert hasattr(reg, 'average_coef_') assert hasattr(reg, 'average_intercept_') assert hasattr(reg, 'standard_intercept_') assert hasattr(reg, 'standard_coef_')
def test_transform_linear_model(): for clf in (LogisticRegression(C=0.1), LinearSVC(C=0.01, dual=False), SGDClassifier(alpha=0.001, n_iter=50, shuffle=True, random_state=0)): for thresh in (None, ".09*mean", "1e-5 * median"): for func in (np.array, sp.csr_matrix): X = func(data) clf.set_params(penalty="l1") clf.fit(X, y) X_new = assert_warns( DeprecationWarning, clf.transform, X, thresh) if isinstance(clf, SGDClassifier): assert_true(X_new.shape[1] <= X.shape[1]) else: assert_less(X_new.shape[1], X.shape[1]) clf.set_params(penalty="l2") clf.fit(X_new, y) pred = clf.predict(X_new) assert_greater(np.mean(pred == y), 0.7)
def test_importances(): """Check variable importances.""" X, y = datasets.make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) clf = RandomForestClassifier(n_estimators=10) clf.fit(X, y) importances = clf.feature_importances_ n_important = sum(importances > 0.1) assert_equal(importances.shape[0], 10) assert_equal(n_important, 3) X_new = clf.transform(X, threshold="mean") assert_less(0 < X_new.shape[1], X.shape[1])
def test_collinearity(): """Check that lars_path is robust to collinearity in input""" X = np.array([[3., 3., 1.], [2., 2., 0.], [1., 1., 0]]) y = np.array([1., 0., 0]) _, _, coef_path_ = linear_model.lars_path(X, y, alpha_min=0.01) assert_true(not np.isnan(coef_path_).any()) residual = np.dot(X, coef_path_[:, -1]) - y assert_less((residual ** 2).sum(), 1.) # just make sure it's bounded n_samples = 10 X = np.random.rand(n_samples, 5) y = np.zeros(n_samples) _, _, coef_path_ = linear_model.lars_path(X, y, Gram='auto', copy_X=False, copy_Gram=False, alpha_min=0., method='lasso', verbose=0, max_iter=500) assert_array_almost_equal(coef_path_, np.zeros_like(coef_path_))
def test_ransac_max_trials(): base_estimator = LinearRegression() ransac_estimator = RANSACRegressor(base_estimator, min_samples=2, residual_threshold=5, max_trials=0, random_state=0) assert_raises(ValueError, ransac_estimator.fit, X, y) # there is a 1e-9 chance it will take these many trials. No good reason # 1e-2 isn't enough, can still happen # 2 is the what ransac defines as min_samples = X.shape[1] + 1 max_trials = _dynamic_max_trials( len(X) - len(outliers), X.shape[0], 2, 1 - 1e-9) ransac_estimator = RANSACRegressor(base_estimator, min_samples=2) for i in range(50): ransac_estimator.set_params(min_samples=2, random_state=i) ransac_estimator.fit(X, y) assert_less(ransac_estimator.n_trials_, max_trials + 1)
def test_KL_projection(): rng = np.random.RandomState(0) for poly in ( UnitCube(), ProbabilitySimplex(), Knapsack(max_labels=4), Knapsack(max_labels=4, min_labels=2), Birkhoff(max_iter=3000, tol=1e-5), CartesianProduct(ProbabilitySimplex()), #Permutahedron(), # not implemented #OrderSimplex(), # not implemented ): for func in ( rng.randn, rng.rand, ): if isinstance(poly, (Birkhoff, CartesianProduct)): theta = func(100) * 5 else: theta = func(10) * 5 sol = project_fw( theta, poly.argmax, #variant="pairwise", projection_type="KL", max_iter=1000, tol=1e-9) # Test correctness. sol_proj = poly.KL_project(theta) error = np.mean((sol - sol_proj)**2) assert_less(error, 1e-3) # Test vectorization. sol_proj2 = poly.KL_project([theta, theta]) assert_equal(len(sol_proj2), 2) assert_array_almost_equal(sol_proj2[0], sol_proj) assert_array_almost_equal(sol_proj2[1], sol_proj)
def test_lasso_lars_ic(): # Test the LassoLarsIC object by checking that # - some good features are selected. # - alpha_bic > alpha_aic # - n_nonzero_bic < n_nonzero_aic lars_bic = linear_model.LassoLarsIC('bic') lars_aic = linear_model.LassoLarsIC('aic') rng = np.random.RandomState(42) X = diabetes.data X = np.c_[X, rng.randn(X.shape[0], 5)] # add 5 bad features lars_bic.fit(X, y) lars_aic.fit(X, y) nonzero_bic = np.where(lars_bic.coef_)[0] nonzero_aic = np.where(lars_aic.coef_)[0] assert_greater(lars_bic.alpha_, lars_aic.alpha_) assert_less(len(nonzero_bic), len(nonzero_aic)) assert_less(np.max(nonzero_bic), diabetes.data.shape[1]) # test error on unknown IC lars_broken = linear_model.LassoLarsIC('<unknown>') assert_raises(ValueError, lars_broken.fit, X, y)
def test_make_matrix_words(self): """Test words in each doc are in the same topic """ n_topics = self.rand.randint(100, 200) words_per_topic = 30 words_per_doc = self.rand.randint(10, 20) params = { 'n_topics': n_topics, 'words_per_topic': words_per_topic, 'docs_per_topic': 100, 'words_per_doc': words_per_doc } matrix = make_doc_word_matrix(**params) dense = matrix.toarray() for i in xrange(dense.shape[0]): col_idx = np.where(dense[i, :] > 0)[0] max_idx = np.max(col_idx) min_idx = np.min(col_idx) assert_less(max_idx - min_idx, words_per_topic)
def test_feature_importances(): X, y = datasets.make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) est = RandomForestClassifier(n_estimators=50, random_state=0) for threshold, func in zip(["mean", "median"], [np.mean, np.median]): transformer = SelectFromModel(estimator=est, threshold=threshold) transformer.fit(X, y) assert_true(hasattr(transformer.estimator_, 'feature_importances_')) X_new = transformer.transform(X) assert_less(X_new.shape[1], X.shape[1]) importances = transformer.estimator_.feature_importances_ feature_mask = np.abs(importances) > func(importances) assert_array_almost_equal(X_new, X[:, feature_mask])
def test_stacked_regressor(self): bclf = LinearRegression() clfs = [RandomForestRegressor(n_estimators=50, random_state=1), GradientBoostingRegressor(n_estimators=25, random_state=1), Ridge(random_state=1)] # Friedman1 X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0) X_train, y_train = X[:200], y[:200] X_test, y_test = X[200:], y[200:] sr = StackedRegressor(bclf, clfs, n_folds=3, verbose=0, oob_score_flag=True) sr.fit(X_train, y_train) mse = mean_squared_error(y_test, sr.predict(X_test)) assert_less(mse, 6.0)
def test_nested_circles(): # Test the linear separability of the first 2D KPCA transform X, y = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0) # 2D nested circles are not linearly separable train_score = Perceptron().fit(X, y).score(X, y) assert_less(train_score, 0.8) # Project the circles data into the first 2 components of a RBF Kernel # PCA model. # Note that the gamma value is data dependent. If this test breaks # and the gamma value has to be updated, the Kernel PCA example will # have to be updated too. kpca = KernelPCA(kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.) X_kpca = kpca.fit_transform(X) # The data is perfectly linearly separable in that space train_score = Perceptron().fit(X_kpca, y).score(X_kpca, y) assert_equal(train_score, 1.0)
def check_oob_score(name, X, y, n_estimators=20): # Check that oob prediction is a good estimation of the generalization # error. # Proper behavior est = FOREST_ESTIMATORS[name](oob_score=True, random_state=0, n_estimators=n_estimators, bootstrap=True) n_samples = X.shape[0] est.fit(X[:n_samples // 2, :], y[:n_samples // 2]) test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:]) if name in FOREST_CLASSIFIERS: assert_less(abs(test_score - est.oob_score_), 0.1) else: assert_greater(test_score, est.oob_score_) assert_greater(est.oob_score_, .8) # Check warning if not enough estimators with np.errstate(divide="ignore", invalid="ignore"): est = FOREST_ESTIMATORS[name](oob_score=True, random_state=0, n_estimators=1, bootstrap=True) assert_warns(UserWarning, est.fit, X, y)
def test_rank_deficient_design(): # consistency test that checks that LARS Lasso is handling rank # deficient input data (with n_features < rank) in the same way # as coordinate descent Lasso y = [5, 0, 5] for X in ( [[5, 0], [0, 5], [10, 10]], [[10, 10, 0], [1e-32, 0, 0], [0, 0, 1]], ): # To be able to use the coefs to compute the objective function, # we need to turn off normalization lars = linear_model.LassoLars(.1, normalize=False) coef_lars_ = lars.fit(X, y).coef_ obj_lars = (1. / (2. * 3.) * linalg.norm(y - np.dot(X, coef_lars_))**2 + .1 * linalg.norm(coef_lars_, 1)) coord_descent = linear_model.Lasso(.1, tol=1e-6, normalize=False) coef_cd_ = coord_descent.fit(X, y).coef_ obj_cd = ((1. / (2. * 3.)) * linalg.norm(y - np.dot(X, coef_cd_))**2 + .1 * linalg.norm(coef_cd_, 1)) assert_less(obj_lars, obj_cd * (1. + 1e-8))
def test_lasso_lars_vs_lasso_cd(verbose=False): # Test that LassoLars and Lasso using coordinate descent give the # same results. X = 3 * diabetes.data alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso') lasso_cd = linear_model.Lasso(fit_intercept=False, tol=1e-8) for c, a in zip(lasso_path.T, alphas): if a == 0: continue lasso_cd.alpha = a lasso_cd.fit(X, y) error = linalg.norm(c - lasso_cd.coef_) assert_less(error, 0.01) # similar test, with the classifiers for alpha in np.linspace(1e-2, 1 - 1e-2, 20): clf1 = linear_model.LassoLars(alpha=alpha, normalize=False).fit(X, y) clf2 = linear_model.Lasso(alpha=alpha, tol=1e-8, normalize=False).fit(X, y) err = linalg.norm(clf1.coef_ - clf2.coef_) assert_less(err, 1e-3) # same test, with normalized data X = diabetes.data alphas, _, lasso_path = linear_model.lars_path(X, y, method='lasso') lasso_cd = linear_model.Lasso(fit_intercept=False, normalize=True, tol=1e-8) for c, a in zip(lasso_path.T, alphas): if a == 0: continue lasso_cd.alpha = a lasso_cd.fit(X, y) error = linalg.norm(c - lasso_cd.coef_) assert_less(error, 0.01)
def test_feature_importances(): X, y = datasets.make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) est = RandomForestClassifier(n_estimators=50, random_state=0) for threshold, func in zip(["mean", "median"], [np.mean, np.median]): transformer = SelectFromModel(estimator=est, threshold=threshold) transformer.fit(X, y) assert_true(hasattr(transformer.estimator_, 'feature_importances_')) X_new = transformer.transform(X) assert_less(X_new.shape[1], X.shape[1]) importances = transformer.estimator_.feature_importances_ feature_mask = np.abs(importances) > func(importances) assert_array_almost_equal(X_new, X[:, feature_mask]) # Check with sample weights sample_weight = np.ones(y.shape) sample_weight[y == 1] *= 100 est = RandomForestClassifier(n_estimators=50, random_state=0) transformer = SelectFromModel(estimator=est) transformer.fit(X, y, sample_weight=sample_weight) importances = transformer.estimator_.feature_importances_ transformer.fit(X, y, sample_weight=3 * sample_weight) importances_bis = transformer.estimator_.feature_importances_ assert_almost_equal(importances, importances_bis) # For the Lasso and related models, the threshold defaults to 1e-5 transformer = SelectFromModel(estimator=Lasso(alpha=0.1)) transformer.fit(X, y) X_new = transformer.transform(X) mask = np.abs(transformer.estimator_.coef_) > 1e-5 assert_array_equal(X_new, X[:, mask])
def test_lle_manifold(): rng = np.random.RandomState(0) # similar test on a slightly more complex manifold X = np.array(list(product(range(20), repeat=2))) X = np.c_[X, X[:, 0]**2 / 20] X = X + 1e-10 * rng.uniform(size=X.shape) n_components = 2 clf = manifold.LocallyLinearEmbedding(n_neighbors=5, n_components=n_components, random_state=0) tol = 1.5 N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray() reconstruction_error = np.linalg.norm(np.dot(N, X) - X) assert_less(reconstruction_error, tol) for solver in eigen_solvers: clf.set_params(eigen_solver=solver) clf.fit(X) assert_true(clf.embedding_.shape[1] == n_components) reconstruction_error = np.linalg.norm( np.dot(N, clf.embedding_) - clf.embedding_, 'fro')**2 details = "solver: " + solver assert_less(reconstruction_error, tol, msg=details) assert_less(np.abs(clf.reconstruction_error_ - reconstruction_error), tol * reconstruction_error, msg=details)
def test_lle_simple_grid(): rng = np.random.RandomState(0) # grid of equidistant points in 2D, n_components = n_dim X = np.array(list(product(range(5), repeat=2))) X = X + 1e-10 * rng.uniform(size=X.shape) n_components = 2 clf = manifold.LocallyLinearEmbedding(n_neighbors=5, n_components=n_components) tol = .1 N = barycenter_kneighbors_graph(X, clf.n_neighbors).todense() reconstruction_error = np.linalg.norm(np.dot(N, X) - X, 'fro') assert_less(reconstruction_error, tol) for solver in eigen_solvers: clf.set_params(eigen_solver=solver) clf.fit(X) assert_true(clf.embedding_.shape[1] == n_components) reconstruction_error = np.linalg.norm( np.dot(N, clf.embedding_) - clf.embedding_, 'fro')**2 # FIXME: ARPACK fails this test ... if solver != 'arpack': assert_less(reconstruction_error, tol) assert_almost_equal(clf.reconstruction_error_, reconstruction_error, decimal=4) # re-embed a noisy version of X using the transform method noise = rng.randn(*X.shape) / 100 X_reembedded = clf.transform(X + noise) assert_less(np.linalg.norm(X_reembedded - clf.embedding_), tol)
def test_lle_simple_grid(): # note: ARPACK is numerically unstable, so this test will fail for # some random seeds. We choose 2 because the tests pass. rng = np.random.RandomState(2) tol = 0.1 # grid of equidistant points in 2D, n_components = n_dim X = np.array(list(product(range(5), repeat=2))) X = X + 1e-10 * rng.uniform(size=X.shape) n_components = 2 clf = manifold.LocallyLinearEmbedding(n_neighbors=5, n_components=n_components, random_state=rng) tol = 0.1 N = barycenter_kneighbors_graph(X, clf.n_neighbors).todense() reconstruction_error = linalg.norm(np.dot(N, X) - X, 'fro') assert_less(reconstruction_error, tol) for solver in eigen_solvers: clf.set_params(eigen_solver=solver) clf.fit(X) assert_true(clf.embedding_.shape[1] == n_components) reconstruction_error = linalg.norm( np.dot(N, clf.embedding_) - clf.embedding_, 'fro')**2 assert_less(reconstruction_error, tol) assert_almost_equal(clf.reconstruction_error_, reconstruction_error, decimal=1) # re-embed a noisy version of X using the transform method noise = rng.randn(*X.shape) / 100 X_reembedded = clf.transform(X + noise) assert_less(linalg.norm(X_reembedded - clf.embedding_), tol)
def test_goodness_of_fit(): """Check the goodness of fit computation.""" for name, TreeEstimator in UNSCLF_TREES.items(): X = np.asarray(iris.data, dtype=np.float64) Xl = X[:X.shape[0] // 2] Xr = X[X.shape[0] // 2:] y = iris.target est1 = TreeEstimator(random_state=0) est2 = TreeEstimator(random_state=0) # check exceptions assert_raises(NotFittedError, est1.goodness_of_fit, X, "Failed to raise NotFittedError with {0}".format(name)) assert_raises(ValueError, est1.goodness_of_fit, y, "Failed to raise ValueError with {0}".format(name)) est1.fit(X) est2.fit(Xl) # check that using more training samples leads to a better fit assert_less(est2.goodness_of_fit(Xr, 'maximum'), est1.goodness_of_fit(Xr, 'maximum')) #"Failed `kolmogorov_smirnov` with {0}".format(name)) assert_less(est2.goodness_of_fit(Xr, 'mean_squared_error'), est1.goodness_of_fit(Xr, 'mean_squared_error')) #"Failed `mean_squared_error` with {0}".format(name)) assert_less(est2.goodness_of_fit(Xr, 'mean_squared_error_weighted'), est1.goodness_of_fit(Xr, 'mean_squared_error_weighted'))
def test_lle_manifold(): rng = np.random.RandomState(0) # similar test on a slightly more complex manifold X = np.array(list(product(np.arange(18), repeat=2))) X = np.c_[X, X[:, 0]**2 / 18] X = X + 1e-10 * rng.uniform(size=X.shape) n_components = 2 for method in ["standard", "hessian", "modified", "ltsa"]: clf = manifold.LocallyLinearEmbedding(n_neighbors=6, n_components=n_components, method=method, random_state=0) tol = 1.5 if method == "standard" else 3 N = barycenter_kneighbors_graph(X, clf.n_neighbors).toarray() reconstruction_error = linalg.norm(np.dot(N, X) - X) assert_less(reconstruction_error, tol) for solver in eigen_solvers: clf.set_params(eigen_solver=solver) clf.fit(X) assert_true(clf.embedding_.shape[1] == n_components) reconstruction_error = linalg.norm( np.dot(N, clf.embedding_) - clf.embedding_, 'fro')**2 details = ("solver: %s, method: %s" % (solver, method)) assert_less(reconstruction_error, tol, msg=details) assert_less(np.abs(clf.reconstruction_error_ - reconstruction_error), tol * reconstruction_error, msg=details)
def check_non_linear_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, weightdict = make_regression_data( num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, weightdict = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # Note that we cannot check the feature weights here # since `model_params()` is not defined for non-linear # kernels. # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
def test_randomized_pca_inverse(): # Test that randomized PCA is inversible on dense data rng = np.random.RandomState(0) n, p = 50, 3 X = rng.randn(n, p) # spherical data X[:, 1] *= .00001 # make middle component relatively small X += [5, 4, 3] # make a large mean # same check that we can find the original data from the transformed signal # (since the data is almost of rank n_components) pca = PCA(n_components=2, svd_solver='randomized', random_state=0).fit(X) Y = pca.transform(X) Y_inverse = pca.inverse_transform(Y) assert_almost_equal(X, Y_inverse, decimal=2) # same as above with whitening (approximate reconstruction) pca = PCA(n_components=2, whiten=True, svd_solver='randomized', random_state=0).fit(X) Y = pca.transform(X) Y_inverse = pca.inverse_transform(Y) relative_max_delta = (np.abs(X - Y_inverse) / np.abs(X).mean()).max() assert_less(relative_max_delta, 1e-5)
def test_importances(): """Check variable importances.""" X, y = datasets.make_classification(n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) for name, Tree in CLF_TREES.items(): clf = Tree(random_state=0) clf.fit(X, y) importances = clf.feature_importances_ n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10, "Failed with {0}".format(name)) assert_equal(n_important, 3, "Failed with {0}".format(name)) X_new = clf.transform(X, threshold="mean") assert_less(0, X_new.shape[1], "Failed with {0}".format(name)) assert_less(X_new.shape[1], X.shape[1], "Failed with {0}".format(name))
def test_2d_coef(): X, y = datasets.make_classification( n_samples=1000, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0, n_classes=4) est = LogisticRegression() for threshold, func in zip(["mean", "median"], [np.mean, np.median]): for order in [1, 2, np.inf]: # Fit SelectFromModel a multi-class problem transformer = SelectFromModel(estimator=LogisticRegression(), threshold=threshold, norm_order=order) transformer.fit(X, y) assert_true(hasattr(transformer.estimator_, 'coef_')) X_new = transformer.transform(X) assert_less(X_new.shape[1], X.shape[1]) # Manually check that the norm is correctly performed est.fit(X, y) importances = np.linalg.norm(est.coef_, axis=0, ord=order) feature_mask = importances > func(importances) assert_array_almost_equal(X_new, X[:, feature_mask])
def test_permutation_score(): iris = load_iris() X = iris.data X_sparse = coo_matrix(X) y = iris.target svm = SVC(kernel='linear') cv = cval.StratifiedKFold(y, 2) score, scores, pvalue = cval.permutation_test_score( svm, X, y, zero_one_score, cv) assert_greater(score, 0.9) np.testing.assert_almost_equal(pvalue, 0.0, 1) score_label, _, pvalue_label = cval.permutation_test_score( svm, X, y, zero_one_score, cv, labels=np.ones(y.size), random_state=0) assert_true(score_label == score) assert_true(pvalue_label == pvalue) # check that we obtain the same results with a sparse representation svm_sparse = SVC(kernel='linear') cv_sparse = cval.StratifiedKFold(y, 2, indices=True) score_label, _, pvalue_label = cval.permutation_test_score( svm_sparse, X_sparse, y, zero_one_score, cv_sparse, labels=np.ones(y.size), random_state=0) assert_true(score_label == score) assert_true(pvalue_label == pvalue) # set random y y = np.mod(np.arange(len(y)), 3) score, scores, pvalue = cval.permutation_test_score(svm, X, y, zero_one_score, cv) assert_less(score, 0.5) assert_greater(pvalue, 0.4)
def check_importances(name, criterion, X, y): ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(n_estimators=20, criterion=criterion, random_state=0) est.fit(X, y) importances = est.feature_importances_ n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10) assert_equal(n_important, 3) # XXX: Remove this test in 0.19 after transform support to estimators # is removed. X_new = assert_warns(DeprecationWarning, est.transform, X, threshold="mean") assert_less(0 < X_new.shape[1], X.shape[1]) # Check with parallel importances = est.feature_importances_ est.set_params(n_jobs=2) importances_parrallel = est.feature_importances_ assert_array_almost_equal(importances, importances_parrallel) # Check with sample weights sample_weight = check_random_state(0).randint(1, 10, len(X)) est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=sample_weight) importances = est.feature_importances_ assert_true(np.all(importances >= 0.0)) for scale in [0.5, 10, 100]: est = ForestEstimator(n_estimators=20, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=scale * sample_weight) importances_bis = est.feature_importances_ assert_less(np.abs(importances - importances_bis).mean(), 0.001)
def check_importances(name, criterion, dtype, tolerance): # cast as dype X = X_large.astype(dtype, copy=False) y = y_large.astype(dtype, copy=False) ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(n_estimators=10, criterion=criterion, random_state=0) est.fit(X, y) importances = est.feature_importances_ # The forest estimator can detect that only the first 3 features of the # dataset are informative: n_important = np.sum(importances > 0.1) assert_equal(importances.shape[0], 10) assert_equal(n_important, 3) assert np.all(importances[:3] > 0.1) # Check with parallel importances = est.feature_importances_ est.set_params(n_jobs=2) importances_parallel = est.feature_importances_ assert_array_almost_equal(importances, importances_parallel) # Check with sample weights sample_weight = check_random_state(0).randint(1, 10, len(X)) est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=sample_weight) importances = est.feature_importances_ assert np.all(importances >= 0.0) for scale in [0.5, 100]: est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=scale * sample_weight) importances_bis = est.feature_importances_ assert_less(np.abs(importances - importances_bis).mean(), tolerance)
def check_classification_synthetic(presort, loss): # Test GradientBoostingClassifier on synthetic dataset used by # Hastie et al. in ESLII Example 12.7. X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) X_train, X_test = X[:2000], X[2000:] y_train, y_test = y[:2000], y[2000:] gbrt = GradientBoostingClassifier(n_estimators=100, min_samples_split=2, max_depth=1, loss=loss, learning_rate=1.0, random_state=0) gbrt.fit(X_train, y_train) error_rate = (1.0 - gbrt.score(X_test, y_test)) assert_less(error_rate, 0.09) gbrt = GradientBoostingClassifier(n_estimators=200, min_samples_split=2, max_depth=1, loss=loss, learning_rate=1.0, subsample=0.5, random_state=0, presort=presort) gbrt.fit(X_train, y_train) error_rate = (1.0 - gbrt.score(X_test, y_test)) assert_less(error_rate, 0.08)