def test_cross_val_predict_input_types():
    clf = Ridge()
    # Smoke test
    predictions = cval.cross_val_predict(clf, X, y)
    assert_equal(predictions.shape, (10,))

    # test with multioutput y
    with ignore_warnings(category=ConvergenceWarning):
        predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_equal(predictions.shape, (10, 2))

    predictions = cval.cross_val_predict(clf, X_sparse, y)
    assert_array_equal(predictions.shape, (10,))

    # test with multioutput y
    with ignore_warnings(category=ConvergenceWarning):
        predictions = cval.cross_val_predict(clf, X_sparse, X)
    assert_array_equal(predictions.shape, (10, 2))

    # test with X and y as list
    list_check = lambda x: isinstance(x, list)
    clf = CheckingClassifier(check_X=list_check)
    predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist())

    clf = CheckingClassifier(check_y=list_check)
    predictions = cval.cross_val_predict(clf, X, y.tolist())

    # test with 3d X
    X_3d = X[:, :, np.newaxis]
    check_3d = lambda x: x.ndim == 3
    clf = CheckingClassifier(check_X=check_3d)
    predictions = cval.cross_val_predict(clf, X_3d, y)
    assert_array_equal(predictions.shape, (10,))

def test_fit():
    # Checks whether `fit` method sets all attribute values correctly.
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
        n_estimators=n_estimators)
    ignore_warnings(lshf.fit)(X)

    # The input array is stored unchanged as `_fit_X`
    assert_array_equal(X, lshf._fit_X)
    # A hash function g(p) for each tree
    assert_equal(n_estimators, len(lshf.hash_functions_))
    # Hash length = 32
    assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
    # Number of trees_ in the forest
    assert_equal(n_estimators, len(lshf.trees_))
    # Each tree has entries for every data point
    assert_equal(n_samples, len(lshf.trees_[0]))
    # Original indices after sorting the hashes
    assert_equal(n_estimators, len(lshf.original_indices_))
    # Each set of original indices in a tree has entries for every data point
    assert_equal(n_samples, len(lshf.original_indices_[0]))

def test_enet_l1_ratio():
    # Test that an error message is raised if an estimator that
    # uses _alpha_grid is called with l1_ratio=0
    msg = ("Automatic alpha grid generation is not supported for l1_ratio=0. "
           "Please supply a grid by providing your estimator with the "
           "appropriate `alphas=` argument.")
    X = np.array([[1, 2, 4, 5, 8], [3, 5, 7, 7, 8]]).T
    y = np.array([12, 10, 11, 21, 5])

    assert_raise_message(ValueError, msg, ElasticNetCV(
        l1_ratio=0, random_state=42).fit, X, y)
    assert_raise_message(ValueError, msg, MultiTaskElasticNetCV(
        l1_ratio=0, random_state=42).fit, X, y[:, None])

    # Test that l1_ratio=0 is allowed if we supply a grid manually
    alphas = [0.1, 10]
    estkwds = {'alphas': alphas, 'random_state': 42}
    est_desired = ElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = ElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est_desired.fit(X, y)
        est.fit(X, y)
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)

    est_desired = MultiTaskElasticNetCV(l1_ratio=0.00001, **estkwds)
    est = MultiTaskElasticNetCV(l1_ratio=0, **estkwds)
    with ignore_warnings():
        est.fit(X, y[:, None])
        est_desired.fit(X, y[:, None])
    assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)

def check_warm_start_oob(name):
    # Test that the warm start computes oob score when asked.
    X, y = hastie_X, hastie_y
    ForestEstimator = FOREST_ESTIMATORS[name]
    # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning.
    clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False,
                          random_state=1, bootstrap=True, oob_score=True)
    clf.fit(X, y)

    clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_2.fit(X, y)

    clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15)
    clf_2.fit(X, y)

    assert_true(hasattr(clf_2, 'oob_score_'))
    assert_equal(clf.oob_score_, clf_2.oob_score_)

    # Test that oob_score is computed even if we don't need to train
    # additional trees.
    clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True,
                            random_state=1, bootstrap=True, oob_score=False)
    clf_3.fit(X, y)
    assert_true(not hasattr(clf_3, 'oob_score_'))

    clf_3.set_params(oob_score=True)
    ignore_warnings(clf_3.fit)(X, y)

    assert_equal(clf.oob_score_, clf_3.oob_score_)

def test_no_atoms():
    y_empty = np.zeros_like(y)
    Xy_empty = np.dot(X.T, y_empty)
    gamma_empty = ignore_warnings(orthogonal_mp)(X, y_empty, 1)
    gamma_empty_gram = ignore_warnings(orthogonal_mp)(G, Xy_empty, 1)
    assert_equal(np.all(gamma_empty == 0), True)
    assert_equal(np.all(gamma_empty_gram == 0), True)

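# The wrapper pattern above (`ignore_warnings(fn)(*args)`) is one of three
# interchangeable ways to use `sklearn.utils.testing.ignore_warnings`, as seen
# throughout these examples. A minimal self-contained sketch of all three
# follows; `_noisy` and `_quiet` are hypothetical helpers introduced only for
# this illustration.
import warnings

from sklearn.utils.testing import ignore_warnings


def _noisy():
    warnings.warn("legacy behaviour", DeprecationWarning)
    return 42


# 1) Inline wrapper: wrap a callable, then call it with warnings silenced,
#    as in `ignore_warnings(orthogonal_mp)(X, y_empty, 1)` above.
assert ignore_warnings(_noisy)() == 42

# 2) Context manager: silence warnings (optionally of one category only)
#    for a block of statements.
with ignore_warnings(category=DeprecationWarning):
    _noisy()


# 3) Decorator: silence warnings raised anywhere inside the function.
@ignore_warnings(category=DeprecationWarning)
def _quiet():
    return _noisy()


_quiet()
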
def test_neighbors_accuracy_with_n_estimators():
    # Checks whether accuracy increases as `n_estimators` increases.
    n_estimators = np.array([1, 10, 100])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, t in enumerate(n_estimators):
        lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
            n_candidates=500, n_estimators=t)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Accuracies should be non-decreasing as n_estimators increases
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")

def test_check_estimator_clones():
    # check that check_estimator doesn't modify the estimator it receives
    from sklearn.datasets import load_iris
    iris = load_iris()

    for Estimator in [GaussianMixture, LinearRegression,
                      RandomForestClassifier, NMF, SGDClassifier,
                      MiniBatchKMeans]:
        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
        set_checking_parameters(est)
        set_random_state(est)
        # without fitting
        old_hash = joblib.hash(est)
        check_estimator(est)
        assert_equal(old_hash, joblib.hash(est))

        with ignore_warnings(category=FutureWarning):
            # when 'est = SGDClassifier()'
            est = Estimator()
        set_checking_parameters(est)
        set_random_state(est)
        # with fitting
        est.fit(iris.data + 10, iris.target)
        old_hash = joblib.hash(est)
        check_estimator(est)
        assert_equal(old_hash, joblib.hash(est))

def test_hash_functions():
    # Checks randomness of hash functions.
    # Variance and mean of each hash function (projection vector)
    # should be different from the flattened array of hash functions.
    # If hash functions are not randomly built (seeded with the
    # same value), variances and means of all functions are equal.
    n_samples = 12
    n_features = 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
        n_estimators=n_estimators,
        random_state=rng.randint(0, np.iinfo(np.int32).max))
    ignore_warnings(lshf.fit)(X)

    hash_functions = []
    for i in range(n_estimators):
        hash_functions.append(lshf.hash_functions_[i].components_)

    for i in range(n_estimators):
        assert_not_equal(np.var(hash_functions),
                         np.var(lshf.hash_functions_[i].components_))

    for i in range(n_estimators):
        assert_not_equal(np.mean(hash_functions),
                         np.mean(lshf.hash_functions_[i].components_))

def test_sparse_input():
    # note: Fixed random state in sp.rand is not supported in older scipy.
    # The test should succeed regardless.
    X1 = sp.rand(50, 100)
    X2 = sp.rand(10, 100)
    forest_sparse = ignore_warnings(LSHForest, category=DeprecationWarning)(
        radius=1, random_state=0).fit(X1)
    forest_dense = ignore_warnings(LSHForest, category=DeprecationWarning)(
        radius=1, random_state=0).fit(X1.A)

    d_sparse, i_sparse = forest_sparse.kneighbors(X2, return_distance=True)
    d_dense, i_dense = forest_dense.kneighbors(X2.A, return_distance=True)

    assert_almost_equal(d_sparse, d_dense)
    assert_almost_equal(i_sparse, i_dense)

    d_sparse, i_sparse = forest_sparse.radius_neighbors(X2,
                                                        return_distance=True)
    d_dense, i_dense = forest_dense.radius_neighbors(X2.A,
                                                     return_distance=True)
    assert_equal(d_sparse.shape, d_dense.shape)
    for a, b in zip(d_sparse, d_dense):
        assert_almost_equal(a, b)
    for a, b in zip(i_sparse, i_dense):
        assert_almost_equal(a, b)

def test_enet_float_precision():
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
                             fit_intercept=fit_intercept,
                             normalize=normalize)
            for dtype in [np.float64, np.float32]:
                X = dtype(X)
                y = dtype(y)
                ignore_warnings(clf.fit)(X, y)

                coef[dtype] = clf.coef_
                intercept[dtype] = clf.intercept_

                assert_equal(clf.coef_.dtype, dtype)

            assert_array_almost_equal(coef[np.float32], coef[np.float64],
                                      decimal=4)
            assert_array_almost_equal(intercept[np.float32],
                                      intercept[np.float64],
                                      decimal=4)

def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    clf = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7], cv=3,
                       max_iter=max_iter, precompute=True)
    ignore_warnings(clf.fit)(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)

def test_enet_toy_list_input():
    # Test ElasticNet for various values of alpha and l1_ratio with list y
    # (X is converted to a sparse matrix here)
    X = np.array([[-1], [0], [1]])
    X = sp.csc_matrix(X)
    Y = [-1, 0, 1]                 # just a straight line
    T = np.array([[2], [3], [4]])  # test sample

    # this should be the same as unregularized least squares
    clf = ElasticNet(alpha=0, l1_ratio=1.0)
    # catch warning about alpha=0.
    # this is discouraged but should work.
    ignore_warnings(clf.fit)(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [1])
    assert_array_almost_equal(pred, [2, 3, 4])
    assert_almost_equal(clf.dual_gap_, 0)

    clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=1000)
    clf.fit(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.50819], decimal=3)
    assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3)
    assert_almost_equal(clf.dual_gap_, 0)

    clf = ElasticNet(alpha=0.5, l1_ratio=0.5)
    clf.fit(X, Y)
    pred = clf.predict(T)
    assert_array_almost_equal(clf.coef_, [0.45454], decimal=3)
    assert_array_almost_equal(pred, [0.9090, 1.3636, 1.8181], decimal=3)
    assert_almost_equal(clf.dual_gap_, 0)

def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)   # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    assert_equal(len(rfecv.grid_scores_), X.shape[1])
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)

    # All the noisy variables were filtered out
    assert_array_equal(X_r, iris.data)

    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer("accuracy")
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on grid_scores
    def test_scorer(estimator, X, y):
        return 1.0
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5,
                  scoring=test_scorer)
    rfecv.fit(X, y)
    assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_)))

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    rfecv.fit(X, y)
    assert_equal(len(rfecv.grid_scores_), 6)
    assert_equal(len(rfecv.ranking_), X.shape[1])
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

def test_radius_neighbors():
    # Checks that returned distances are less than `radius`.
    # At least one point should be returned when the `radius` is set
    # to the mean distance from the query point to the other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with those of `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

def test_kernel_ridge_singular_kernel():
    # alpha=0 causes a LinAlgError in computing the dual coefficients,
    # which causes a fallback to an lstsq solver. This is tested here.
    pred = Ridge(alpha=0, fit_intercept=False).fit(X, y).predict(X)
    kr = KernelRidge(kernel="linear", alpha=0)
    ignore_warnings(kr.fit)(X, y)
    pred2 = kr.predict(X)
    assert_array_almost_equal(pred, pred2)

def test_consistent_proba():
    a = svm.SVC(probability=True, max_iter=1, random_state=0)
    with ignore_warnings(category=ConvergenceWarning):
        proba_1 = a.fit(X, Y).predict_proba(X)
    a = svm.SVC(probability=True, max_iter=1, random_state=0)
    with ignore_warnings(category=ConvergenceWarning):
        proba_2 = a.fit(X, Y).predict_proba(X)
    assert_array_almost_equal(proba_1, proba_2)

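# A small standalone illustration (not part of the suite above) that the
# `category` argument is selective: only the named warning class is
# suppressed, while unrelated warnings still propagate to outer filters.
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.testing import ignore_warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    with ignore_warnings(category=ConvergenceWarning):
        warnings.warn("solver stopped early", ConvergenceWarning)  # silenced
        warnings.warn("something unrelated", UserWarning)          # kept

assert [w.category for w in caught] == [UserWarning]
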
def test_nans():
    # Assert that SelectKBest and SelectPercentile can handle NaNs.
    # First feature has zero variance to confuse f_classif (ANOVA) and
    # make it return a NaN.
    X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
    y = [1, 0, 1]

    for select in (SelectKBest(f_classif, 2),
                   SelectPercentile(f_classif, percentile=67)):
        ignore_warnings(select.fit)(X, y)
        assert_array_equal(select.get_support(indices=True), np.array([1, 2]))

def _run_one_hot(X, X2, cat):
    # enc = assert_warns(
    #     DeprecationWarning,
    #     OneHotEncoder, categorical_features=cat)
    enc = OneHotEncoder(categorical_features=cat)
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        Xtr = enc.fit_transform(X)
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        X2tr = enc.fit(X).transform(X2)
    return Xtr, X2tr

def test_path_parameters():
    X, y = make_sparse_data()
    max_iter = 50
    n_alphas = 10
    clf = ElasticNetCV(n_alphas=n_alphas, eps=1e-3, max_iter=max_iter,
                       l1_ratio=0.5, fit_intercept=False)
    ignore_warnings(clf.fit)(X, y)  # new params
    assert_almost_equal(0.5, clf.l1_ratio)
    assert_equal(n_alphas, clf.n_alphas)
    assert_equal(n_alphas, len(clf.alphas_))
    sparse_mse_path = clf.mse_path_
    ignore_warnings(clf.fit)(X.toarray(), y)  # compare with dense data
    assert_almost_equal(clf.mse_path_, sparse_mse_path)

def test_enet_float_precision():
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            for dtype in [np.float64, np.float32]:
                clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize)

                X = dtype(X)
                y = dtype(y)
                ignore_warnings(clf.fit)(X, y)

                coef[('simple', dtype)] = clf.coef_
                intercept[('simple', dtype)] = clf.intercept_

                assert_equal(clf.coef_.dtype, dtype)

                # test precompute Gram array
                Gram = X.T.dot(X)
                clf_precompute = ElasticNet(alpha=0.5, max_iter=100,
                                            precompute=Gram,
                                            fit_intercept=fit_intercept,
                                            normalize=normalize)
                ignore_warnings(clf_precompute.fit)(X, y)
                assert_array_almost_equal(clf.coef_, clf_precompute.coef_)
                assert_array_almost_equal(clf.intercept_,
                                          clf_precompute.intercept_)

                # test multi task enet
                multi_y = np.hstack((y[:, np.newaxis], y[:, np.newaxis]))
                clf_multioutput = MultiTaskElasticNet(
                    alpha=0.5, max_iter=100, fit_intercept=fit_intercept,
                    normalize=normalize)
                clf_multioutput.fit(X, multi_y)
                coef[('multi', dtype)] = clf_multioutput.coef_
                intercept[('multi', dtype)] = clf_multioutput.intercept_
                # check the dtype of the multi-task coefficients as well
                assert_equal(clf_multioutput.coef_.dtype, dtype)

            for v in ['simple', 'multi']:
                assert_array_almost_equal(coef[(v, np.float32)],
                                          coef[(v, np.float64)],
                                          decimal=4)
                assert_array_almost_equal(intercept[(v, np.float32)],
                                          intercept[(v, np.float64)],
                                          decimal=4)

def check_class_weight_balanced_and_bootstrap_multi_output(name):
    # Test class_weight works for multi-output
    ForestClassifier = FOREST_CLASSIFIERS[name]
    _y = np.vstack((y, np.array(y) * 2)).T
    clf = ForestClassifier(class_weight="balanced", random_state=0)
    clf.fit(X, _y)
    clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.0},
                                         {-2: 1.0, 2: 1.0}],
                           random_state=0)
    clf.fit(X, _y)
    # smoke test for subsample and balanced subsample
    clf = ForestClassifier(class_weight="balanced_subsample", random_state=0)
    clf.fit(X, _y)
    clf = ForestClassifier(class_weight="subsample", random_state=0)
    ignore_warnings(clf.fit)(X, _y)

def test_selectpercentile_tiebreaking():
    # Test if SelectPercentile selects the right n_features in case of ties.
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]
    dummy_score = lambda X, y: (X[0], X[0])
    for X in Xs:
        sel = SelectPercentile(dummy_score, percentile=34)
        X1 = ignore_warnings(sel.fit_transform)([X], y)
        assert_equal(X1.shape[1], 1)
        assert_best_scores_kept(sel)

        sel = SelectPercentile(dummy_score, percentile=67)
        X2 = ignore_warnings(sel.fit_transform)([X], y)
        assert_equal(X2.shape[1], 2)
        assert_best_scores_kept(sel)

def test_kneighbors():
    # Checks whether the desired number of neighbors is returned.
    # It is guaranteed to return the requested number of neighbors
    # if `min_hash_match` is set to 0. Returned distances should be
    # in ascending order.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
        min_hash_match=0)
    # Test unfitted estimator
    assert_raises(ValueError, lshf.kneighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors,
                                    return_distance=False)
        # Desired number of neighbors should be returned.
        assert_equal(neighbors.shape[1], n_neighbors)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                           return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Test only neighbors
    neighbors = lshf.kneighbors(queries, n_neighbors=1,
                                return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)
    # Test random point (not in the data set)
    query = rng.randn(n_features).reshape(1, -1)
    lshf.kneighbors(query, n_neighbors=1, return_distance=False)
    # Test n_neighbors at initialization
    neighbors = lshf.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)
    # Test `neighbors` has an integer dtype
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")

def test_train(self, params="wmc"):
    g = mixture.GMM(n_components=self.n_components,
                    covariance_type=self.covariance_type)
    with ignore_warnings(category=DeprecationWarning):
        g.weights_ = self.weights
        g.means_ = self.means
        g.covars_ = 20 * self.covars[self.covariance_type]

    # Create a training set by sampling from the predefined distribution.
    with ignore_warnings(category=DeprecationWarning):
        X = g.sample(n_samples=100)

    g = self.model(n_components=self.n_components,
                   covariance_type=self.covariance_type,
                   random_state=rng, min_covar=1e-1,
                   n_iter=1, init_params=params)
    g.fit(X)

    # Do one training iteration at a time so we can keep track of
    # the log likelihood to make sure that it increases after each
    # iteration.
    trainll = []
    with ignore_warnings(category=DeprecationWarning):
        for _ in range(5):
            g.params = params
            g.init_params = ""
            g.fit(X)
            trainll.append(self.score(g, X))

        g.n_iter = 10
        g.init_params = ""
        g.params = params
        g.fit(X)  # finish fitting

    # Note that the log likelihood will sometimes decrease by a
    # very small amount after it has more or less converged due to
    # the addition of min_covar to the covariance (to prevent
    # underflow). This is why the threshold is set to -0.5
    # instead of 0.
    with ignore_warnings(category=DeprecationWarning):
        delta_min = np.diff(trainll).min()
    self.assertTrue(
        delta_min > self.threshold,
        "The min nll increase is %f which is lower than the admissible"
        " threshold of %f, for model %s. The likelihoods are %s."
        % (delta_min, self.threshold, self.covariance_type, trainll),
    )

def test_multilabel_hamming_loss():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    assert_equal(hamming_loss(y1, y2), 1 / 6)
    assert_equal(hamming_loss(y1, y1), 0)
    assert_equal(hamming_loss(y2, y2), 0)
    assert_equal(hamming_loss(y2, np.logical_not(y2)), 1)
    assert_equal(hamming_loss(y1, np.logical_not(y1)), 1)
    assert_equal(hamming_loss(y1, np.zeros(y1.shape)), 4 / 6)
    assert_equal(hamming_loss(y2, np.zeros(y1.shape)), 0.5)

    with ignore_warnings():  # sequence of sequences is deprecated
        # List of tuple of label
        y1 = [(1, 2,), (0, 2,)]
        y2 = [(2,), (0, 2,)]

        assert_equal(hamming_loss(y1, y2), 1 / 6)
        assert_equal(hamming_loss(y1, y1), 0)
        assert_equal(hamming_loss(y2, y2), 0)
        assert_equal(hamming_loss(y2, [(), ()]), 0.75)
        assert_equal(hamming_loss(y1, [tuple(), (10, )]), 0.625)
        assert_almost_equal(hamming_loss(y2, [tuple(), (10, )],
                                         classes=np.arange(11)), 0.1818, 2)

def test_non_meta_estimators(name, Estimator, check):
    # Common tests for non-meta estimators
    with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning,
                                   UserWarning, FutureWarning)):
        estimator = Estimator()
        set_checking_parameters(estimator)
        check(name, estimator)

def test_no_attributes_set_in_init(name, Estimator):
    # input validation etc for non-meta estimators
    with ignore_warnings(category=(DeprecationWarning, ConvergenceWarning,
                                   UserWarning, FutureWarning)):
        estimator = Estimator()
        # check this on class
        check_no_attributes_set_in_init(name, estimator)

def test_1d_input(name):
    X = iris.data[:, 0]
    X_2d = iris.data[:, 0].reshape((-1, 1))
    y = iris.target

    with ignore_warnings():
        check_1d_input(name, X, X_2d, y)

def test_warm_start(solver, warm_start, fit_intercept, multi_class):
    # A 1-iteration second fit on same data should give almost same result
    # with warm starting, and quite different result without warm starting.
    # Warm starting does not work with liblinear solver.
    X, y = iris.data, iris.target

    clf = LogisticRegression(tol=1e-4, multi_class=multi_class,
                             warm_start=warm_start,
                             solver=solver,
                             random_state=42, max_iter=100,
                             fit_intercept=fit_intercept)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        coef_1 = clf.coef_

        clf.max_iter = 1
        clf.fit(X, y)
    cum_diff = np.sum(np.abs(coef_1 - clf.coef_))
    msg = ("Warm starting issue with %s solver in %s mode "
           "with fit_intercept=%s and warm_start=%s"
           % (solver, multi_class, str(fit_intercept), str(warm_start)))
    if warm_start:
        assert_greater(2.0, cum_diff, msg)
    else:
        assert_greater(cum_diff, 2.0, msg)

def test_multilabel_accuracy_score_subset_accuracy():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    assert_equal(accuracy_score(y1, y2), 0.5)
    assert_equal(accuracy_score(y1, y1), 1)
    assert_equal(accuracy_score(y2, y2), 1)
    assert_equal(accuracy_score(y2, np.logical_not(y2)), 0)
    assert_equal(accuracy_score(y1, np.logical_not(y1)), 0)
    assert_equal(accuracy_score(y1, np.zeros(y1.shape)), 0)
    assert_equal(accuracy_score(y2, np.zeros(y1.shape)), 0)

    with ignore_warnings():  # sequence of sequences is deprecated
        # List of tuple of label
        y1 = [(1, 2,), (0, 2,)]
        y2 = [(2,), (0, 2,)]

        assert_equal(accuracy_score(y1, y2), 0.5)
        assert_equal(accuracy_score(y1, y1), 1)
        assert_equal(accuracy_score(y2, y2), 1)
        assert_equal(accuracy_score(y2, [(), ()]), 0)

        assert_equal(accuracy_score(y1, y2, normalize=False), 1)
        assert_equal(accuracy_score(y1, y1, normalize=False), 2)
        assert_equal(accuracy_score(y2, y2, normalize=False), 2)
        assert_equal(accuracy_score(y2, [(), ()], normalize=False), 0)

def test_warm_start():
    X, y, _, _ = build_dataset()
    clf = ElasticNet(alpha=0.1, max_iter=5, warm_start=True)
    ignore_warnings(clf.fit)(X, y)
    ignore_warnings(clf.fit)(X, y)  # do a second round with 5 iterations

    clf2 = ElasticNet(alpha=0.1, max_iter=10)
    ignore_warnings(clf2.fit)(X, y)
    assert_array_almost_equal(clf2.coef_, clf.coef_)

def test_lasso_lars_vs_lasso_cd_ill_conditioned():
    # Test lasso lars on a very ill-conditioned design, and check that
    # it does not blow up, and stays somewhat close to a solution given
    # by the coordinate descent solver
    # Also test that lasso_path (using lars_path output style) gives
    # the same result as lars_path and previous lasso output style
    # under these conditions.
    rng = np.random.RandomState(42)

    # Generate data
    n, m = 70, 100
    k = 5
    X = rng.randn(n, m)
    w = np.zeros((m, 1))
    i = np.arange(0, m)
    rng.shuffle(i)
    supp = i[:k]
    w[supp] = np.sign(rng.randn(k, 1)) * (rng.rand(k, 1) + 1)
    y = np.dot(X, w)
    sigma = 0.2
    y += sigma * rng.rand(*y.shape)
    y = y.squeeze()

    f = assert_warns_message

    def in_warn_message(msg):
        return 'Early stopping' in msg or 'Dropping a regressor' in msg

    lars_alphas, _, lars_coef = f(ConvergenceWarning,
                                  in_warn_message,
                                  linear_model.lars_path, X, y,
                                  method='lasso')

    with ignore_warnings():
        _, lasso_coef2, _ = linear_model.lasso_path(X, y,
                                                    alphas=lars_alphas,
                                                    tol=1e-6,
                                                    fit_intercept=False)

        lasso_coef = np.zeros((w.shape[0], len(lars_alphas)))
        iter_models = enumerate(linear_model.lasso_path(
            X, y, alphas=lars_alphas, tol=1e-6, return_models=True,
            fit_intercept=False))
        for i, model in iter_models:
            lasso_coef[:, i] = model.coef_

    np.testing.assert_array_almost_equal(lars_coef, lasso_coef, decimal=1)
    np.testing.assert_array_almost_equal(lars_coef, lasso_coef2, decimal=1)
    np.testing.assert_array_almost_equal(lasso_coef, lasso_coef2, decimal=1)

def test_classification_invariance_string_vs_numbers_labels(name):
    # Ensure that classification metrics with string labels are invariant
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20, ))
    y2 = random_state.randint(0, 2, size=(20, ))

    y1_str = np.array(["eggs", "spam"])[y1]
    y2_str = np.array(["eggs", "spam"])[y2]

    pos_label_str = "spam"
    labels_str = ["eggs", "spam"]

    with ignore_warnings():
        metric = CLASSIFICATION_METRICS[name]
        measure_with_number = metric(y1, y2)

        # Ugly, but handle case with a pos_label and label
        metric_str = metric
        if name in METRICS_WITH_POS_LABEL:
            metric_str = partial(metric_str, pos_label=pos_label_str)

        measure_with_str = metric_str(y1_str, y2_str)

        assert_array_equal(measure_with_number, measure_with_str,
                           err_msg="{0} failed string vs number invariance "
                                   "test".format(name))

        measure_with_strobj = metric_str(y1_str.astype('O'),
                                         y2_str.astype('O'))
        assert_array_equal(measure_with_number, measure_with_strobj,
                           err_msg="{0} failed string object vs number "
                                   "invariance test".format(name))

        if name in METRICS_WITH_LABELS:
            metric_str = partial(metric_str, labels=labels_str)
            measure_with_str = metric_str(y1_str, y2_str)
            assert_array_equal(measure_with_number, measure_with_str,
                               err_msg="{0} failed string vs number "
                                       "invariance test".format(name))

            measure_with_strobj = metric_str(y1_str.astype('O'),
                                             y2_str.astype('O'))
            assert_array_equal(measure_with_number, measure_with_strobj,
                               err_msg="{0} failed string vs number "
                                       "invariance test".format(name))

def test_get_params_invariance():
    # Test for estimators that support get_params, that
    # get_params(deep=False) is a subset of get_params(deep=True)
    # Related to issue #4465
    estimators = all_estimators(include_meta_estimators=False,
                                include_other=True)
    for name, Estimator in estimators:
        if hasattr(Estimator, 'get_params'):
            # If the class is deprecated, ignore deprecation warnings
            if hasattr(Estimator.__init__, "deprecated_original"):
                with ignore_warnings():
                    yield check_get_params_invariance, name, Estimator
            else:
                yield check_get_params_invariance, name, Estimator

def test_roc_curve():
    """Test Area under Receiver Operating Characteristic (ROC) curve"""
    y_true, _, probas_pred = make_prediction(binary=True)

    fpr, tpr, thresholds = roc_curve(y_true, probas_pred)
    roc_auc = auc(fpr, tpr)
    expected_auc = _auc(y_true, probas_pred)
    assert_array_almost_equal(roc_auc, expected_auc, decimal=2)
    assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred))
    assert_almost_equal(roc_auc,
                        ignore_warnings(auc_score)(y_true, probas_pred))
    assert_equal(fpr.shape, tpr.shape)
    assert_equal(fpr.shape, thresholds.shape)

def test_verbose_sgd():
    # Test verbose.
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(algorithm='sgd', max_iter=2, verbose=10,
                        hidden_layer_sizes=2)
    old_stdout = sys.stdout
    sys.stdout = output = StringIO()

    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        clf.partial_fit(X, y)

    sys.stdout = old_stdout
    assert 'Iteration' in output.getvalue()

def test_one_hot_encoder_dense():
    # check for sparse=False
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder(sparse=False)
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        # discover max values automatically
        X_trans = enc.fit_transform(X)
        assert_equal(X_trans.shape, (2, 5))
        assert_array_equal(enc.active_features_,
                           np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
        assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       np.array([[0., 1., 0., 1., 1.],
                                 [1., 0., 1., 0., 1.]]))

def test_non_meta_estimators():
    # input validation etc for non-meta estimators
    estimators = all_estimators()
    for name, Estimator in estimators:
        if issubclass(Estimator, BiclusterMixin):
            continue
        if name.startswith("_"):
            continue
        for check in _yield_all_checks(name, Estimator):
            if issubclass(Estimator, ProjectedGradientNMF):
                # The ProjectedGradientNMF class is deprecated
                with ignore_warnings():
                    yield check, name, Estimator
            else:
                yield check, name, Estimator

def test_get_params_invariance():
    # Test for estimators that support get_params, that
    # get_params(deep=False) is a subset of get_params(deep=True)
    # Related to issue #4465
    estimators = all_estimators(include_meta_estimators=False,
                                include_other=True)
    for name, Estimator in estimators:
        if hasattr(Estimator, 'get_params'):
            # The ProjectedGradientNMF class is deprecated
            if issubclass(Estimator, ProjectedGradientNMF):
                with ignore_warnings():
                    yield check_get_params_invariance, name, Estimator
            else:
                yield check_get_params_invariance, name, Estimator

def test_ovr_exceptions():
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    assert_raises(ValueError, ovr.predict, [])
    with ignore_warnings():
        assert_raises(ValueError, predict_ovr,
                      [LinearSVC(), MultinomialNB()],
                      LabelBinarizer(), [])

    # Fail on multioutput data
    assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit,
                  np.array([[1, 0], [0, 1]]),
                  np.array([[1, 2], [3, 1]]))
    assert_raises(ValueError, OneVsRestClassifier(MultinomialNB()).fit,
                  np.array([[1, 0], [0, 1]]),
                  np.array([[1.5, 2.4], [3.1, 0.8]]))

def test_neighbors_accuracy_with_n_candidates():
    # Checks whether accuracy increases as `n_candidates` increases.
    n_candidates_values = np.array([.1, 50, 500])
    n_samples = 100
    n_features = 10
    n_iter = 10
    n_points = 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_candidates_values.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for i, n_candidates in enumerate(n_candidates_values):
        lshf = ignore_warnings(
            LSHForest, category=DeprecationWarning)(n_candidates=n_candidates)
        ignore_warnings(lshf.fit)(X)
        for j in range(n_iter):
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            neighbors = lshf.kneighbors(query, n_neighbors=n_points,
                                        return_distance=False)
            distances = pairwise_distances(query, X, metric='cosine')
            ranks = np.argsort(distances)[0, :n_points]

            intersection = np.intersect1d(ranks, neighbors).shape[0]
            ratio = intersection / float(n_points)
            accuracies[i] = accuracies[i] + ratio

        accuracies[i] = accuracies[i] / float(n_iter)
    # Accuracies should be non-decreasing as n_candidates increases
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # Highest accuracy should be strictly greater than the lowest
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")

def test_iforest():
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({"n_estimators": [3],
                          "max_samples": [0.5, 1.0, 3],
                          "bootstrap": [True, False]})

    with ignore_warnings():
        for params in grid:
            IsolationForest(random_state=rng,
                            **params).fit(X_train).predict(X_test)

def test_candidates():
    # Checks whether candidates are sufficient.
    # This should handle the cases when the number of candidates is 0.
    # The user should be warned when the number of candidates is less than
    # the requested number of neighbors.
    X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1],
                        [6, 10, 2]], dtype=np.float32)
    X_test = np.array([7, 10, 3], dtype=np.float32).reshape(1, -1)

    # For zero candidates
    lshf = LSHForest(min_hash_match=32)
    ignore_warnings(lshf.fit)(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (3, 32))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=3)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3)
    assert_equal(distances.shape[1], 3)

    # For candidates less than n_neighbors
    lshf = LSHForest(min_hash_match=31)
    ignore_warnings(lshf.fit)(X_train)

    message = ("Number of candidates is not sufficient to retrieve"
               " %i neighbors with"
               " min_hash_match = %i. Candidates are filled up"
               " uniformly from unselected"
               " indices." % (5, 31))
    assert_warns_message(UserWarning, message, lshf.kneighbors,
                         X_test, n_neighbors=5)
    distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5)
    assert_equal(distances.shape[1], 5)

def test_1d_1component():
    # Test all of the covariance_types return the same BIC score for
    # 1-dimensional, 1 component fits.
    n_samples, n_dim, n_components = 100, 1, 1
    X = rng.randn(n_samples, n_dim)

    g_full = mixture.GMM(n_components=n_components, covariance_type='full',
                         random_state=rng, min_covar=1e-7, n_iter=1)
    with ignore_warnings(category=DeprecationWarning):
        g_full.fit(X)
        g_full_bic = g_full.bic(X)
        for cv_type in ['tied', 'diag', 'spherical']:
            g = mixture.GMM(n_components=n_components,
                            covariance_type=cv_type,
                            random_state=rng, min_covar=1e-7, n_iter=1)
            g.fit(X)
            assert_array_almost_equal(g.bic(X), g_full_bic)

def test_sparse_matrices():
    # Test that sparse and dense input matrices output the same results.
    X = X_digits_binary[:50]
    y = y_digits_binary[:50]
    X_sparse = csr_matrix(X)
    mlp = MLPClassifier(random_state=1, hidden_layer_sizes=15)
    with ignore_warnings(category=ConvergenceWarning):
        mlp.fit(X, y)
        pred1 = mlp.decision_function(X)
        mlp.fit(X_sparse, y)
        pred2 = mlp.decision_function(X_sparse)
    assert_almost_equal(pred1, pred2)
    pred1 = mlp.predict(X)
    pred2 = mlp.predict(X_sparse)
    assert_array_equal(pred1, pred2)

def test_warm_start_multitask_lasso():
    X, y, X_test, y_test = build_dataset()
    Y = np.c_[y, y]
    clf = MultiTaskLasso(alpha=0.1, max_iter=5, warm_start=True)
    ignore_warnings(clf.fit)(X, Y)
    ignore_warnings(clf.fit)(X, Y)  # do a second round with 5 iterations

    clf2 = MultiTaskLasso(alpha=0.1, max_iter=10)
    ignore_warnings(clf2.fit)(X, Y)
    assert_array_almost_equal(clf2.coef_, clf.coef_)

def test_neighbors_badargs():
    """Test bad argument values: these should all raise ValueErrors"""
    assert_raises(ValueError, neighbors.NearestNeighbors, algorithm='blah')

    X = rng.random_sample((10, 2))
    Xsparse = csr_matrix(X)
    y = np.ones(10)

    for cls in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        assert_raises(ValueError, cls, weights='blah')
        assert_raises(ValueError, cls, p=-1)
        assert_raises(ValueError, cls, algorithm='blah')
        nbrs = cls(algorithm='ball_tree', metric='haversine')
        assert_raises(ValueError, nbrs.predict, X)
        assert_raises(ValueError, ignore_warnings(nbrs.fit), Xsparse, y)
        nbrs = cls()
        assert_raises(ValueError, nbrs.fit, np.ones((0, 2)), np.ones(0))
        assert_raises(ValueError, nbrs.fit, X[:, :, None], y)
        nbrs.fit(X, y)
        assert_raises(ValueError, nbrs.predict, [])

    nbrs = neighbors.NearestNeighbors().fit(X)
    assert_raises(ValueError, nbrs.kneighbors_graph, X, mode='blah')
    assert_raises(ValueError, nbrs.radius_neighbors_graph, X, mode='blah')

def do_generate_rf_optimazed_model(X_train, y_train, parameters):
    file_operations.write_logs(
        FILENAME,
        'Starting RF Grid Search with parameters: ' + str(parameters))
    model = RandomForestClassifier(random_state=my_constants.RANDOM_VALUE,
                                   oob_score=True)
    model_grid = GridSearchCV(model, param_grid=parameters, cv=3,
                              verbose=3, n_jobs=3)
    with ignore_warnings(category=ConvergenceWarning):
        model_grid.fit(X_train, y_train)
    file_operations.write_logs(FILENAME, 'RF Grid search completed')
    return model_grid

def test_enet_path():
    # We use a large number of samples and of informative features so that
    # the l1_ratio selected is more toward ridge than lasso
    X, y, X_test, y_test = build_dataset(n_samples=200, n_features=100,
                                         n_informative_features=100)
    max_iter = 150

    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests
    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    clf = ElasticNetCV(alphas=[0.01, 0.05, 0.1], eps=2e-3,
                       l1_ratio=[0.5, 0.7], cv=3, max_iter=max_iter,
                       precompute=True)
    ignore_warnings(clf.fit)(X, y)

    # Well-conditioned settings, we should have selected our
    # smallest penalty
    assert_almost_equal(clf.alpha_, min(clf.alphas_))
    # Non-sparse ground truth: we should have selected an elastic-net
    # that is closer to ridge than to lasso
    assert_equal(clf.l1_ratio_, min(clf.l1_ratio))

    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)

    # Multi-output/target case
    X, y, X_test, y_test = build_dataset(n_features=10, n_targets=3)
    clf = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7],
                                cv=3, max_iter=max_iter)
    ignore_warnings(clf.fit)(X, y)
    # We are in well-conditioned settings with low noise: we should
    # have a good test-set performance
    assert_greater(clf.score(X_test, y_test), 0.99)
    assert_equal(clf.coef_.shape, (3, 10))

    # Mono-output should have same cross-validated alpha_ and l1_ratio_
    # in both cases.
    X, y, _, _ = build_dataset(n_features=10)
    clf1 = ElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf1.fit(X, y)
    clf2 = MultiTaskElasticNetCV(n_alphas=5, eps=2e-3, l1_ratio=[0.5, 0.7])
    clf2.fit(X, y[:, np.newaxis])
    assert_almost_equal(clf1.l1_ratio_, clf2.l1_ratio_)
    assert_almost_equal(clf1.alpha_, clf2.alpha_)

def test_train_degenerate(self, params='wmc'):
    # Train on degenerate data with 0 in some dimensions
    # Create a training set by sampling from the predefined
    # distribution.
    X = rng.randn(100, self.n_features)
    X.T[1:] = 0
    g = self.model(n_components=2,
                   covariance_type=self.covariance_type,
                   random_state=rng, min_covar=1e-3, n_iter=5,
                   init_params=params)
    with ignore_warnings(category=DeprecationWarning):
        g.fit(X)
        trainll = g.score(X)
    self.assertTrue(np.sum(np.abs(trainll / 100 / X.shape[1])) < 5)

def test_multilabel_jaccard_similarity_score():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])

    # size(y1 \inter y2) = [1, 2]
    # size(y1 \union y2) = [2, 2]
    assert_equal(jaccard_similarity_score(y1, y2), 0.75)
    assert_equal(jaccard_similarity_score(y1, y1), 1)
    assert_equal(jaccard_similarity_score(y2, y2), 1)
    assert_equal(jaccard_similarity_score(y2, np.logical_not(y2)), 0)
    assert_equal(jaccard_similarity_score(y1, np.logical_not(y1)), 0)
    assert_equal(jaccard_similarity_score(y1, np.zeros(y1.shape)), 0)
    assert_equal(jaccard_similarity_score(y2, np.zeros(y1.shape)), 0)

    with ignore_warnings():  # sequence of sequences is deprecated
        # List of tuple of label
        y1 = [(1, 2,), (0, 2,)]
        y2 = [(2,), (0, 2,)]

        assert_equal(jaccard_similarity_score(y1, y2), 0.75)
        assert_equal(jaccard_similarity_score(y1, y1), 1)
        assert_equal(jaccard_similarity_score(y2, y2), 1)
        assert_equal(jaccard_similarity_score(y2, [(), ()]), 0)

        # |y3 inter y4 | = [0, 0, 0]
        # |y3 union y4 | = [2, 2, 3]
        y3 = [(0,), (1,), (3,)]
        y4 = [(4,), (4,), (5, 6)]
        assert_almost_equal(jaccard_similarity_score(y3, y4), 0)

        # |y5 inter y6 | = [0, 1, 1]
        # |y5 union y6 | = [2, 1, 3]
        y5 = [(0,), (1,), (2, 3)]
        y6 = [(1,), (1,), (2, 0)]
        assert_almost_equal(jaccard_similarity_score(y5, y6),
                            (1 + 1 / 3) / 3)

def test_multiclass_multioutput(Estimator):
    # Make sure error is raised for multiclass-multioutput classifiers

    # make multiclass-multioutput dataset
    X, y = make_classification(n_classes=3, n_clusters_per_class=1,
                               random_state=0)
    y = np.array([y, y]).T

    est = Estimator()
    with ignore_warnings(category=FutureWarning):
        est.fit(X, y)

    with pytest.raises(
            ValueError,
            match="Multiclass-multioutput estimators are not supported"):
        partial_dependence(est, X, [0])

def train(self, contexts, responses):
    """Fit the tf-idf transform and compute idf statistics."""
    with ignore_warnings():
        # Ignore deprecated `non_negative` warning.
        self._vectorizer = HashingVectorizer(non_negative=True)
    self._tfidf_transform = TfidfTransformer()
    count_matrix = self._tfidf_transform.fit_transform(
        self._vectorizer.transform(contexts + responses))
    n_samples, n_features = count_matrix.shape
    df = _document_frequency(count_matrix)
    idf = np.log((n_samples - df + 0.5) / (df + 0.5))
    self._idf_diag = sp.spdiags(
        idf, diags=0, m=n_features, n=n_features
    )
    document_lengths = count_matrix.sum(axis=1)
    self._average_document_length = np.mean(document_lengths)

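# The idf computed in `train` above is the BM25-style inverse document
# frequency, idf(t) = log((N - df(t) + 0.5) / (df(t) + 0.5)). A tiny
# standalone sketch with made-up document frequencies shows its shape:
# rare terms get large positive weights, while terms appearing in more
# than half of the documents get negative weights.
import numpy as np

n_samples = 100                 # total number of documents (made up)
df = np.array([1, 50, 99])      # document frequencies (made up)
idf = np.log((n_samples - df + 0.5) / (df + 0.5))
assert idf[0] > 0 > idf[2]      # rare term positive, ubiquitous term negative
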
def test_train_1d(self, params='wmc'):
    # Train on 1-D data
    # Create a training set by sampling from the predefined
    # distribution.
    X = rng.randn(100, 1)
    # X.T[1:] = 0
    g = self.model(n_components=2,
                   covariance_type=self.covariance_type,
                   random_state=rng, min_covar=1e-7, n_iter=5,
                   init_params=params)
    with ignore_warnings(category=DeprecationWarning):
        g.fit(X)
        trainll = g.score(X)
    if isinstance(g, mixture.DPGMM):
        self.assertTrue(np.sum(np.abs(trainll / 100)) < 5)
    else:
        self.assertTrue(np.sum(np.abs(trainll / 100)) < 2)

def test_logistic_regression_sample_weights():
    X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
                               n_classes=2, random_state=0)
    sample_weight = y + 1

    for LR in [LogisticRegression, LogisticRegressionCV]:

        # Test that liblinear fails when sample weights are provided
        clf_lib = LR(solver='liblinear')
        assert_raises(ValueError, clf_lib.fit, X, y,
                      sample_weight=np.ones(y.shape[0]))

        # Test that passing sample_weight as ones is the same as
        # not passing them at all (default None)
        clf_sw_none = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_none.fit(X, y)
        clf_sw_ones = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
        assert_array_almost_equal(clf_sw_none.coef_, clf_sw_ones.coef_,
                                  decimal=4)

        # Test that sample weights work the same with the lbfgs,
        # newton-cg, and 'sag' solvers
        clf_sw_lbfgs = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight)
        clf_sw_n = LR(solver='newton-cg', fit_intercept=False)
        clf_sw_n.fit(X, y, sample_weight=sample_weight)
        clf_sw_sag = LR(solver='sag', fit_intercept=False, tol=1e-10)
        # ignore convergence warning due to small dataset
        with ignore_warnings():
            clf_sw_sag.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
        assert_array_almost_equal(
            clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)

        # Test that passing class_weight as [1,2] is the same as
        # passing class weight = [1,1] but adjusting sample weights
        # to be 2 for all instances of class 2
        clf_cw_12 = LR(solver='lbfgs', fit_intercept=False,
                       class_weight={0: 1, 1: 2})
        clf_cw_12.fit(X, y)
        sample_weight = np.ones(y.shape[0])
        sample_weight[y == 1] = 2
        clf_sw_12 = LR(solver='lbfgs', fit_intercept=False)
        clf_sw_12.fit(X, y, sample_weight=sample_weight)
        assert_array_almost_equal(clf_cw_12.coef_, clf_sw_12.coef_,
                                  decimal=4)

def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]

    alpha_vectors = []
    alpha_values = np.arange(2)
    absolute_sum = lambda x: np.sum(np.abs(x))

    for alpha in alpha_values:
        mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha,
                            random_state=1)
        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
        alpha_vectors.append(np.array([absolute_sum(mlp.coefs_[0]),
                                       absolute_sum(mlp.coefs_[1])]))

    for i in range(len(alpha_values) - 1):
        assert (alpha_vectors[i] > alpha_vectors[i + 1]).all()

def _compute_score(self, X, y=None):
    _, n_features = X.shape
    score = np.empty(len(self.n_clusters_range))
    for k, n_clusters in enumerate(self.n_clusters_range):
        self.estimator.set_params(n_clusters=n_clusters)
        score[k] = self.estimator.fit(X).score(X)
    weights = 1. - np.exp((self.n_clusters_range[:-1] - 1)
                          * np.log(5. / 6.)
                          + np.log(.75)
                          - np.log(n_features))
    with ignore_warnings(category=RuntimeWarning):
        score[1:] = np.where(score[:-1] != 0.,
                             score[1:] / (weights * score[:-1]),
                             1.)
    score[0] = 1.
    return -score

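# Why the RuntimeWarning filter above is needed, as a minimal standalone
# sketch: `np.where` evaluates both branches eagerly, so the division runs
# even for positions where `score[:-1] == 0`, and NumPy emits a
# RuntimeWarning for those entries. The `score` values here are made up.
import numpy as np

from sklearn.utils.testing import ignore_warnings

score = np.array([0., 2., 4.])
with ignore_warnings(category=RuntimeWarning):
    # 2. / 0. warns "divide by zero" before np.where discards the result.
    ratios = np.where(score[:-1] != 0., score[1:] / score[:-1], 1.)

assert np.allclose(ratios, [1., 2.])
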
def train_model(folds, model):
    """
    Evaluation with:
        Matthews correlation coefficient: represents thresholding measures
        AUC: represents ranking measures
        Brier score: represents calibration measures
    """
    scores = []
    # Sums of the time spent fitting the training data and scoring the
    # testing data; both are normalized by the number of folds at the end.
    fit_model_time = 0
    score_model_time = 0

    for X_train, y_train, X_test, y_test in folds:
        # Training
        start_time = time.time()
        with ignore_warnings(category=ConvergenceWarning):
            # Yes, neural networks do not always converge
            model.fit(X_train, y_train)
        fit_model_time += time.time() - start_time
        prediction_train_proba = model.predict_proba(X_train)[:, 1]
        prediction_train = (prediction_train_proba >= 0.5).astype('uint8')

        # Testing
        start_time = time.time()
        prediction_test_proba = model.predict_proba(X_test)[:, 1]
        score_model_time += time.time() - start_time
        prediction_test = (prediction_test_proba >= 0.5).astype('uint8')

        # When all the predictions are of a single class, we get a
        # RuntimeWarning in matthews_corrcoef
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            scores.append([
                sklearn.metrics.matthews_corrcoef(y_test, prediction_test),
                sklearn.metrics.matthews_corrcoef(y_train, prediction_train),
                sklearn.metrics.roc_auc_score(y_test, prediction_test_proba),
                sklearn.metrics.roc_auc_score(y_train, prediction_train_proba),
                sklearn.metrics.brier_score_loss(y_test, prediction_test_proba),
                sklearn.metrics.brier_score_loss(y_train, prediction_train_proba)
            ])

    return (np.mean(scores, axis=0),
            fit_model_time / len(folds),
            score_model_time / len(folds))

def test_same_output_sparse_dense_lasso_and_enet_cv():
    X, y = make_sparse_data(n_samples=40, n_features=10)
    for normalize in [True, False]:
        clfs = ElasticNetCV(max_iter=100, cv=5, normalize=normalize)
        ignore_warnings(clfs.fit)(X, y)
        clfd = ElasticNetCV(max_iter=100, cv=5, normalize=normalize)
        ignore_warnings(clfd.fit)(X.toarray(), y)
        assert_almost_equal(clfs.alpha_, clfd.alpha_, 7)
        assert_almost_equal(clfs.intercept_, clfd.intercept_, 7)
        assert_array_almost_equal(clfs.mse_path_, clfd.mse_path_)
        assert_array_almost_equal(clfs.alphas_, clfd.alphas_)

        clfs = LassoCV(max_iter=100, cv=4, normalize=normalize)
        ignore_warnings(clfs.fit)(X, y)
        clfd = LassoCV(max_iter=100, cv=4, normalize=normalize)
        ignore_warnings(clfd.fit)(X.toarray(), y)
        assert_almost_equal(clfs.alpha_, clfd.alpha_, 7)
        assert_almost_equal(clfs.intercept_, clfd.intercept_, 7)
        assert_array_almost_equal(clfs.mse_path_, clfd.mse_path_)
        assert_array_almost_equal(clfs.alphas_, clfd.alphas_)

def test_scorer_memmap_input(name):
    # Non-regression test for #6147: some score functions would
    # return singleton memmap when computed on memmap data instead of
    # scalar float values.
    if name in REQUIRE_POSITIVE_Y_SCORERS:
        y_mm_1 = _require_positive_y(y_mm)
        y_ml_mm_1 = _require_positive_y(y_ml_mm)
    else:
        y_mm_1, y_ml_mm_1 = y_mm, y_ml_mm

    # UndefinedMetricWarning for P / R scores
    with ignore_warnings():
        scorer, estimator = SCORERS[name], ESTIMATORS[name]
        if name in MULTILABEL_ONLY_SCORERS:
            score = scorer(estimator, X_mm, y_ml_mm_1)
        else:
            score = scorer(estimator, X_mm, y_mm_1)

    assert isinstance(score, numbers.Number), name