def test_randomized_svd_transpose_consistency(): """Check that transposing the design matrix has limit impact""" n_samples = 100 n_features = 500 rank = 4 k = 10 X = make_low_rank_matrix(n_samples=n_samples, n_features=n_features, effective_rank=rank, tail_strength=0.5, random_state=0) assert_equal(X.shape, (n_samples, n_features)) U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0) U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0) U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose='auto', random_state=0) U4, s4, V4 = linalg.svd(X, full_matrices=False) assert_almost_equal(s1, s4[:k], decimal=3) assert_almost_equal(s2, s4[:k], decimal=3) assert_almost_equal(s3, s4[:k], decimal=3) assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]), decimal=2) assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]), decimal=2) # in this case 'auto' is equivalent to transpose assert_almost_equal(s2, s3)
def test_RadiusNeighborsRegressor_multioutput(n_samples=40, n_features=5, n_test_pts=10, n_neighbors=3, random_state=0): """Test k-neighbors in multi-output regression with various weight""" rng = np.random.RandomState(random_state) X = 2 * rng.rand(n_samples, n_features) - 1 y = np.sqrt((X ** 2).sum(1)) y /= y.max() y = np.vstack([y, y]).T y_target = y[:n_test_pts] weights = ['uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): rnn = neighbors.RadiusNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm) rnn.fit(X, y) epsilon = 1E-5 * (2 * rng.rand(1, n_features) - 1) y_pred = rnn.predict(X[:n_test_pts] + epsilon) assert_equal(y_pred.shape, y_target.shape) assert_true(np.all(np.abs(y_pred - y_target) < 0.3))
def test_RadiusNeighborsClassifier_multioutput(): """Test k-NN classifier on multioutput data""" rng = check_random_state(0) n_features = 2 n_samples = 40 n_output = 3 X = rng.rand(n_samples, n_features) y = rng.randint(0, 3, (n_samples, n_output)) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) weights = [None, 'uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): # Stack single output prediction y_pred_so = [] for o in range(n_output): rnn = neighbors.RadiusNeighborsClassifier(weights=weights, algorithm=algorithm) rnn.fit(X_train, y_train[:, o]) y_pred_so.append(rnn.predict(X_test)) y_pred_so = np.vstack(y_pred_so).T assert_equal(y_pred_so.shape, y_test.shape) # Multioutput prediction rnn_mo = neighbors.RadiusNeighborsClassifier(weights=weights, algorithm=algorithm) rnn_mo.fit(X_train, y_train) y_pred_mo = rnn_mo.predict(X_test) assert_equal(y_pred_mo.shape, y_test.shape) assert_array_almost_equal(y_pred_mo, y_pred_so)
def test_kfold_valueerrors(): # Check that errors are raised if there is not enough samples assert_raises(ValueError, cval.KFold, 3, 4) # Check that a warning is raised if the least populated class has too few # members. with warnings.catch_warnings(record=True) as w: warnings.simplefilter('always') y = [3, 3, -1, -1, 2] cv = cval.StratifiedKFold(y, 3) # checking there was only one warning. assert_equal(len(w), 1) # checking it has the right type assert_equal(w[0].category, Warning) # checking it's the right warning. This might be a bad test since it's # a characteristic of the code and not a behavior assert_true("The least populated class" in str(w[0])) # Check that despite the warning the folds are still computed even # though all the classes are not necessarily represented at on each # side of the split at each split check_cv_coverage(cv, expected_n_iter=3, n_samples=len(y)) # Error when number of folds is <= 1 assert_raises(ValueError, cval.KFold, 2, 0) assert_raises(ValueError, cval.KFold, 2, 1) assert_raises(ValueError, cval.StratifiedKFold, y, 0) assert_raises(ValueError, cval.StratifiedKFold, y, 1) # When n is not integer: assert_raises(ValueError, cval.KFold, 2.5, 2) # When n_folds is not integer: assert_raises(ValueError, cval.KFold, 5, 1.5) assert_raises(ValueError, cval.StratifiedKFold, y, 1.5)
def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(): """Test radius neighbors in multi-output regression (uniform weight)""" rng = check_random_state(0) n_features = 5 n_samples = 40 n_output = 4 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples, n_output) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for algorithm, weights in product(ALGORITHMS, [None, 'uniform']): rnn = neighbors. RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) rnn.fit(X_train, y_train) neigh_idx = rnn.radius_neighbors(X_test, return_distance=False) y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) y_pred_idx = np.array(y_pred_idx) y_pred = rnn.predict(X_test) assert_equal(y_pred_idx.shape, y_test.shape) assert_equal(y_pred.shape, y_test.shape) assert_array_almost_equal(y_pred, y_pred_idx)
def test_grid_search_sparse_scoring(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) clf = LinearSVC() cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X_[:180], y_[:180]) y_pred = cv.predict(X_[180:]) C = cv.best_estimator_.C X_ = sp.csr_matrix(X_) clf = LinearSVC() cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X_[:180], y_[:180]) y_pred2 = cv.predict(X_[180:]) C2 = cv.best_estimator_.C assert_array_equal(y_pred, y_pred2) assert_equal(C, C2) # Smoke test the score #np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), # cv.score(X_[:180], y[:180])) # test loss where greater is worse def f1_loss(y_true_, y_pred_): return -f1_score(y_true_, y_pred_) F1Loss = make_scorer(f1_loss, greater_is_better=False) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss) cv.fit(X_[:180], y_[:180]) y_pred3 = cv.predict(X_[180:]) C3 = cv.best_estimator_.C assert_equal(C, C3) assert_array_equal(y_pred, y_pred3)
def test_deprecated_score_func(): # test that old deprecated way of passing a score / loss function is still # supported X, y = make_classification(n_samples=200, n_features=100, random_state=0) clf = LinearSVC(random_state=0) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X[:180], y[:180]) y_pred = cv.predict(X[180:]) C = cv.best_estimator_.C clf = LinearSVC(random_state=0) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, score_func=f1_score) with warnings.catch_warnings(record=True): # catch deprecation warning cv.fit(X[:180], y[:180]) y_pred_func = cv.predict(X[180:]) C_func = cv.best_estimator_.C assert_array_equal(y_pred, y_pred_func) assert_equal(C, C_func) # test loss where greater is worse def f1_loss(y_true_, y_pred_): return -f1_score(y_true_, y_pred_) clf = LinearSVC(random_state=0) cv = GridSearchCV(clf, {'C': [0.1, 1.0]}, loss_func=f1_loss) with warnings.catch_warnings(record=True): # catch deprecation warning cv.fit(X[:180], y[:180]) y_pred_loss = cv.predict(X[180:]) C_loss = cv.best_estimator_.C assert_array_equal(y_pred, y_pred_loss) assert_equal(C, C_loss)
def test_class_weights(): # Test class weights. X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] clf = RidgeClassifier(class_weight=None) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 clf = RidgeClassifier(class_weight={1: 0.001}) clf.fit(X, y) # now the hyperplane should rotate clock-wise and # the prediction on this point should shift assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) # check if class_weight = 'balanced' can handle negative labels. clf = RidgeClassifier(class_weight='balanced') clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # class_weight = 'balanced', and class_weight = None should return # same values when y has equal number of all labels X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]]) y = [1, 1, -1, -1] clf = RidgeClassifier(class_weight=None) clf.fit(X, y) clfa = RidgeClassifier(class_weight='balanced') clfa.fit(X, y) assert_equal(len(clfa.classes_), 2) assert_array_almost_equal(clf.coef_, clfa.coef_) assert_array_almost_equal(clf.intercept_, clfa.intercept_)
def test_ridge(): # Ridge regression convergence test using score # TODO: for this test to be robust, we should use a dataset instead # of np.random. rng = np.random.RandomState(0) alpha = 1.0 for solver in ("svd", "sparse_cg", "cholesky", "lsqr"): # With more samples than features n_samples, n_features = 6, 5 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) ridge = Ridge(alpha=alpha, solver=solver) ridge.fit(X, y) assert_equal(ridge.coef_.shape, (X.shape[1], )) assert_greater(ridge.score(X, y), 0.47) if solver == "cholesky": # Currently the only solver to support sample_weight. ridge.fit(X, y, sample_weight=np.ones(n_samples)) assert_greater(ridge.score(X, y), 0.47) # With more features than samples n_samples, n_features = 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) ridge = Ridge(alpha=alpha, solver=solver) ridge.fit(X, y) assert_greater(ridge.score(X, y), .9) if solver == "cholesky": # Currently the only solver to support sample_weight. ridge.fit(X, y, sample_weight=np.ones(n_samples)) assert_greater(ridge.score(X, y), 0.9)
def test_minibatch_set_init_size(): mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters, init_size=666, random_state=42, n_init=1).fit(X) assert_equal(mb_k_means.init_size, 666) assert_equal(mb_k_means.init_size_, n_samples) _check_fitted_model(mb_k_means)
def test_int_input(): X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]] for dtype in [np.int32, np.int64]: X_int = np.array(X_list, dtype=dtype) X_int_csr = sp.csr_matrix(X_int) init_int = X_int[:2] fitted_models = [ KMeans(n_clusters=2).fit(X_int), KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int), # mini batch kmeans is very unstable on such a small dataset hence # we use many inits MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int), MiniBatchKMeans(n_clusters=2, n_init=10, batch_size=2).fit(X_int_csr), MiniBatchKMeans(n_clusters=2, batch_size=2, init=init_int, n_init=1).fit(X_int), MiniBatchKMeans(n_clusters=2, batch_size=2, init=init_int, n_init=1).fit(X_int_csr), ] for km in fitted_models: assert_equal(km.cluster_centers_.dtype, np.float64) expected_labels = [0, 1, 1, 0, 0, 1] scores = np.array([v_measure_score(expected_labels, km.labels_) for km in fitted_models]) assert_array_equal(scores, np.ones(scores.shape[0]))
def test_randomized_pca_check_list(): """Test that the projection by RandomizedPCA on list data is correct""" X = [[1.0, 0.0], [0.0, 1.0]] X_transformed = RandomizedPCA(n_components=1, random_state=0).fit(X).transform(X) assert_equal(X_transformed.shape, (2, 1)) assert_almost_equal(X_transformed.mean(), 0.00, 2) assert_almost_equal(X_transformed.std(), 0.71, 2)
def test_cross_val_predict_input_types(): clf = Ridge() # Smoke test predictions = cval.cross_val_predict(clf, X, y) assert_equal(predictions.shape, (10,)) # test with multioutput y predictions = cval.cross_val_predict(clf, X_sparse, X) assert_equal(predictions.shape, (10, 2)) predictions = cval.cross_val_predict(clf, X_sparse, y) assert_array_equal(predictions.shape, (10,)) # test with multioutput y predictions = cval.cross_val_predict(clf, X_sparse, X) assert_array_equal(predictions.shape, (10, 2)) # test with X and y as list list_check = lambda x: isinstance(x, list) clf = CheckingClassifier(check_X=list_check) predictions = cval.cross_val_predict(clf, X.tolist(), y.tolist()) clf = CheckingClassifier(check_y=list_check) predictions = cval.cross_val_predict(clf, X, y.tolist()) # test with 3d X and X_3d = X[:, :, np.newaxis] check_3d = lambda x: x.ndim == 3 clf = CheckingClassifier(check_X=check_3d) predictions = cval.cross_val_predict(clf, X_3d, y) assert_array_equal(predictions.shape, (10,))
def test_stratified_kfold_no_shuffle(): # Manually check that StratifiedKFold preserves the data ordering as much # as possible on toy datasets in order to avoid hiding sample dependencies # when possible X, y = np.ones(4), [1, 1, 0, 0] splits = StratifiedKFold(2).split(X, y) train, test = next(splits) assert_array_equal(test, [0, 2]) assert_array_equal(train, [1, 3]) train, test = next(splits) assert_array_equal(test, [1, 3]) assert_array_equal(train, [0, 2]) X, y = np.ones(7), [1, 1, 1, 0, 0, 0, 0] splits = StratifiedKFold(2).split(X, y) train, test = next(splits) assert_array_equal(test, [0, 1, 3, 4]) assert_array_equal(train, [2, 5, 6]) train, test = next(splits) assert_array_equal(test, [2, 5, 6]) assert_array_equal(train, [0, 1, 3, 4]) # Check if get_n_splits returns the number of folds assert_equal(5, StratifiedKFold(5).get_n_splits(X, y))
def test_load_fake_lfw_pairs(): lfw_pairs_train = fetch_lfw_pairs(data_home=SCIKIT_LEARN_DATA, download_if_missing=False) # The data is croped around the center as a rectangular bounding box # around the face. Colors are converted to gray levels: assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 62, 47)) # the target is whether the person is the same or not assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) # names of the persons can be found using the target_names array expected_classes = ['Different persons', 'Same person'] assert_array_equal(lfw_pairs_train.target_names, expected_classes) # It is possible to ask for the original data without any croping or color # conversion lfw_pairs_train = fetch_lfw_pairs(data_home=SCIKIT_LEARN_DATA, resize=None, slice_=None, color=True, download_if_missing=False) assert_equal(lfw_pairs_train.pairs.shape, (10, 2, 250, 250, 3)) # the ids and class names are the same as previously assert_array_equal(lfw_pairs_train.target, [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) assert_array_equal(lfw_pairs_train.target_names, expected_classes)
def test_cross_val_predict_with_method(): iris = load_iris() X, y = iris.data, iris.target X, y = shuffle(X, y, random_state=0) classes = len(set(y)) kfold = KFold(len(iris.target)) methods = ['decision_function', 'predict_proba', 'predict_log_proba'] for method in methods: est = LogisticRegression() predictions = cross_val_predict(est, X, y, method=method) assert_equal(len(predictions), len(y)) expected_predictions = np.zeros([len(y), classes]) func = getattr(est, method) # Naive loop (should be same as cross_val_predict): for train, test in kfold.split(X, y): est.fit(X[train], y[train]) expected_predictions[test] = func(X[test]) predictions = cross_val_predict(est, X, y, method=method, cv=kfold) assert_array_almost_equal(expected_predictions, predictions)
def test_set_random_state(): lda = LDA() tree = DecisionTreeClassifier() # LDA doesn't have random state: smoke test set_random_state(lda, 3) set_random_state(tree, 3) assert_equal(tree.random_state, 3)
def test_ovr_always_present(): """Test that ovr works with classes that are always present or absent """ # Note: tests is the case where _ConstantPredictor is utilised X = np.ones((10, 2)) X[:5, :] = 0 y = np.zeros((10, 3)) y[5:, 0] = 1 y[:, 1] = 1 y[:, 2] = 1 [[int(i >= 5), 2, 3] for i in range(10)] ovr = OneVsRestClassifier(LogisticRegression()) assert_warns(UserWarning, ovr.fit, X, y) y_pred = ovr.predict(X) assert_array_equal(np.array(y_pred), np.array(y)) y_pred = ovr.decision_function(X) assert_equal(np.unique(y_pred[:, -2:]), 1) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.ones(X.shape[0])) # y has a constantly absent label y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) assert_warns(UserWarning, ovr.fit, X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
def test_ovr_multilabel(): # Toy dataset where features correspond directly to labels. X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]]) y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"], ["ham", "eggs"], ["ham"]] #y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] Y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]]) classes = set("ham eggs spam".split()) for base_clf in (MultinomialNB(), LinearSVC(random_state=0), LinearRegression(), Ridge(), ElasticNet(), Lasso(alpha=0.5)): # test input as lists of tuples clf = assert_warns(DeprecationWarning, OneVsRestClassifier(base_clf).fit, X, y) assert_equal(set(clf.classes_), classes) y_pred = clf.predict([[0, 4, 4]])[0] assert_equal(set(y_pred), set(["spam", "eggs"])) assert_true(clf.multilabel_) # test input as label indicator matrix clf = OneVsRestClassifier(base_clf).fit(X, Y) y_pred = clf.predict([[0, 4, 4]])[0] assert_array_equal(y_pred, [0, 1, 1]) assert_true(clf.multilabel_)
def test_inverse_transform(): # Test FastICA.inverse_transform n_features = 10 n_samples = 100 n1, n2 = 5, 10 rng = np.random.RandomState(0) X = rng.random_sample((n_samples, n_features)) expected = {(True, n1): (n_features, n1), (True, n2): (n_features, n2), (False, n1): (n_features, n2), (False, n2): (n_features, n2)} for whiten in [True, False]: for n_components in [n1, n2]: n_components_ = (n_components if n_components is not None else X.shape[1]) ica = FastICA(n_components=n_components, random_state=rng, whiten=whiten) with warnings.catch_warnings(record=True): # catch "n_components ignored" warning Xt = ica.fit_transform(X) expected_shape = expected[(whiten, n_components_)] assert_equal(ica.mixing_.shape, expected_shape) X2 = ica.inverse_transform(Xt) assert_equal(X.shape, X2.shape) # reversibility test in non-reduction case if n_components == X.shape[1]: assert_array_almost_equal(X, X2)
def test_ovo_ties(): # test that ties are broken using the decision function, not defaulting to # the smallest label X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]]) y = np.array([2, 0, 1, 2]) multi_clf = OneVsOneClassifier(Perceptron()) ovo_prediction = multi_clf.fit(X, y).predict(X) # recalculate votes to make sure we have a tie predictions = np.vstack([clf.predict(X) for clf in multi_clf.estimators_]) scores = np.vstack([clf.decision_function(X) for clf in multi_clf.estimators_]) # classifiers are in order 0-1, 0-2, 1-2 # aggregate votes: votes = np.zeros((4, 3)) votes[np.arange(4), predictions[0]] += 1 votes[np.arange(4), 2 * predictions[1]] += 1 votes[np.arange(4), 1 + predictions[2]] += 1 # for the first point, there is one vote per class assert_array_equal(votes[0, :], 1) # for the rest, there is no tie and the prediction is the argmax assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:]) # for the tie, the prediction is the class with the highest score assert_equal(ovo_prediction[0], 0) # in the zero-one classifier, the score for 0 is greater than the score for # one. assert_greater(scores[0][0], scores[0][1]) # score for one is greater than score for zero assert_greater(scores[2, 0] - scores[0, 0], scores[0, 0] + scores[1, 0]) # score for one is greater than score for two assert_greater(scores[2, 0] - scores[0, 0], -scores[1, 0] - scores[2, 0])
def check_clustering(name, Alg): X, y = make_blobs(n_samples=50, random_state=1) X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) n_samples, n_features = X.shape # catch deprecation and neighbors warnings with warnings.catch_warnings(record=True): alg = Alg() set_fast_parameters(alg) if hasattr(alg, "n_clusters"): alg.set_params(n_clusters=3) set_random_state(alg) if name == 'AffinityPropagation': alg.set_params(preference=-100) alg.set_params(max_iter=100) # fit alg.fit(X) # with lists alg.fit(X.tolist()) assert_equal(alg.labels_.shape, (n_samples,)) pred = alg.labels_ assert_greater(adjusted_rand_score(pred, y), 0.4) # fit another time with ``fit_predict`` and compare results if name is 'SpectralClustering': # there is no way to make Spectral clustering deterministic :( return set_random_state(alg) with warnings.catch_warnings(record=True): pred2 = alg.fit_predict(X) assert_array_equal(pred, pred2)
def test_make_swiss_roll(): X, t = make_swiss_roll(n_samples=5, noise=0.0, random_state=0) assert_equal(X.shape, (5, 3), "X shape mismatch") assert_equal(t.shape, (5,), "t shape mismatch") assert_array_equal(X[:, 0], t * np.cos(t)) assert_array_equal(X[:, 2], t * np.sin(t))
def check_warm_start(name, random_state=42): # Test if fitting incrementally with warm start gives a forest of the # right size and the same results as a normal fit. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] clf_ws = None for n_estimators in [5, 10]: if clf_ws is None: clf_ws = ForestEstimator(n_estimators=n_estimators, random_state=random_state, warm_start=True) else: clf_ws.set_params(n_estimators=n_estimators) clf_ws.fit(X, y) assert_equal(len(clf_ws), n_estimators) clf_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, warm_start=False) clf_no_ws.fit(X, y) assert_equal(set([tree.random_state for tree in clf_ws]), set([tree.random_state for tree in clf_no_ws])) assert_array_equal(clf_ws.apply(X), clf_no_ws.apply(X), err_msg="Failed with {0}".format(name))
def check_warm_start_oob(name): # Test that the warm start computes oob score when asked. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. clf = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=True) clf.fit(X, y) clf_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, random_state=1, bootstrap=True, oob_score=False) clf_2.fit(X, y) clf_2.set_params(warm_start=True, oob_score=True, n_estimators=15) clf_2.fit(X, y) assert_true(hasattr(clf_2, 'oob_score_')) assert_equal(clf.oob_score_, clf_2.oob_score_) # Test that oob_score is computed even if we don't need to train # additional trees. clf_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, random_state=1, bootstrap=True, oob_score=False) clf_3.fit(X, y) assert_true(not(hasattr(clf_3, 'oob_score_'))) clf_3.set_params(oob_score=True) ignore_warnings(clf_3.fit)(X, y) assert_equal(clf.oob_score_, clf_3.oob_score_)
def test_compute_full_tree(): """Test that the full tree is computed if n_clusters is small""" rng = np.random.RandomState(0) X = rng.randn(10, 2) connectivity = kneighbors_graph(X, 5, include_self=False) # When n_clusters is less, the full tree should be built # that is the number of merges should be n_samples - 1 agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert_equal(n_nodes, n_samples - 1) # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. # we should stop when there are n_clusters. n_clusters = 101 X = rng.randn(200, 2) connectivity = kneighbors_graph(X, 10, include_self=False) agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert_equal(n_nodes, n_samples - n_clusters)
def test_check_estimator_clones(): # check that check_estimator doesn't modify the estimator it receives from sklearn.datasets import load_iris iris = load_iris() for Estimator in [GaussianMixture, LinearRegression, RandomForestClassifier, NMF, SGDClassifier, MiniBatchKMeans]: with ignore_warnings(category=FutureWarning): # when 'est = SGDClassifier()' est = Estimator() set_checking_parameters(est) set_random_state(est) # without fitting old_hash = joblib.hash(est) check_estimator(est) assert_equal(old_hash, joblib.hash(est)) with ignore_warnings(category=FutureWarning): # when 'est = SGDClassifier()' est = Estimator() set_checking_parameters(est) set_random_state(est) # with fitting est.fit(iris.data + 10, iris.target) old_hash = joblib.hash(est) check_estimator(est) assert_equal(old_hash, joblib.hash(est))
def test_make_s_curve(): X, t = make_s_curve(n_samples=5, noise=0.0, random_state=0) assert_equal(X.shape, (5, 3), "X shape mismatch") assert_equal(t.shape, (5,), "t shape mismatch") assert_array_equal(X[:, 0], np.sin(t)) assert_array_equal(X[:, 2], np.sign(t) * (np.cos(t) - 1))
def test_symmetry(): """Test the symmetry of score and loss functions""" random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(20, )) y_pred = random_state.randint(0, 2, size=(20, )) # We shouldn't forget any metrics assert_equal(set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS, THRESHOLDED_METRICS, METRIC_UNDEFINED_MULTICLASS), set(ALL_METRICS)) assert_equal( set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)), set([])) # Symmetric metric for name in SYMMETRIC_METRICS: metric = ALL_METRICS[name] assert_almost_equal(metric(y_true, y_pred), metric(y_pred, y_true), err_msg="%s is not symmetric" % name) # Not symmetric metrics for name in NOT_SYMMETRIC_METRICS: metric = ALL_METRICS[name] assert_true(np.any(metric(y_true, y_pred) != metric(y_pred, y_true)), msg="%s seems to be symmetric" % name)
def check_classifiers_input_shapes(name, Classifier): iris = load_iris() X, y = iris.data, iris.target X, y = shuffle(X, y, random_state=1) X = StandardScaler().fit_transform(X) # catch deprecation warnings with warnings.catch_warnings(record=True): classifier = Classifier() set_fast_parameters(classifier) set_random_state(classifier) # fit classifier.fit(X, y) y_pred = classifier.predict(X) set_random_state(classifier) # Check that when a 2D y is given, a DataConversionWarning is # raised with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always", DataConversionWarning) warnings.simplefilter("ignore", RuntimeWarning) classifier.fit(X, y[:, np.newaxis]) msg = "expected 1 DataConversionWarning, got: %s" % ( ", ".join([str(w_x) for w_x in w])) assert_equal(len(w), 1, msg) assert_array_equal(y_pred, classifier.predict(X))
def test_format_invariance_with_1d_vectors(): random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20, )) y2 = random_state.randint(0, 2, size=(20, )) y1_list = list(y1) y2_list = list(y2) y1_1d, y2_1d = np.array(y1), np.array(y2) assert_equal(y1_1d.ndim, 1) assert_equal(y2_1d.ndim, 1) y1_column = np.reshape(y1_1d, (-1, 1)) y2_column = np.reshape(y2_1d, (-1, 1)) y1_row = np.reshape(y1_1d, (1, -1)) y2_row = np.reshape(y2_1d, (1, -1)) for name, metric in ALL_METRICS.items(): if name in METRIC_UNDEFINED_MULTICLASS: continue measure = metric(y1, y2) assert_almost_equal(metric(y1_list, y2_list), measure, err_msg="%s is not representation invariant " "with list" % name) assert_almost_equal(metric(y1_1d, y2_1d), measure, err_msg="%s is not representation invariant " "with np-array-1d" % name) assert_almost_equal(metric(y1_column, y2_column), measure, err_msg="%s is not representation invariant " "with np-array-column" % name) # Mix format support assert_almost_equal(metric(y1_1d, y2_list), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and list" % name) assert_almost_equal(metric(y1_list, y2_1d), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and list" % name) assert_almost_equal(metric(y1_1d, y2_column), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and np-array-column" % name) assert_almost_equal(metric(y1_column, y2_1d), measure, err_msg="%s is not representation invariant " "with mix np-array-1d and np-array-column" % name) assert_almost_equal(metric(y1_list, y2_column), measure, err_msg="%s is not representation invariant " "with mix list and np-array-column" % name) assert_almost_equal(metric(y1_column, y2_list), measure, err_msg="%s is not representation invariant " "with mix list and np-array-column" % name) # These mix representations aren't allowed assert_raises(ValueError, metric, y1_1d, y2_row) assert_raises(ValueError, metric, y1_row, y2_1d) assert_raises(ValueError, metric, y1_list, y2_row) assert_raises(ValueError, metric, y1_row, y2_list) assert_raises(ValueError, metric, y1_column, y2_row) assert_raises(ValueError, metric, y1_row, y2_column) # NB: We do not test for y1_row, y2_row as these may be # interpreted as multilabel or multioutput data. if (name not in (MULTIOUTPUT_METRICS + THRESHOLDED_MULTILABEL_METRICS + MULTILABELS_METRICS)): assert_raises(ValueError, metric, y1_row, y2_row)
def test_rfecv(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = list(iris.target) # regression test: list should be supported # Test using the score function rfecv = RFECV(estimator=SVC(kernel="linear"), step=1) rfecv.fit(X, y) # non-regression test for missing worst feature: assert_equal(len(rfecv.grid_scores_), X.shape[1]) assert_equal(len(rfecv.ranking_), X.shape[1]) X_r = rfecv.transform(X) # All the noisy variable were filtered out assert_array_equal(X_r, iris.data) # same in sparse rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1) X_sparse = sparse.csr_matrix(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) # Test using a customized loss function scoring = make_scorer(zero_one_loss, greater_is_better=False) rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring) ignore_warnings(rfecv.fit)(X, y) X_r = rfecv.transform(X) assert_array_equal(X_r, iris.data) # Test using a scorer scorer = get_scorer('accuracy') rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer) rfecv.fit(X, y) X_r = rfecv.transform(X) assert_array_equal(X_r, iris.data) # Test fix on grid_scores def test_scorer(estimator, X, y): return 1.0 rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer) rfecv.fit(X, y) assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_))) # In the event of cross validation score ties, the expected behavior of # RFECV is to return the FEWEST features that maximize the CV score. # Because test_scorer always returns 1.0 in this example, RFECV should # reduce the dimensionality to a single feature (i.e. n_features_ = 1) assert_equal(rfecv.n_features_, 1) # Same as the first two tests, but with step=2 rfecv = RFECV(estimator=SVC(kernel="linear"), step=2) rfecv.fit(X, y) assert_equal(len(rfecv.grid_scores_), 6) assert_equal(len(rfecv.ranking_), X.shape[1]) X_r = rfecv.transform(X) assert_array_equal(X_r, iris.data) rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2) X_sparse = sparse.csr_matrix(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) # Verifying that steps < 1 don't blow up. rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2) X_sparse = sparse.csr_matrix(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data)
def test_patch_extractor_max_patches_default(): faces = face_collection extr = PatchExtractor(max_patches=100, random_state=0) patches = extr.transform(faces) assert_equal(patches.shape, (len(faces) * 100, 19, 25))
def histogram(x, y, **kwargs): # Histogram kernel implemented as a callable. assert_equal(kwargs, {}) # no kernel_params that we didn't ask for return np.minimum(x, y).sum()
def test_fastica_simple(add_noise=False): # Test the FastICA algorithm on very simple data. rng = np.random.RandomState(0) # scipy.stats uses the global RNG: np.random.seed(0) n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) # function as fun arg def g_test(x): return x**3, (3 * x**2).mean(axis=-1) algos = ['parallel', 'deflation'] nls = ['logcosh', 'exp', 'cube', g_test] whitening = [True, False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo) assert_raises(ValueError, fastica, m.T, fun=np.tanh, algorithm=algo) else: X = PCA(n_components=2, whiten=True).fit_transform(m.T) k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False) assert_raises(ValueError, fastica, X, fun=np.tanh, algorithm=algo) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m)) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if not add_noise: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2) else: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=0) ica = FastICA(fun=nl, algorithm=algo, random_state=0) sources = ica.fit_transform(m.T) assert_equal(ica.components_.shape, (2, 2)) assert_equal(sources.shape, (1000, 2)) assert_array_almost_equal(sources_fun, sources) assert_array_almost_equal(sources, ica.transform(m.T)) assert_equal(ica.mixing_.shape, (2, 2)) for fn in [np.tanh, "exp(-.5(x^2))"]: ica = FastICA(fun=fn, algorithm=algo, random_state=0) assert_raises(ValueError, ica.fit, m.T) assert_raises(TypeError, FastICA(fun=moves.xrange(10)).fit, m.T)
def check_classifiers_train(name, Classifier): X_m, y_m = make_blobs(random_state=0) X_m, y_m = shuffle(X_m, y_m, random_state=7) X_m = StandardScaler().fit_transform(X_m) # generate binary problem from multi-class one y_b = y_m[y_m != 2] X_b = X_m[y_m != 2] for (X, y) in [(X_m, y_m), (X_b, y_b)]: # catch deprecation warnings classes = np.unique(y) n_classes = len(classes) n_samples, n_features = X.shape with warnings.catch_warnings(record=True): classifier = Classifier() if name in ['BernoulliNB', 'MultinomialNB']: X -= X.min() set_fast_parameters(classifier) # raises error on malformed input for fit assert_raises(ValueError, classifier.fit, X, y[:-1]) # fit classifier.fit(X, y) # with lists classifier.fit(X.tolist(), y.tolist()) assert_true(hasattr(classifier, "classes_")) y_pred = classifier.predict(X) assert_equal(y_pred.shape, (n_samples, )) # training set performance if name not in ['BernoulliNB', 'MultinomialNB']: assert_greater(accuracy_score(y, y_pred), 0.85) # raises error on malformed input for predict assert_raises(ValueError, classifier.predict, X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict: decision = classifier.decision_function(X) if n_classes is 2: assert_equal(decision.shape, (n_samples, )) dec_pred = (decision.ravel() > 0).astype(np.int) assert_array_equal(dec_pred, y_pred) if (n_classes is 3 and not isinstance(classifier, BaseLibSVM)): # 1on1 of LibSVM works differently assert_equal(decision.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input assert_raises(ValueError, classifier.decision_function, X.T) # raises error on malformed input for decision_function assert_raises(ValueError, classifier.decision_function, X.T) except NotImplementedError: pass if hasattr(classifier, "predict_proba"): # predict_proba agrees with predict: y_prob = classifier.predict_proba(X) assert_equal(y_prob.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) # raises error on malformed input assert_raises(ValueError, classifier.predict_proba, X.T) # raises error on malformed input for predict_proba assert_raises(ValueError, classifier.predict_proba, X.T)
def _check_transformer(name, Transformer, X, y): if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): # Those transformers yield non-deterministic output when executed on # a 32bit Python. The same transformers are stable on 64bit Python. # FIXME: try to isolate a minimalistic reproduction case only depending # on numpy & scipy and/or maybe generate a test dataset that does not # cause such unstable behaviors. msg = name + ' is non deterministic on 32bit Python' raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) if name == "KernelPCA": transformer.remove_zero_eig = False set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: y_ = np.c_[y, y] y_[::2, 1] *= 2 else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( x_pred, x_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( X_pred, X_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) # raises error on malformed input for transform if hasattr(X, 'T'): # If it's not an array, it does not have a 'T' property assert_raises(ValueError, transformer.transform, X.T)
def test_rejects_unfitted_classifiers_as_compilable(self): for cls in CLASSIFIERS: assert_equal(CompiledClassifierPredictor.compilable(cls()), False) assert_raises(ValueError, CompiledRegressionPredictor, cls())
def test_pickle(self): obj = KModes() s = pickle.dumps(obj) assert_equal(type(pickle.loads(s)), obj.__class__)
def test_rejects_regressors_as_compilable(self): for cls in REGRESSORS | UNSUPPORTED_CLASSIFIERS: assert_equal(CompiledClassifierPredictor.compilable(cls()), False) assert_raises(ValueError, CompiledRegressionPredictor, cls())
def test_index_offset(): # Make sure translating between 1D and N-D indices are preserved assert_equal(_barnes_hut_tsne.test_index2offset(), 1) assert_equal(_barnes_hut_tsne.test_index_offset(), 1)
def test_rejects_unfitted_regressors_as_compilable(self): for cls in REGRESSORS: assert_equal(CompiledRegressionPredictor.compilable(cls()), False) assert_raises(ValueError, CompiledRegressionPredictor, cls())
def test_ovr_fit_predict_svc(): ovr = OneVsRestClassifier(svm.SVC(gamma="scale")) ovr.fit(iris.data, iris.target) assert_equal(len(ovr.estimators_), 3) assert_greater(ovr.score(iris.data, iris.target), .9)
def clusterer_ensemble_scores(original_labels, n_estimators, n_clusters, weights=None, return_results=False, reference_idx=0): """Function to align the raw clustering results from base estimators. Different from ClustererEnsemble class, this function takes in the output from base estimators directly without training and prediction. Parameters ---------- original_labels : numpy array of shape (n_samples, n_estimators) The raw output from base estimators n_estimators : int The number of base estimators. n_clusters : int, optional (default=8) The number of clusters. weights : numpy array of shape (1, n_estimators) Estimators weights. return_results : bool, optional (default=False) If True, also return the aligned label matrix. reference_idx : int in range [0, n_estimators-1], optional (default=0) The ith base estimator used as the reference for label alignment. Returns ------- aligned_labels : numpy array of shape (n_samples, n_estimators) The aligned label results by using reference_idx estimator as the reference. """ original_labels = _validate_cluster_number(original_labels, n_clusters) alignment_mat = np.zeros([n_clusters, n_estimators]) aligned_labels = np.copy(original_labels) for i in range(n_estimators): inter_mat = _intersection_mat(original_labels, reference_idx, i, n_clusters) index_mapping = _alignment(inter_mat, n_clusters, i, aligned_labels, OFFSET_FACTOR) alignment_mat[:, i] = index_mapping[:, 1] aligned_labels = aligned_labels - OFFSET_FACTOR if weights is not None: assert_equal(original_labels.shape[1], weights.shape[1]) # equal weights if not set else: weights = np.ones([1, n_estimators]) labels_by_vote = majority_vote(aligned_labels, n_classes=n_clusters, weights=weights) if return_results: return labels_by_vote, aligned_labels else: return labels_by_vote
def test_n_iter(): # Test that self.n_iter_ has the correct format. X, y = iris.data, iris.target y_bin = y.copy() y_bin[y_bin == 2] = 0 n_Cs = 4 n_cv_fold = 2 for solver in ['newton-cg', 'liblinear', 'sag', 'lbfgs']: # OvR case n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0] clf = LogisticRegression(tol=1e-2, multi_class='ovr', solver=solver, C=1., random_state=42, max_iter=100) clf.fit(X, y) assert_equal(clf.n_iter_.shape, (n_classes, )) n_classes = np.unique(y).shape[0] clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr', solver=solver, Cs=n_Cs, cv=n_cv_fold, random_state=42, max_iter=100) clf.fit(X, y) assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) clf.fit(X, y_bin) assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) # multinomial case n_classes = 1 if solver in ('liblinear', 'sag'): break clf = LogisticRegression(tol=1e-2, multi_class='multinomial', solver=solver, C=1., random_state=42, max_iter=100) clf.fit(X, y) assert_equal(clf.n_iter_.shape, (n_classes, )) clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial', solver=solver, Cs=n_Cs, cv=n_cv_fold, random_state=42, max_iter=100) clf.fit(X, y) assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) clf.fit(X, y_bin) assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs))
def test_gradient_descent_stops(): # Test stopping conditions of gradient descent. class ObjectiveSmallGradient: def __init__(self): self.it = -1 def __call__(self, _): self.it += 1 return (10 - self.it) / 10.0, np.array([1e-5]) def flat_function(_): return 0.0, np.ones(1) # Gradient norm old_stdout = sys.stdout sys.stdout = StringIO() try: _, error, it = _gradient_descent(ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=100, n_iter_without_progress=100, momentum=0.0, learning_rate=0.0, min_gain=0.0, min_grad_norm=1e-5, min_error_diff=0.0, verbose=2) finally: out = sys.stdout.getvalue() sys.stdout.close() sys.stdout = old_stdout assert_equal(error, 1.0) assert_equal(it, 0) assert ("gradient norm" in out) # Error difference old_stdout = sys.stdout sys.stdout = StringIO() try: _, error, it = _gradient_descent(ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=100, n_iter_without_progress=100, momentum=0.0, learning_rate=0.0, min_gain=0.0, min_grad_norm=0.0, min_error_diff=0.2, verbose=2) finally: out = sys.stdout.getvalue() sys.stdout.close() sys.stdout = old_stdout assert_equal(error, 0.9) assert_equal(it, 1) assert ("error difference" in out) # Maximum number of iterations without improvement old_stdout = sys.stdout sys.stdout = StringIO() try: _, error, it = _gradient_descent(flat_function, np.zeros(1), 0, n_iter=100, n_iter_without_progress=10, momentum=0.0, learning_rate=0.0, min_gain=0.0, min_grad_norm=0.0, min_error_diff=-1.0, verbose=2) finally: out = sys.stdout.getvalue() sys.stdout.close() sys.stdout = old_stdout assert_equal(error, 0.0) assert_equal(it, 11) assert ("did not make any progress" in out) # Maximum number of iterations old_stdout = sys.stdout sys.stdout = StringIO() try: _, error, it = _gradient_descent(ObjectiveSmallGradient(), np.zeros(1), 0, n_iter=11, n_iter_without_progress=100, momentum=0.0, learning_rate=0.0, min_gain=0.0, min_grad_norm=0.0, min_error_diff=0.0, verbose=2) finally: out = sys.stdout.getvalue() sys.stdout.close() sys.stdout = old_stdout assert_equal(error, 0.0) assert_equal(it, 10) assert ("Iteration 10" in out)
def test_ovr_multinomial_iris(): # Test that OvR and multinomial are correct using the iris dataset. train, target = iris.data, iris.target n_samples, n_features = train.shape # Use pre-defined fold as folds generated for different y cv = StratifiedKFold(target, 3) clf = LogisticRegressionCV(cv=cv) clf.fit(train, target) clf1 = LogisticRegressionCV(cv=cv) target_copy = target.copy() target_copy[target_copy == 0] = 1 clf1.fit(train, target_copy) assert_array_almost_equal(clf.scores_[2], clf1.scores_[2]) assert_array_almost_equal(clf.intercept_[2:], clf1.intercept_) assert_array_almost_equal(clf.coef_[2][np.newaxis, :], clf1.coef_) # Test the shape of various attributes. assert_equal(clf.coef_.shape, (3, n_features)) assert_array_equal(clf.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1)) assert_equal(clf.Cs_.shape, (10, )) scores = np.asarray(list(clf.scores_.values())) assert_equal(scores.shape, (3, 3, 10)) # Test that for the iris data multinomial gives a better accuracy than OvR for solver in ['lbfgs', 'newton-cg']: clf_multi = LogisticRegressionCV(solver=solver, multi_class='multinomial', max_iter=15) clf_multi.fit(train, target) multi_score = clf_multi.score(train, target) ovr_score = clf.score(train, target) assert_greater(multi_score, ovr_score) # Test attributes of LogisticRegressionCV assert_equal(clf.coef_.shape, clf_multi.coef_.shape) assert_array_equal(clf_multi.classes_, [0, 1, 2]) coefs_paths = np.asarray(list(clf_multi.coefs_paths_.values())) assert_array_almost_equal(coefs_paths.shape, (3, 3, 10, n_features + 1)) assert_equal(clf_multi.Cs_.shape, (10, )) scores = np.asarray(list(clf_multi.scores_.values())) assert_equal(scores.shape, (3, 3, 10))
def conduct_test(base_clf, test_predict_proba=False): clf = OneVsRestClassifier(base_clf).fit(X, y) assert_equal(set(clf.classes_), classes) y_pred = clf.predict(np.array([[0, 0, 4]]))[0] assert_equal(set(y_pred), set("eggs")) if hasattr(base_clf, 'decision_function'): dec = clf.decision_function(X) assert_equal(dec.shape, (5, )) if test_predict_proba: X_test = np.array([[0, 0, 4]]) probabilities = clf.predict_proba(X_test) assert_equal(2, len(probabilities[0])) assert_equal(clf.classes_[np.argmax(probabilities, axis=1)], clf.predict(X_test)) # test input as label indicator matrix clf = OneVsRestClassifier(base_clf).fit(X, Y) y_pred = clf.predict([[3, 0, 0]])[0] assert_equal(y_pred, 1)
def test_binarizer(): X_ = np.array([[1, 0, 5], [2, 3, 0]]) for init in (np.array, sp.csr_matrix, sp.csc_matrix): X = init(X_.copy()) binarizer = Binarizer(threshold=2.0, copy=True) X_bin = toarray(binarizer.transform(X)) assert_equal(np.sum(X_bin == 0), 4) assert_equal(np.sum(X_bin == 1), 2) X_bin = binarizer.transform(X) assert_equal(type(X), type(X_bin)) binarizer = Binarizer(copy=True).fit(X) X_bin = toarray(binarizer.transform(X)) assert_true(X_bin is not X) assert_equal(np.sum(X_bin == 0), 2) assert_equal(np.sum(X_bin == 1), 4) binarizer = Binarizer(copy=True) X_bin = binarizer.transform(X) assert_true(X_bin is not X) X_bin = toarray(X_bin) assert_equal(np.sum(X_bin == 0), 2) assert_equal(np.sum(X_bin == 1), 4) binarizer = Binarizer(copy=False) X_bin = binarizer.transform(X) assert_true(X_bin is X) X_bin = toarray(X_bin) assert_equal(np.sum(X_bin == 0), 2) assert_equal(np.sum(X_bin == 1), 4)
def test_logreg_intercept_scaling_zero(): # Test that intercept_scaling is ignored when fit_intercept is False clf = LogisticRegression(fit_intercept=False) clf.fit(X, Y1) assert_equal(clf.intercept_, 0.)
def test_sparse_output_multilabel_binarizer(): # test input as iterable of iterables inputs = [ lambda: [(2, 3), (1,), (1, 2)], lambda: (set([2, 3]), set([1]), set([1, 2])), lambda: iter([iter((2, 3)), iter((1,)), set([1, 2])]), ] indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]]) inverse = inputs[0]() for sparse_output in [True, False]: for inp in inputs: # With fit_tranform mlb = MultiLabelBinarizer(sparse_output=sparse_output) got = mlb.fit_transform(inp()) assert_equal(issparse(got), sparse_output) if sparse_output: # verify CSR assumption that indices and indptr have same dtype assert_equal(got.indices.dtype, got.indptr.dtype) got = got.toarray() assert_array_equal(indicator_mat, got) assert_array_equal([1, 2, 3], mlb.classes_) assert_equal(mlb.inverse_transform(got), inverse) # With fit mlb = MultiLabelBinarizer(sparse_output=sparse_output) got = mlb.fit(inp()).transform(inp()) assert_equal(issparse(got), sparse_output) if sparse_output: # verify CSR assumption that indices and indptr have same dtype assert_equal(got.indices.dtype, got.indptr.dtype) got = got.toarray() assert_array_equal(indicator_mat, got) assert_array_equal([1, 2, 3], mlb.classes_) assert_equal(mlb.inverse_transform(got), inverse) assert_raises(ValueError, mlb.inverse_transform, csr_matrix(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]])))
def test_perfect_matches(): for score_func in score_funcs: assert_equal(score_func([], []), 1.0) assert_equal(score_func([0], [1]), 1.0) assert_equal(score_func([0, 0, 0], [0, 0, 0]), 1.0) assert_equal(score_func([0, 1, 0], [42, 7, 42]), 1.0) assert_equal(score_func([0., 1., 0.], [42., 7., 42.]), 1.0) assert_equal(score_func([0., 1., 2.], [42., 7., 2.]), 1.0) assert_equal(score_func([0, 1, 2], [42, 7, 2]), 1.0)
def test_stratified_shuffle_split_even(): # Test the StratifiedShuffleSplit, indices are drawn with a # equal chance n_folds = 5 n_iter = 1000 def assert_counts_are_ok(idx_counts, p): # Here we test that the distribution of the counts # per index is close enough to a binomial threshold = 0.05 / n_splits bf = stats.binom(n_splits, p) for count in idx_counts: p = bf.pmf(count) assert_true( p > threshold, "An index is not drawn with chance corresponding " "to even draws") for n_samples in (6, 22): labels = np.array((n_samples // 2) * [0, 1]) splits = cval.StratifiedShuffleSplit(labels, n_iter=n_iter, test_size=1. / n_folds, random_state=0) train_counts = [0] * n_samples test_counts = [0] * n_samples n_splits = 0 for train, test in splits: n_splits += 1 for counter, ids in [(train_counts, train), (test_counts, test)]: for id in ids: counter[id] += 1 assert_equal(n_splits, n_iter) assert_equal(len(train), splits.n_train) assert_equal(len(test), splits.n_test) assert_equal(len(set(train).intersection(test)), 0) label_counts = np.unique(labels) assert_equal(splits.test_size, 1.0 / n_folds) assert_equal(splits.n_train + splits.n_test, len(labels)) assert_equal(len(label_counts), 2) ex_test_p = float(splits.n_test) / n_samples ex_train_p = float(splits.n_train) / n_samples assert_counts_are_ok(train_counts, ex_train_p) assert_counts_are_ok(test_counts, ex_test_p)
def check_init_vals(optimizer, func, space, x0, n_calls): y0 = list(map(func, x0)) # testing whether the provided points with their evaluations # are taken into account res = optimizer(func, space, x0=x0, y0=y0, random_state=0, n_calls=n_calls) assert_array_equal(res.x_iters[0:len(x0)], x0) assert_array_equal(res.func_vals[0:len(y0)], y0) assert_equal(len(res.x_iters), len(x0) + n_calls) assert_equal(len(res.func_vals), len(x0) + n_calls) # testing whether the provided points are taken into account res = optimizer(func, space, x0=x0, random_state=0, n_calls=n_calls) assert_array_equal(res.x_iters[0:len(x0)], x0) assert_array_equal(res.func_vals[0:len(y0)], y0) assert_equal(len(res.x_iters), n_calls) assert_equal(len(res.func_vals), n_calls) # testing whether providing a single point instead of a list # of points works correctly res = optimizer(func, space, x0=x0[0], random_state=0, n_calls=n_calls) assert_array_equal(res.x_iters[0], x0[0]) assert_array_equal(res.func_vals[0], y0[0]) assert_equal(len(res.x_iters), n_calls) assert_equal(len(res.func_vals), n_calls) # testing whether providing a single point and its evaluation # instead of a list of points and their evaluations works correctly res = optimizer(func, space, x0=x0[0], y0=y0[0], random_state=0, n_calls=n_calls) assert_array_equal(res.x_iters[0], x0[0]) assert_array_equal(res.func_vals[0], y0[0]) assert_equal(len(res.x_iters), 1 + n_calls) assert_equal(len(res.func_vals), 1 + n_calls) # testing whether it correctly raises an exception when # the number of input points and the number of evaluations differ assert_raises(ValueError, dummy_minimize, func, space, x0=x0, y0=[1])
def test_sgd_proba(self): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y) assert_false(hasattr(clf, "predict_proba")) assert_false(hasattr(clf, "predict_log_proba")) # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X, Y) p = clf.predict_proba([[3, 2]]) assert_true(p[0, 1] > 0.5) p = clf.predict_proba([[-1, -1]]) assert_true(p[0, 1] < 0.5) p = clf.predict_log_proba([[3, 2]]) assert_true(p[0, 1] > p[0, 0]) p = clf.predict_log_proba([[-1, -1]]) assert_true(p[0, 1] < p[0, 0]) # log loss multiclass probability estimates clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2) d = clf.decision_function([[.1, -.1], [.3, .2]]) p = clf.predict_proba([[.1, -.1], [.3, .2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert_true(np.all(p[0] >= 0)) p = clf.predict_proba([[-1, -1]]) d = clf.decision_function([[-1, -1]]) assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) l = clf.predict_log_proba([[3, 2]]) p = clf.predict_proba([[3, 2]]) assert_array_almost_equal(np.log(p), l) l = clf.predict_log_proba([[-1, -1]]) p = clf.predict_proba([[-1, -1]]) assert_array_almost_equal(np.log(p), l) # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X2, Y2) d = clf.decision_function([[3, 2]]) p = clf.predict_proba([[3, 2]]) if not isinstance(self, SparseSGDClassifierTestCase): assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) else: # XXX the sparse test gets a different X2 (?) assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) # the following sample produces decision_function values < -1, # which would cause naive normalization to fail (see comment # in SGDClassifier.predict_proba) x = X.mean(axis=0) d = clf.decision_function([x]) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba([x]) assert_array_almost_equal(p[0], [1 / 3.] * 3)
def test_bootstrap_test_sizes(): assert_equal(cval.Bootstrap(10, test_size=0.2).test_size, 2) assert_equal(cval.Bootstrap(10, test_size=2).test_size, 2) assert_equal(cval.Bootstrap(10, test_size=None).test_size, 5)
def test_non_encoded_labels(): dataset = datasets.load_iris() X = dataset.data labels = dataset.target assert_equal(silhouette_score(X, labels + 10), silhouette_score(X, labels))
def test_sgd(self): # Check that SGD gives any results. clf = self.factory(alpha=0.1, n_iter=2, fit_intercept=False) clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) assert_equal(clf.coef_[0], clf.coef_[1])
def test_non_numpy_labels(): dataset = datasets.load_iris() X = dataset.data y = dataset.target assert_equal(silhouette_score(list(X), list(y)), silhouette_score(X, y))
def test_scores(self): predicted_labels = self.estimator.labels_ assert_equal(predicted_labels.shape[0], self.X.shape[0])