def test_grid_search_sparse_scoring(): X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) clf = LinearSVC() cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X_[:180], y_[:180]) y_pred = cv.predict(X_[180:]) C = cv.best_estimator_.C X_ = sp.csr_matrix(X_) clf = LinearSVC() cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]}, scoring="f1") cv.fit(X_[:180], y_[:180]) y_pred2 = cv.predict(X_[180:]) C2 = cv.best_estimator_.C assert_array_equal(y_pred, y_pred2) assert C == C2 # Smoke test the score # np.testing.assert_allclose(f1_score(cv.predict(X_[:180]), y[:180]), # cv.score(X_[:180], y[:180])) # test loss where greater is worse def f1_loss(y_true_, y_pred_): return -f1_score(y_true_, y_pred_) F1Loss = make_scorer(f1_loss, greater_is_better=False) cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]}, scoring=F1Loss) cv.fit(X_[:180], y_[:180]) y_pred3 = cv.predict(X_[180:]) C3 = cv.best_estimator_.C assert C == C3 assert_array_equal(y_pred, y_pred3)
def test_grid_search_precomputed_kernel(): # Test that grid search works when the input features are given in the # form of a precomputed kernel matrix X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) # compute the training kernel matrix corresponding to the linear kernel K_train = np.dot(X_[:180], X_[:180].T) y_train = y_[:180] clf = SVC(kernel='precomputed') cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]}) cv.fit(K_train, y_train) assert cv.best_score_ >= 0 # compute the test kernel matrix K_test = np.dot(X_[180:], X_[:180].T) y_test = y_[180:] y_pred = cv.predict(K_test) assert np.mean(y_pred == y_test) >= 0 # test error is raised when the precomputed kernel is not array-like # or sparse with pytest.raises(ValueError): cv.fit(K_train.tolist(), y_train)
def test_grid_search_sparse(): # Test that grid search works with both dense and sparse matrices X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0) clf = LinearSVC() cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]}) cv.fit(X_[:180], y_[:180]) y_pred = cv.predict(X_[180:]) C = cv.best_estimator_.C X_ = sp.csr_matrix(X_) clf = LinearSVC() cv = DaskGridSearchCV(clf, {'C': [0.1, 1.0]}) cv.fit(X_[:180].tocoo(), y_[:180]) y_pred2 = cv.predict(X_[180:]) C2 = cv.best_estimator_.C assert np.mean(y_pred == y_pred2) >= .9 assert C == C2
def test_pandas_input(): # check cross_val_score doesn't destroy pandas dataframe types = [(MockDataFrame, MockDataFrame)] try: from pandas import Series, DataFrame types.append((DataFrame, Series)) except ImportError: pass X = np.arange(100).reshape(10, 10) y = np.array([0] * 5 + [1] * 5) for InputFeatureType, TargetType in types: # X dataframe, y series X_df, y_ser = InputFeatureType(X), TargetType(y) clf = CheckingClassifier(check_X=lambda x: isinstance(x, InputFeatureType), check_y=lambda x: isinstance(x, TargetType)) grid_search = DaskGridSearchCV(clf, {'foo_param': [1, 2, 3]}) grid_search.fit(X_df, y_ser).score(X_df, y_ser) grid_search.predict(X_df) assert hasattr(grid_search, "cv_results_")
def test_pickle(): # Test that a fit search can be pickled clf = MockClassifier() grid_search = DaskGridSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True) grid_search.fit(X, y) grid_search_pickled = pickle.loads(pickle.dumps(grid_search)) assert_array_almost_equal(grid_search.predict(X), grid_search_pickled.predict(X)) random_search = DaskRandomizedSearchCV(clf, {'foo_param': [1, 2, 3]}, refit=True, n_iter=3) random_search.fit(X, y) random_search_pickled = pickle.loads(pickle.dumps(random_search)) assert_array_almost_equal(random_search.predict(X), random_search_pickled.predict(X))