async def test_model_random_determinism(c, s, a, b):
    """Two identically-seeded searches over an unstable problem must agree.

    The model, the parameter sampler, and the search all get fixed random
    states, so both runs must pick the same winner with the same score,
    parameters, and coefficients.
    """
    # choose so d == n//10. Then each partial_fit call is very
    # unstable, so models will vary a lot.
    n_samples, n_features = 50, 5
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        chunks=n_samples // 10,
        random_state=0,
    )

    grid = {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }
    base_model = SGDClassifier(random_state=1)
    shared_kwargs = dict(n_initial_parameters=10, random_state=2, max_iter=10)

    first = InverseDecaySearchCV(base_model, grid, **shared_kwargs)
    await first.fit(X, y, classes=[0, 1])

    second = InverseDecaySearchCV(clone(base_model), grid, **shared_kwargs)
    await second.fit(X, y, classes=[0, 1])

    # Identical seeds everywhere -> identical outcomes.
    assert first.best_score_ == second.best_score_
    assert first.best_params_ == second.best_params_
    assert np.allclose(first.best_estimator_.coef_, second.best_estimator_.coef_)
async def test_same_params_with_random_state(c, s, a, b):
    """Searches sharing a random_state must sample identical hyperparameters."""
    X, y = make_classification(n_samples=100, n_features=10, chunks=10, random_state=0)
    base_model = SGDClassifier(tol=1e-3, penalty="elasticnet", random_state=1)
    space = {"alpha": scipy.stats.uniform(1e-4, 1)}

    async def sampled_alphas():
        # Use InverseDecaySearchCV to decay the models and make sure the same
        # ones are selected on every run with this fixed seed.
        search = InverseDecaySearchCV(
            clone(base_model), space, n_initial_parameters=10, random_state=2
        )
        await search.fit(X, y, classes=[0, 1])
        return search.cv_results_["param_alpha"]

    first_draw = await sampled_alphas()
    second_draw = await sampled_alphas()
    assert np.allclose(first_draw, second_draw)
def test_warns_decay_rate_wanted(c, s, a, b):
    """Passing decay_rate to IncrementalSearchCV raises a FutureWarning.

    The same decay_rate passed to InverseDecaySearchCV (the documented
    replacement) must be accepted silently.
    """
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    space = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    deprecated_search = IncrementalSearchCV(
        model, space, max_iter=5, n_initial_parameters=5, decay_rate=1
    )
    match = "decay_rate is deprecated .* Use InverseDecaySearchCV"
    with pytest.warns(FutureWarning, match=match):
        yield deprecated_search.fit(X, y)

    # Make sure old behavior is retained w/o warning
    replacement_search = InverseDecaySearchCV(model, space, decay_rate=1)
    yield replacement_search.fit(X, y)
async def _test_search_basic(decay_rate, input_type, memory, c, s, a, b):
    """Shared driver exercising the basic search contract.

    Parameters
    ----------
    decay_rate : int
        0 selects IncrementalSearchCV, 1 selects InverseDecaySearchCV;
        anything else raises ValueError.
    input_type : str
        "dataframe" converts the dask arrays to dataframes.
    memory : str
        "distributed" keeps dask collections; "local" computes them to
        concrete NumPy/pandas objects first.
    c, s, a, b
        Distributed client/scheduler/worker fixtures.

    Returns
    -------
    bool
        True when every assertion passed.
    """
    X, y = make_classification(n_samples=1000, n_features=5, chunks=(100, 5))
    assert isinstance(X, da.Array)

    # Convert the input into the requested container/memory combination.
    if memory == "distributed" and input_type == "dataframe":
        X = dd.from_array(X)
        y = dd.from_array(y)
        assert isinstance(X, dd.DataFrame)
    elif memory == "local":
        X, y = await c.compute([X, y])
        assert isinstance(X, np.ndarray)
        if input_type == "dataframe":
            X, y = pd.DataFrame(X), pd.DataFrame(y)
            assert isinstance(X, pd.DataFrame)

    model = SGDClassifier(tol=1e-3, loss="log", penalty="elasticnet")
    params = {"alpha": np.logspace(-2, 2, 100), "l1_ratio": np.linspace(0.01, 1, 200)}
    kwargs = dict(n_initial_parameters=20, max_iter=10)

    if decay_rate == 0:
        search = IncrementalSearchCV(model, params, **kwargs)
    elif decay_rate == 1:
        search = InverseDecaySearchCV(model, params, **kwargs)
    else:
        # Was a bare ValueError(); a message makes parametrize failures readable.
        raise ValueError(f"decay_rate must be 0 or 1, got {decay_rate}")

    await search.fit(X, y, classes=[0, 1])

    # --- fitted-state invariants -----------------------------------------
    assert search.history_
    for d in search.history_:
        assert d["partial_fit_calls"] <= search.max_iter + 1
    assert isinstance(search.best_estimator_, SGDClassifier)
    assert search.best_score_ > 0
    assert "visualize" not in search.__dict__
    assert search.best_params_

    # --- cv_results_ schema ----------------------------------------------
    assert search.cv_results_ and isinstance(search.cv_results_, dict)
    assert {
        "mean_partial_fit_time",
        "mean_score_time",
        "std_partial_fit_time",
        "std_score_time",
        "test_score",
        "rank_test_score",
        "model_id",
        "params",
        "partial_fit_calls",
        "param_alpha",
        "param_l1_ratio",
    }.issubset(set(search.cv_results_.keys()))
    assert len(search.cv_results_["param_alpha"]) == 20
    assert all(isinstance(v, np.ndarray) for v in search.cv_results_.values())

    if decay_rate == 0:
        # Without decay the best model strictly dominates on test score.
        assert (
            search.cv_results_["test_score"][search.best_index_]
            >= search.cv_results_["test_score"]
        ).all()
        assert search.cv_results_["rank_test_score"][search.best_index_] == 1
    else:
        assert all(search.cv_results_["test_score"] >= 0)
        assert all(search.cv_results_["rank_test_score"] >= 1)
    assert all(search.cv_results_["partial_fit_calls"] >= 1)
    # Every model id appears exactly once.
    assert len(np.unique(search.cv_results_["model_id"])) == len(
        search.cv_results_["model_id"]
    )
    assert sorted(search.model_history_.keys()) == list(range(20))
    assert set(search.model_history_[0][0].keys()) == {
        "model_id",
        "params",
        "partial_fit_calls",
        "partial_fit_time",
        "score",
        "score_time",
        "elapsed_wall_time",
    }

    # --- prediction API ---------------------------------------------------
    # Dask Objects are lazy
    X_ = await c.compute(X)

    proba = search.predict_proba(X)
    log_proba = search.predict_log_proba(X)
    assert proba.shape[1] == 2
    assert proba.shape[0] == 1000 or math.isnan(proba.shape[0])
    assert log_proba.shape[1] == 2
    # BUG FIX: the nan-fallback previously re-checked proba.shape[0] instead
    # of log_proba.shape[0] (copy-paste error).
    assert log_proba.shape[0] == 1000 or math.isnan(log_proba.shape[0])

    assert isinstance(proba, da.Array)
    assert isinstance(log_proba, da.Array)

    # Lazy and eager predictions must agree.
    proba_ = search.predict_proba(X_)
    log_proba_ = search.predict_log_proba(X_)
    da.utils.assert_eq(proba, proba_)
    da.utils.assert_eq(log_proba, log_proba_)

    decision = search.decision_function(X_)
    assert decision.shape == (1000,) or math.isnan(decision.shape[0])
    return True