def test_keras(c, s, a, b):
    # Mirror the MNIST dataset
    X, y = make_classification(n_classes=10, n_features=784, n_informative=100)
    X = X.astype("float32")
    assert y.dtype == np.dtype("int64")

    model = KerasClassifier(build_fn=_keras_build_fn, lr=0.01, verbose=False)
    params = {"lr": loguniform(1e-3, 1e-1)}

    search = IncrementalSearchCV(
        model, params, max_iter=3, n_initial_parameters=5, decay_rate=None
    )
    yield search.fit(X, y)
    assert search.best_score_ >= 0

    # Make sure the model trains, and scores aren't constant
    scores = {
        ident: [h["score"] for h in hist]
        for ident, hist in search.model_history_.items()
    }
    assert all(len(hist) == 3 for hist in scores.values())
    nuniq_scores = [pd.Series(v).nunique() for v in scores.values()]
    assert max(nuniq_scores) > 1


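# Note: the ``(c, s, a, b)`` signatures in these tests match the client,
# scheduler, and two workers injected by ``distributed.utils_test.gen_cluster``
# (with ``client=True``); the decorators appear to have been stripped from this
# excerpt.
#
# ``_keras_build_fn`` is referenced above but not defined in this section.
# A minimal sketch under that assumption: a small dense network matching the
# 784-feature, 10-class data generated in ``test_keras``. The layer sizes and
# loss are illustrative, not taken from the original.
def _keras_build_fn(lr=0.01):
    from tensorflow.keras.layers import Dense
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.optimizers import SGD

    model = Sequential(
        [
            Dense(512, input_shape=(784,), activation="relu"),
            Dense(10, activation="softmax"),
        ]
    )
    # ``y`` is integer-encoded, so a sparse loss is assumed here.
    model.compile(
        optimizer=SGD(learning_rate=lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

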
def _test_verbosity(c, s, a, b):
    # ``max_iter`` and ``verbose`` are not parameters here; they are expected
    # to be supplied by the enclosing scope (this helper is meant to be
    # wrapped by a test that sets them).
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    search = IncrementalSearchCV(model, params, max_iter=max_iter, verbose=verbose)
    yield search.fit(X, y)
    return search


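# ``ConstantFunction`` is a test helper used throughout this file but not
# defined in this section. A minimal sketch under that assumption: an
# estimator whose score is simply its ``value`` parameter, so a search over
# ``value`` has a known best score (compare the inline ``ConstantClassifier``
# in ``test_search_plateau_patience`` below).
class ConstantFunction(BaseEstimator):
    def __init__(self, value=0):
        self.value = value

    def fit(self, *args, **kwargs):
        return self

    def partial_fit(self, *args, **kwargs):
        return self

    def score(self, *args, **kwargs):
        return self.value

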
def test_pytorch(c, s, a, b):
    n_features = 10
    defaults = {
        "callbacks": False,
        "warm_start": False,
        "train_split": None,
        "max_epochs": 1,
    }
    model = NeuralNetRegressor(
        module=ShallowNet,
        module__n_features=n_features,
        criterion=nn.MSELoss,
        optimizer=optim.SGD,
        optimizer__lr=0.1,
        batch_size=64,
        **defaults,
    )

    model2 = clone(model)
    assert model.callbacks is False
    assert model.warm_start is False
    assert model.train_split is None
    assert model.max_epochs == 1

    params = {"optimizer__lr": loguniform(1e-3, 1e0)}
    X, y = make_regression(n_samples=100, n_features=n_features)
    X = X.astype("float32")
    y = y.astype("float32").reshape(-1, 1)

    search = IncrementalSearchCV(model2, params, max_iter=5, decay_rate=None)
    yield search.fit(X, y)
    assert search.best_score_ >= 0


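# ``ShallowNet`` is referenced above but not defined in this section. A
# minimal sketch under that assumption: a single linear layer mapping
# ``n_features`` inputs to the one regression target used in ``test_pytorch``.
class ShallowNet(nn.Module):
    def __init__(self, n_features=5):
        super().__init__()
        self.layer1 = nn.Linear(n_features, 1)

    def forward(self, x):
        return self.layer1(x)

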
def test_min_max_iter(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    est = SGDClassifier()
    params = {"alpha": np.logspace(-3, 0)}
    search = IncrementalSearchCV(est, params, max_iter=0)
    with pytest.raises(ValueError, match="max_iter < 1 is not supported"):
        yield search.fit(X, y, classes=[0, 1])


def test_numpy_array(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    X, y = yield c.compute([X, y])
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")

    params = {"alpha": np.logspace(-2, 10, 10), "l1_ratio": np.linspace(0.01, 1, 20)}
    search = IncrementalSearchCV(model, params, n_initial_parameters=10)
    yield search.fit(X, y, classes=[0, 1])


def test_small(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": [0.1, 0.5, 0.75, 1.0]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])
    (X_,) = yield c.compute([X])
    search.predict(X_)


def test_search_max_iter(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": np.logspace(-2, 10, 10), "l1_ratio": np.linspace(0.01, 1, 20)}

    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=1)
    yield search.fit(X, y, classes=[0, 1])
    for d in search.history_:
        assert d["partial_fit_calls"] <= 1


def test_warns_scores_per_fit(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)

    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, scores_per_fit=2)
    with pytest.warns(UserWarning, match="deprecated since Dask-ML v1.4.0"):
        yield search.fit(X, y)


def test_transform(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = MiniBatchKMeans(random_state=0)
    params = {"n_clusters": [3, 4, 5], "n_init": [1, 2]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y)
    (X_,) = yield c.compute([X])
    result = search.transform(X_)
    assert result.shape == (100, search.best_estimator_.n_clusters)


async def test_smaller(c, s, a, b):
    # infinite loop
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": [0.1, 0.5]}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    await search.fit(X, y, classes=[0, 1])
    X_ = await c.compute(X)
    search.predict(X_)


def test_gridsearch_func(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3)

    params = {"alpha": np.logspace(-2, 10, 3), "l1_ratio": np.linspace(0.01, 1, 2)}
    search = IncrementalSearchCV(model, params, n_initial_parameters="grid")
    yield search.fit(X, y, classes=[0, 1])

    assert {frozenset(d["params"].items()) for d in search.history_} == {
        frozenset(d.items()) for d in ParameterGrid(params)
    }


def test_numpy_array(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    X, y = yield c.compute([X, y])
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {
        "alpha": np.logspace(-5, -3, 10),
        "l1_ratio": np.linspace(0, 1, 20),
    }
    search = IncrementalSearchCV(model, params, n_initial_parameters=10, max_iter=10)
    yield search.fit(X, y, classes=[0, 1])

    # Smoke test to ensure the search completed successfully
    assert search.best_score_ > 0


async def test_search_plateau_tol(c, s, a, b):
    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # Every 3 calls, the score increases by 3. With tol=1, the model improved enough.
    search = IncrementalSearchCV(model, params, patience=3, tol=1, max_iter=10)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    await search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # Every 3 calls, the score increases by 3. With tol=4, the model didn't improve enough.
    search = IncrementalSearchCV(model, params, patience=3, tol=4, max_iter=10)
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    await search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}


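# ``LinearFunction`` is used above (and in ``test_search_basic_patience``) but
# not defined at module level in this section. A minimal sketch, assuming it
# matches the inline definition that appears in the later
# ``test_search_plateau_tol`` variant below: the score grows by ``slope`` on
# every ``partial_fit`` call.
class LinearFunction(BaseEstimator):
    def __init__(self, intercept=0, slope=1, foo=0):
        self._num_calls = 0
        self.intercept = intercept
        self.slope = slope
        super(LinearFunction, self).__init__()

    def fit(self, *args):
        return self

    def partial_fit(self, *args, **kwargs):
        self._num_calls += 1
        return self

    def score(self, *args, **kwargs):
        return self.intercept + self.slope * self._num_calls

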
async def test_warns_decay_rate(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    kwargs = dict(max_iter=5, n_initial_parameters=5)
    search = IncrementalSearchCV(model, params, **kwargs)
    match = r"deprecated since Dask-ML v1.4.0."
    with pytest.warns(FutureWarning, match=match):
        await search.fit(X, y)

    # Make sure the fix suggested by the printed warning works:
    # passing decay_rate explicitly does not warn.
    search = IncrementalSearchCV(model, params, decay_rate=None, **kwargs)
    await search.fit(X, y)


def test_warns_decay_rate_wanted(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()
    search = IncrementalSearchCV(
        model, params, max_iter=5, n_initial_parameters=5, decay_rate=1
    )
    match = "decay_rate is deprecated .* Use InverseDecaySearchCV"
    with pytest.warns(FutureWarning, match=match):
        yield search.fit(X, y)

    # Make sure the old behavior is retained without a warning
    search = InverseDecaySearchCV(model, params, decay_rate=1)
    yield search.fit(X, y)


def test_search_patience_infeasible_tol(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    rng = check_random_state(42)
    params = {"value": rng.rand(1000)}
    model = ConstantFunction()

    max_iter = 10
    score_increase = -10
    search = IncrementalSearchCV(
        model, params, max_iter=max_iter, patience=3, tol=score_increase
    )
    yield search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    assert hist.partial_fit_calls.max() == max_iter


def test_search_plateau_patience(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    class ConstantClassifier(SGDClassifier):
        def __init__(self, value=0):
            self.value = value
            super(ConstantClassifier, self).__init__(tol=1e-3)

        def score(self, *args, **kwargs):
            return self.value

    params = {"value": np.random.rand(10)}
    model = ConstantClassifier()

    search = IncrementalSearchCV(
        model, params, n_initial_parameters=10, patience=5, tol=0, max_iter=10
    )
    yield search.fit(X, y, classes=[0, 1])

    assert search.history_
    for h in search.history_:
        assert h["partial_fit_calls"] <= 5

    assert isinstance(search.best_estimator_, SGDClassifier)
    assert search.best_score_ == params["value"].max() == search.best_estimator_.value
    assert "visualize" not in search.__dict__
    assert search.best_score_ > 0

    X_test, y_test = yield c.compute([X, y])
    search.predict(X_test)
    search.score(X_test, y_test)


async def test_search_invalid_patience(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    search = IncrementalSearchCV(model, params, patience=1, max_iter=10)
    with pytest.raises(ValueError, match="patience >= 2"):
        await search.fit(X, y, classes=[0, 1])

    search = IncrementalSearchCV(model, params, patience=2.0, max_iter=10)
    with pytest.raises(ValueError, match="patience must be an integer"):
        await search.fit(X, y, classes=[0, 1])

    # Make sure this passes
    search = IncrementalSearchCV(model, params, patience=False, max_iter=10)
    await search.fit(X, y, classes=[0, 1])
    assert search.history_


async def test_verbosity_types(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}

    for verbose in [-1.0, 1.2]:
        search = IncrementalSearchCV(model, params, verbose=verbose, max_iter=3)
        with pytest.raises(ValueError, match="0 <= verbose <= 1"):
            await search.fit(X, y)

    for verbose in [0.0, 0, 1, 1.0, True, False]:
        search = IncrementalSearchCV(model, params, verbose=verbose, max_iter=3)
        await search.fit(X, y)


async def test_search_basic_patience(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))

    rng = check_random_state(42)
    params = {"slope": 2 + rng.rand(1000)}
    model = LinearFunction()

    # Test the case where tol is too small (all models finish)
    max_iter = 15
    patience = 5
    increase_after_patience = patience
    search = IncrementalSearchCV(
        model,
        params,
        max_iter=max_iter,
        tol=increase_after_patience,
        patience=patience,
        fits_per_score=3,
    )
    await search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    # +1 (and +2 below) because scores_per_fit isn't exact
    assert hist.partial_fit_calls.max() == max_iter + 1

    # Test the case where tol is too large (no models finish)
    patience = 5
    increase_after_patience = patience
    params = {"slope": 0 + 0.9 * rng.rand(1000)}
    search = IncrementalSearchCV(
        model,
        params,
        max_iter=max_iter,
        tol=increase_after_patience,
        patience=patience,
        fits_per_score=3,
    )
    await search.fit(X, y, classes=[0, 1])

    hist = pd.DataFrame(search.history_)
    assert hist.partial_fit_calls.max() == patience + 2


async def test_model_future(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=10)
    params = {"value": np.random.RandomState(42).rand(1000)}
    model = ConstantFunction()

    # The model may also be passed as a Dask future instead of a concrete estimator
    model_future = await c.scatter(model)

    search = IncrementalSearchCV(model_future, params, max_iter=10)
    await search.fit(X, y, classes=[0, 1])

    assert search.history_
    assert search.best_score_ > 0


def test_same_models_with_random_state(c, s, a, b):
    X, y = make_classification(
        n_samples=100, n_features=2, chunks=(10, 5), random_state=0
    )
    model = Incremental(
        SGDClassifier(tol=-np.inf, penalty="elasticnet", random_state=42, eta0=0.1)
    )
    params = {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }
    params = {"estimator__" + k: v for k, v in params.items()}

    search1 = IncrementalSearchCV(
        clone(model), params, n_initial_parameters=10, random_state=0
    )
    search2 = IncrementalSearchCV(
        clone(model), params, n_initial_parameters=10, random_state=0
    )
    yield search1.fit(X, y, classes=[0, 1])
    yield search2.fit(X, y, classes=[0, 1])

    assert search1.best_score_ == search2.best_score_
    assert search1.best_params_ == search2.best_params_
    assert np.allclose(search1.best_estimator_.coef_, search2.best_estimator_.coef_)


def test_high_performing_models_are_retained_with_patience(c, s, a, b):
    """
    This test covers a case where high-performing models plateau before the
    search is finished.

    This covers the use case where one poor-performing model takes a long time
    to converge, but all other high-performing models have finished (and
    plateaued).

    Details
    -------
    This test defines

    * low-performing models that continue to improve
    * high-performing models that are constant

    It uses a small tolerance to stop the constant (and high-performing)
    models.

    This test is only concerned with making sure the high-performing model is
    retained after it has reached a plateau. It is not concerned with making
    sure models are killed off at the correct times.
    """
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    params = {"final_score": [1, 2, 3, 4, 5]}

    search = IncrementalSearchCV(
        _MaybeLinearFunction(),
        params,
        patience=2,
        tol=1e-3,  # only stop the constant functions
        decay_rate=0,
        n_initial_parameters="grid",
        max_iter=20,
    )

    search._adapt = _remove_worst_performing_model
    yield search.fit(X, y)
    assert search.best_params_ == {"final_score": 5}


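# ``_MaybeLinearFunction`` and ``_remove_worst_performing_model`` are
# module-level helpers that are not defined in this section. Minimal sketches
# are below, assuming they mirror the inline ``MaybeLinearFunction`` /
# ``remove_worst_performing_model`` definitions that appear in the later
# variant of this test in this file (that variant's bookkeeping assertions are
# omitted here).
class _MaybeLinearFunction(BaseEstimator):
    def __init__(self, final_score=1):
        self.final_score = final_score
        self._calls = 0

    def fit(self, X, y):
        return self

    def partial_fit(self, X, y):
        self._calls += 1
        return self

    def score(self, X, y):
        # Low-scoring models keep improving; high-scoring models are constant.
        if self.final_score <= 3:
            return self.final_score * (1 - 1 / (self._calls + 2))
        return self.final_score


def _remove_worst_performing_model(info):
    # Adaptive rule: among the models with the most partial_fit calls, stop the
    # one with the lowest most-recent score and give every other model one more
    # partial_fit call.
    calls = {v[-1]["partial_fit_calls"] for v in info.values()}
    recent_scores = {
        k: v[-1]["score"]
        for k, v in info.items()
        if v[-1]["partial_fit_calls"] == max(calls)
    }
    return {
        k: 1 for k, v in recent_scores.items() if v > min(recent_scores.values())
    }

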
def test_same_random_state_same_params(c, s, a, b):
    # This makes sure parameters are sampled correctly when a random state is
    # specified.

    # This test makes sure the random state is passed correctly to the
    # successive halvings run by Hyperband.
    seed = 0
    values = scipy.stats.uniform(0, 1)
    h = HyperbandSearchCV(
        ConstantFunction(), {"value": values}, random_state=seed, max_iter=9
    )

    # Make a search for passive random sampling
    passive = IncrementalSearchCV(
        ConstantFunction(),
        {"value": values},
        random_state=seed,
        max_iter=2,
        n_initial_parameters=h.metadata["n_models"],
    )
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    yield h.fit(X, y)
    yield passive.fit(X, y)

    # Check to make sure the Hyperband brackets found the same params
    v_h = h.cv_results_["param_value"]

    # Check to make sure the passive random search had *some* of the same params
    v_passive = passive.cv_results_["param_value"]

    # Sanity checks: all sampled values are unique floats
    assert len(set(v_passive)) == len(v_passive)
    assert len(set(v_h)) == len(v_h)

    # Get the ``value``s that are the same for both searches
    same = set(v_passive).intersection(set(v_h))

    passive_models = h.metadata["brackets"][0]["n_models"]
    assert len(same) == passive_models


def test_same_params_with_random_state(c, s, a, b):
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    model = SGDClassifier(tol=1e-3, penalty="elasticnet")
    params = {"alpha": scipy.stats.uniform(1e-4, 1)}

    search1 = IncrementalSearchCV(
        model, params, n_initial_parameters=10, random_state=0
    )
    yield search1.fit(X, y, classes=[0, 1])
    params1 = search1.cv_results_["param_alpha"]

    search2 = IncrementalSearchCV(
        model, params, n_initial_parameters=10, random_state=0
    )
    yield search2.fit(X, y, classes=[0, 1])
    params2 = search2.cv_results_["param_alpha"]

    assert np.allclose(params1, params2)


def test_search_plateau_tol(c, s, a, b):
    class LinearFunction(BaseEstimator):
        def __init__(self, intercept=0, slope=1, foo=0):
            self._num_calls = 0
            self.intercept = intercept
            self.slope = slope
            super(LinearFunction, self).__init__()

        def fit(self, *args):
            return self

        def partial_fit(self, *args, **kwargs):
            self._num_calls += 1
            return self

        def score(self, *args, **kwargs):
            return self.intercept + self.slope * self._num_calls

    model = LinearFunction(slope=1)
    params = {"foo": np.linspace(0, 1)}

    # Every 3 calls, the score increases by 3. With tol=1, the model improved enough.
    search = IncrementalSearchCV(
        model, params, patience=3, tol=1, max_iter=10, decay_rate=0
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {10}

    # Every 3 calls, the score increases by 3. With tol=4, the model didn't improve enough.
    search = IncrementalSearchCV(
        model, params, patience=3, tol=4, decay_rate=0, max_iter=10
    )
    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    yield search.fit(X, y)
    assert set(search.cv_results_["partial_fit_calls"]) == {3}


def test_history(c, s, a, b):
    X, y = make_classification(n_samples=10, n_features=4, chunks=10)
    model = ConstantFunction()
    params = {"value": scipy.stats.uniform(0, 1)}
    alg = IncrementalSearchCV(model, params, max_iter=9, random_state=42)
    yield alg.fit(X, y)

    gt_zero = lambda x: x >= 0
    gt_one = lambda x: x >= 1
    key_types_and_checks = [
        ("mean_partial_fit_time", float, gt_zero),
        ("mean_score_time", float, gt_zero),
        ("std_partial_fit_time", float, gt_zero),
        ("std_score_time", float, gt_zero),
        ("test_score", float, gt_zero),
        ("rank_test_score", int, gt_one),
        ("model_id", int, None),
        ("partial_fit_calls", int, gt_zero),
        ("params", dict, lambda d: set(d.keys()) == {"value"}),
        ("param_value", float, gt_zero),
    ]
    assert set(alg.cv_results_) == {v[0] for v in key_types_and_checks}
    for column, dtype, condition in key_types_and_checks:
        if dtype:
            assert alg.cv_results_[column].dtype == dtype
        if condition:
            assert all(condition(x) for x in alg.cv_results_[column])

    alg.best_estimator_.fit(X, y)
    alg.best_estimator_.score(X, y)
    alg.score(X, y)

    # Test types/format of all parameters we set after fitting
    assert isinstance(alg.best_index_, int)
    assert isinstance(alg.best_estimator_, ConstantFunction)
    assert isinstance(alg.best_score_, float)
    assert isinstance(alg.best_params_, dict)
    assert isinstance(alg.history_, list)
    assert all(isinstance(h, dict) for h in alg.history_)
    assert isinstance(alg.model_history_, dict)
    assert all(vi in alg.history_ for v in alg.model_history_.values() for vi in v)
    assert all(isinstance(v, np.ndarray) for v in alg.cv_results_.values())
    assert isinstance(alg.multimetric_, bool)

    keys = {
        "score",
        "score_time",
        "partial_fit_calls",
        "partial_fit_time",
        "model_id",
        "elapsed_wall_time",
        "params",
    }
    assert all(set(h.keys()) == keys for h in alg.history_)

    times = [v["elapsed_wall_time"] for v in alg.history_]
    assert (np.diff(times) >= 0).all()

    # Test to make sure history_ is ordered by wall time
    assert (np.diff([v["elapsed_wall_time"] for v in alg.history_]) >= 0).all()

    for model_hist in alg.model_history_.values():
        calls = [h["partial_fit_calls"] for h in model_hist]
        assert (np.diff(calls) >= 1).all() or len(calls) == 1


def test_high_performing_models_are_retained_with_patience(c, s, a, b):
    """
    This test covers a case where high-performing models plateau before the
    search is finished.

    This covers the use case where one poor-performing model takes a long time
    to converge, but all other high-performing models have finished (and
    plateaued).

    Details
    -------
    This test defines

    * low-performing models that continue to improve
    * high-performing models that are constant

    It uses a small tolerance to stop the constant (and high-performing)
    models.

    This test is only concerned with making sure the high-performing model is
    retained after it has reached a plateau. It is not concerned with making
    sure models are killed off at the correct times.
    """

    class MaybeLinearFunction(BaseEstimator):
        def __init__(self, final_score=1):
            self.final_score = final_score
            self._calls = 0

        def fit(self, X, y):
            return self

        def partial_fit(self, X, y):
            self._calls += 1

        def score(self, X, y):
            if self.final_score <= 3:
                return self.final_score * (1 - 1 / (self._calls + 2))
            return self.final_score

    X, y = make_classification(n_samples=100, n_features=5, chunks=(10, 5))
    params = {"final_score": [1, 2, 3, 4, 5]}

    search = IncrementalSearchCV(
        MaybeLinearFunction(),
        params,
        patience=2,
        tol=1e-3,  # only stop the constant functions
        decay_rate=0,
        n_initial_parameters="grid",
        max_iter=20,
    )

    def remove_worst_performing_model(info):
        calls = {v[-1]["partial_fit_calls"] for v in info.values()}
        ests = {v[-1]["params"]["final_score"] for v in info.values()}
        if max(calls) == 1:
            assert all(x in ests for x in [1, 2, 3, 4, 5])
        elif max(calls) == 2:
            assert all(x in ests for x in [2, 3, 4, 5])
            assert all(x not in ests for x in [1])
        elif max(calls) == 3:
            assert all(x in ests for x in [3, 4, 5])
            assert all(x not in ests for x in [1, 2])
        elif max(calls) == 4:
            assert all(x in ests for x in [4, 5])
            assert all(x not in ests for x in [1, 2, 3])
        elif max(calls) == 5:
            assert all(x in ests for x in [5])
            assert all(x not in ests for x in [1, 2, 3, 4])
            return {k: 0 for k in info.keys()}
        recent_scores = {
            k: v[-1]["score"]
            for k, v in info.items()
            if v[-1]["partial_fit_calls"] == max(calls)
        }
        return {
            k: 1
            for k, v in recent_scores.items()
            if v > min(recent_scores.values())
        }

    search._adapt = remove_worst_performing_model
    yield search.fit(X, y)
    assert search.best_params_ == {"final_score": 5}


def _test_search_basic(decay_rate, c, s, a, b):
    X, y = make_classification(n_samples=1000, n_features=5, chunks=(100, 5))
    model = SGDClassifier(tol=1e-3, loss="log", penalty="elasticnet")

    params = {
        "alpha": np.logspace(-2, 2, 100),
        "l1_ratio": np.linspace(0.01, 1, 200),
    }

    search = IncrementalSearchCV(
        model, params, n_initial_parameters=20, max_iter=10, decay_rate=decay_rate
    )
    yield search.fit(X, y, classes=[0, 1])

    assert search.history_
    for d in search.history_:
        assert d["partial_fit_calls"] <= search.max_iter + 1
    assert isinstance(search.best_estimator_, SGDClassifier)
    assert search.best_score_ > 0
    assert "visualize" not in search.__dict__
    assert search.best_params_
    assert search.cv_results_ and isinstance(search.cv_results_, dict)
    assert {
        "mean_partial_fit_time",
        "mean_score_time",
        "std_partial_fit_time",
        "std_score_time",
        "test_score",
        "rank_test_score",
        "model_id",
        "params",
        "partial_fit_calls",
        "param_alpha",
        "param_l1_ratio",
    }.issubset(set(search.cv_results_.keys()))

    assert len(search.cv_results_["param_alpha"]) == 20
    assert all(isinstance(v, np.ndarray) for v in search.cv_results_.values())

    if decay_rate == 0:
        assert (
            search.cv_results_["test_score"][search.best_index_]
            >= search.cv_results_["test_score"]
        ).all()
        assert search.cv_results_["rank_test_score"][search.best_index_] == 1
    else:
        assert all(search.cv_results_["test_score"] >= 0)
        assert all(search.cv_results_["rank_test_score"] >= 1)
    assert all(search.cv_results_["partial_fit_calls"] >= 1)
    assert len(np.unique(search.cv_results_["model_id"])) == len(
        search.cv_results_["model_id"]
    )
    assert sorted(search.model_history_.keys()) == list(range(20))

    assert set(search.model_history_[0][0].keys()) == {
        "model_id",
        "params",
        "partial_fit_calls",
        "partial_fit_time",
        "score",
        "score_time",
        "elapsed_wall_time",
    }

    # Dask objects are lazy
    (X_,) = yield c.compute([X])

    proba = search.predict_proba(X)
    log_proba = search.predict_log_proba(X)
    assert proba.shape == (1000, 2)
    assert log_proba.shape == (1000, 2)

    assert isinstance(proba, da.Array)
    assert isinstance(log_proba, da.Array)

    proba_ = search.predict_proba(X_)
    log_proba_ = search.predict_log_proba(X_)

    da.utils.assert_eq(proba, proba_)
    da.utils.assert_eq(log_proba, log_proba_)

    decision = search.decision_function(X_)
    assert decision.shape == (1000,)


def run():
    client = Client()

    from dask_ml.datasets import make_classification

    df = dd.read_csv(
        "isHealth.csv", assume_missing=True, sample=640000000, blocksize="10MB"
    )
    df = df.fillna(0).fillna(0)
    for column in df.columns:
        if "." in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)

    y = df["acquired"]
    X = df.drop("acquired", axis=1)

    from dask_ml.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    # X_train, X_train2, y_train, y_train2 = train_test_split(X_train, y_train)

    x_test_tickers = X_test["ticker"].values.compute()
    x_test_dates = X_test["date"].values.compute()
    print(x_test_tickers[0])
    np.savetxt(
        "x_test_tickers.csv", [x_test_tickers, x_test_dates], delimiter=",", fmt="%s"
    )
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt="%s")
    print("GOOD")

    for column in X_train.columns:
        if "ticker" in column or "date" in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)

    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")

    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver="adam", activation="relu", random_state=0)
    inc = Incremental(est, scoring="neg_log_loss")
    print("WORKING")
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print("Score:", inc.score(X_test, y_test))

    # model = MLPClassifier(solver='sgd', hidden_layer_sizes=(10, 2), random_state=1)
    params = {"alpha": np.logspace(-2, 1, num=1000)}

    from dask_ml.model_selection import IncrementalSearchCV

    search = IncrementalSearchCV(
        est, params, n_initial_parameters=100, patience=20, max_iter=100
    )
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    print("SCORE")
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print("Score:", inc.score(X_test, y_test))


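# The excerpt above never shows how ``run()`` is invoked. A typical entry
# point, assuming the script is executed directly, is below. The guard also
# matters in practice because ``Client()`` starts a local cluster with worker
# processes, which requires an importable ``__main__`` on platforms that spawn
# subprocesses.
if __name__ == "__main__":
    run()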