def test_set_params():
    """set_params routes ``estimator__``-prefixed keys to the wrapped model."""
    clf = Incremental(SGDClassifier())
    clf.set_params(scoring='accuracy', estimator__max_iter=20)
    params = clf.get_params()
    assert params['scoring'] == 'accuracy'
    assert params['estimator__max_iter'] == 20
def test_set_params():
    """Both wrapper-level and nested estimator params are settable at once."""
    wrapper = Incremental(SGDClassifier())
    updates = {"scoring": "accuracy", "estimator__max_iter": 20}
    wrapper.set_params(**updates)
    fetched = wrapper.get_params()
    for key, value in updates.items():
        assert fetched[key] == value
def test_scoring(scheduler, xy_classification, scoring=dask_ml.metrics.accuracy_score):
    """Passing a plain metric function (not a scorer) must raise on fit."""
    features, labels = xy_classification
    with scheduler() as (s, [a, b]):
        wrapped = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        with pytest.raises(ValueError, match='metric function rather than a scorer'):
            wrapped.fit(features, labels, classes=np.unique(labels))
def test_fit_rechunking():
    """fit still works when the feature matrix is chunked along columns."""
    n_classes = 2
    X, y = make_classification(chunks=20, n_classes=n_classes)
    X = X.rechunk({1: 10})
    # Sanity check: there really is more than one block along axis 1.
    assert X.numblocks[1] > 1
    model = Incremental(SGDClassifier(max_iter=5, tol=1e-3))
    model.fit(X, y, classes=list(range(n_classes)))
def test_incremental_basic(scheduler, xy_classification):
    """Incremental.fit should match manual chunk-by-chunk partial_fit.

    Fits a wrapped SGDClassifier, replays the same chunks with plain
    ``partial_fit`` on a clone, and compares coefficients, predictions,
    and scores between the two.
    """
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        est1 = SGDClassifier(random_state=0, tol=1e-3)
        est2 = clone(est1)  # reference estimator, fitted manually below
        clf = Incremental(est1)
        result = clf.fit(X, y, classes=[0, 1])
        # Replay the exact chunk order with scikit-learn's partial_fit.
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])
        assert result is clf  # fit returns self
        assert isinstance(result.estimator.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(result.estimator.coef_, est2.coef_)
        # loss_function_ objects don't compare equal, hence the exclusion.
        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])
        # Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        assert_eq(result, expected)
        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        # assert isinstance(result, da.Array)
        assert_eq(result, expected)
        # partial_fit alone also yields an estimator equal to the reference.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        clf.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])
def test_fit_ndarrays():
    """partial_fit on plain NumPy inputs mirrors a direct SGDClassifier fit."""
    X = np.ones((10, 5))
    y = np.concatenate([np.zeros(5), np.ones(5)])
    base = SGDClassifier(tol=1e-3)
    wrapper = Incremental(base)
    wrapper.partial_fit(X, y, classes=[0, 1])
    base.fit(X, y)
    # The wrapper keeps a handle on the very estimator it was given.
    assert wrapper.estimator is base
    assert_eq(wrapper.coef_, wrapper.estimator_.coef_)
def test_estimator_param_raises():
    """Wrapping an estimator that itself has an ``estimator`` param must fail."""

    class Dummy(sklearn.base.BaseEstimator):
        def __init__(self, estimator=42):
            self.estimator = estimator

        def fit(self, X):
            return self

    wrapped = Incremental(Dummy(estimator=1))
    # The name collision is only detected when params are enumerated.
    with pytest.raises(ValueError, match='used by both'):
        wrapped.get_params()
def test_incremental_basic(scheduler, dataframes):
    """End-to-end check of Incremental on dask arrays or dask dataframes.

    Fits a wrapped SGDClassifier on linearly-separable data and loosely
    compares against a chunk-by-chunk scikit-learn ``partial_fit``.
    """
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2  # map {-1, 1} sign labels to {0, 1}
    if dataframes:
        X = dd.from_array(X)
        y = dd.from_array(y)
    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)  # reference estimator, fitted manually below
        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        assert result is clf  # fit returns self
        # est2 is a sklearn optimizer; this is just a benchmark
        if dataframes:
            # Convert back so chunk slicing below works on arrays.
            X = X.to_dask_array(lengths=True)
            y = y.to_dask_array(lengths=True)
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_].compute(), y[slice_[0]].compute(), classes=[0, 1])
        assert isinstance(result.estimator_.coef_, np.ndarray)
        # Loose relative-error bound: the two fitting paths see chunks differently.
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9
        assert set(dir(clf.estimator_)) == set(dir(est2))
        # Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        if dataframes:
            # Compute is needed because chunk sizes of this array are unknown
            result = result.compute()
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.3
        # score
        result = clf.score(X, y)
        expected = est2.score(*dask.compute(X, y))
        assert abs(result - expected) < 0.1
        # partial_fit alone yields a fitted estimator with the same API surface.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3, average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
def test_score(xy_classification):
    """Incremental.score with scoring="accuracy" agrees with the inner estimator."""
    distributed = pytest.importorskip("distributed")
    client = distributed.Client(n_workers=2)
    X, y = xy_classification
    inc = Incremental(SGDClassifier(max_iter=1000, random_state=0), scoring="accuracy")
    with client:
        inc.fit(X, y, classes=[0, 1])
        got = inc.score(X, y)
        want = inc.estimator_.score(X, y)
        assert got == want
def test_in_gridsearch(scheduler, xy_classification):
    """Incremental can be tuned inside GridSearchCV.

    Fix: the wrapped estimator's parameters must be addressed with the
    ``estimator__`` prefix — ``Incremental`` itself has no ``alpha``
    parameter, so a bare ``{'alpha': [...]}`` grid would make
    ``GridSearchCV.set_params`` raise. The sibling version of this test
    already uses the prefixed form.
    """
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        param_grid = {'estimator__alpha': [0.1, 10]}
        gs = sklearn.model_selection.GridSearchCV(clf, param_grid, iid=False)
        gs.fit(X, y, classes=[0, 1])
def test_same_models_with_random_state(c, s, a, b):
    """Two IncrementalSearchCV runs with the same random_state must coincide."""
    X, y = make_classification(n_samples=100, n_features=2, chunks=(10, 5), random_state=0)
    model = Incremental(
        SGDClassifier(tol=-np.inf, penalty="elasticnet", random_state=42, eta0=0.1))
    params = {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }
    # Address the wrapped SGDClassifier's params through the Incremental wrapper.
    params = {"estimator__" + k: v for k, v in params.items()}
    search1 = IncrementalSearchCV(clone(model), params, n_initial_parameters=10, random_state=0)
    search2 = IncrementalSearchCV(clone(model), params, n_initial_parameters=10, random_state=0)
    # gen_cluster-style coroutine test: each fit is awaited on the scheduler.
    yield search1.fit(X, y, classes=[0, 1])
    yield search2.fit(X, y, classes=[0, 1])
    # Identical seeds must give identical search outcomes.
    assert search1.best_score_ == search2.best_score_
    assert search1.best_params_ == search2.best_params_
    assert np.allclose(search1.best_estimator_.coef_, search2.best_estimator_.coef_)
def test_incremental_text_pipeline(container):
    """HashingVectorizer -> Incremental(SGD) pipeline on a dask Series or Bag."""
    X = pd.Series(["a list", "of words", "for classification"] * 100)
    X = dd.from_pandas(X, npartitions=3)
    if container == "bag":
        X = X.to_bag()
    y = da.from_array(np.array([0, 0, 1] * 100), chunks=(100,) * 3)
    # Text partitions must line up one-to-one with the label chunks.
    assert tuple(X.map_partitions(len).compute()) == y.chunks[0]
    sgd = SGDClassifier(max_iter=5, tol=1e-3)
    clf = Incremental(sgd, scoring="accuracy", assume_equal_chunks=True)
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
    pipe = make_pipeline(vect, clf)
    # "incremental" is the auto-generated pipeline step name for clf.
    pipe.fit(X, y, incremental__classes=[0, 1])
    X2 = pipe.steps[0][1].transform(X)
    assert hasattr(clf, "coef_")
    # Hashing output has unknown chunk sizes until computed.
    X2.compute_chunk_sizes()
    assert X2.shape == (300, vect.n_features)
    preds = pipe.predict(X).compute()
    assert len(y) == len(preds)
def run_on_blobs():
    """Fit an Incremental-wrapped StreamingRFC on a large synthetic blob set.

    Fixes: ``dd.dataframe.from_array`` — ``dd`` is ``dask.dataframe``
    elsewhere in this codebase, so the conversion helper is
    ``dd.from_array``; sample counts are passed as ints since float sizes
    like ``1e8`` are rejected by array constructors; removed the unused
    ``chunks`` local.
    """
    x, y = dask_ml.datasets.make_blobs(n_samples=int(1e8), chunks=int(1e5),
                                       random_state=0, centers=3)
    x = dd.from_array(x)
    y = dd.from_array(y)
    print(f"Rows: {x.shape[0].compute()}")
    ests_per_chunk = 4
    # Unbounded total estimators; grow the forest a few trees per chunk.
    srfc = Incremental(StreamingRFC(n_estimators_per_chunk=ests_per_chunk,
                                    max_n_estimators=np.inf,
                                    verbose=1,
                                    n_jobs=4))
    srfc.fit(x, y, classes=y.unique().compute())
def test_in_gridsearch(scheduler, xy_classification):
    """GridSearchCV over Incremental, tolerant of the sklearn version in use."""
    X, y = xy_classification
    clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
    param_grid = {"estimator__alpha": [0.1, 10]}
    # ``iid`` was removed from newer scikit-learn releases.
    kwargs = {} if SK_022 else {"iid": False}
    gs = sklearn.model_selection.GridSearchCV(clf, param_grid, cv=3, **kwargs)
    with scheduler() as (s, [a, b]):
        gs.fit(X, y, classes=[0, 1])
def test_scoring_string(scheduler, xy_classification, scoring):
    """String scoring values resolve to a callable scorer and work end-to-end."""
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        model = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        resolved = check_scoring(model, scoring=scoring)
        assert callable(resolved)
        model.fit(X, y, classes=np.unique(y))
        model.score(X, y)
def test_incremental_basic(scheduler):
    """Incremental on dask arrays roughly matches manual chunkwise partial_fit."""
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2  # map {-1, 1} sign labels to {0, 1}
    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)  # reference estimator, fitted manually below
        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])
        assert result is clf  # fit returns self
        assert isinstance(result.estimator_.coef_, np.ndarray)
        # Loose bound: the two fitting paths don't see chunks identically.
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9
        assert set(dir(clf.estimator_)) == set(dir(est2))
        # Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.2
        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        assert abs(result - expected) < 0.1
        # partial_fit alone also yields a fully fitted estimator.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3, average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
def test_score_ndarrays():
    """Scoring works on both NumPy and dask inputs after fitting."""
    X = np.ones((10, 5))
    y = np.ones(10)
    model = Incremental(SGDClassifier(tol=1e-3), scoring="accuracy")
    model.partial_fit(X, y, classes=[0, 1])
    model.fit(X, y, classes=[0, 1])
    assert model.score(X, y) == 1
    # The same data wrapped as dask collections must score identically.
    dX, dy = da.from_array(X, chunks=(2, 5)), da.from_array(y, chunks=2)
    assert model.score(dX, dy) == 1
def setUpClass(cls):
    """Set up model to test."""
    # _prep_data populates the class-level fixture data (x, y, etc.).
    cls = cls._prep_data(cls)
    # Wrap a streaming random forest that adds 20 trees per dask chunk,
    # with no cap on the total forest size.
    cls.mod = Incremental(
        StreamingRFC(n_estimators_per_chunk=20,
                     n_jobs=-1,
                     max_n_estimators=np.inf,
                     verbose=1))
    # Set expected number of estimators
    cls.expected_n_estimators = 200
    # Set helper values
    super().setUpClass()
def setUpClass(cls):
    """Set up model to test."""
    # _prep_data populates the class-level fixture data (x, y, etc.).
    cls = cls._prep_data(cls)
    # One tree per chunk, capped at 39 estimators total.
    cls.mod = Incremental(
        StreamingRFC(n_estimators_per_chunk=1,
                     max_n_estimators=39,
                     verbose=1))
    # Set expected number of estimators
    # This should be set manually depending on data.
    cls.expected_n_estimators = 10
    # Set helper values
    super().setUpClass()
def test_scoring_string(scheduler, xy_classification, scoring):
    """String scoring names must resolve to the registered dask-ml scorer.

    Fix: the original line ``make_scorer(clf.scoring) == dask_ml.metrics.
    scorer.SCORERS[scoring]`` was a bare comparison whose result was
    discarded, so the check never ran. It now asserts the resolution the
    way the sibling version of this test does, via ``check_scoring``.
    """
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(), scoring=scoring)
        if scoring:
            assert dask_ml.metrics.scorer.SCORERS[scoring] == check_scoring(
                clf, scoring=scoring)
        clf.fit(X, y, classes=np.unique(y))
        clf.score(X, y)
        # The bare inner estimator can also score the same data.
        clf.estimator.score(X, y)
def run3():
    """Train an Incremental-wrapped MLP on pre-split train/test CSVs.

    Reads isHealthTrain.csv / isHealthTest.csv, drops duplicate-named and
    identifier columns, fits via repeated partial_fit, then writes
    predictions and prints a score.
    NOTE(review): assumes an 'acquired' label column and 'ticker'/'date'
    identifier columns exist — confirm against the data files.
    """
    client = Client()  # start a local dask distributed client
    from dask_ml.datasets import make_classification
    df = dd.read_csv("isHealthTrain.csv", assume_missing=True, sample=640000000, blocksize="10MB")
    df = df.fillna(0).fillna(0)  # NOTE(review): second fillna(0) is redundant
    # Drop pandas-mangled duplicate columns ('name.1', ...).
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y_train = df['acquired']
    X_train = df.drop('acquired', axis=1)
    df2 = dd.read_csv("isHealthTest.csv", assume_missing=True, sample=640000000, blocksize="10MB")
    df2 = df2.fillna(0).fillna(0)  # NOTE(review): second fillna(0) is redundant
    for column in df2.columns:
        if '.' in column:
            df2 = df2.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y_test = df2['acquired']
    X_test = df2.drop('acquired', axis=1)
    # X_train,X_train2,y_train,y_train2 = train_test_split(X_train,y_train)
    # Save the identifier columns for later joins before dropping them.
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", x_test_tickers, delimiter=",", fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    # Train stays lazy (dask array); test is materialized for predict/score.
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit
    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    print(est)
    inc = Incremental(est, scoring='f1')
    print("WORKING")
    # Multiple passes over the training data via partial_fit.
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
def setUpClass(cls):
    """Set up model to test."""
    # _prep_data populates fixture data; reg=True selects the regression set.
    cls = cls._prep_data(cls, reg=True)
    # One regressor tree per chunk, unbounded total, using every feature.
    cls.mod = Incremental(
        StreamingRFR(n_estimators_per_chunk=1,
                     n_jobs=-1,
                     max_n_estimators=np.inf,
                     max_features=cls.x.shape[1],
                     verbose=1))
    # Set expected number of estimators
    cls.expected_n_estimators = 10
    # Set helper values
    super().setUpClass()
def test_scoring_string(scheduler, xy_classification, scoring):
    """A scoring string maps to the registered dask-ml scorer."""
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        model = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        resolved = check_scoring(model, scoring=scoring)
        if scoring:
            # The string must resolve to the scorer registered under it.
            assert dask_ml.metrics.scorer.SCORERS[scoring] == resolved
        assert callable(resolved)
        model.fit(X, y, classes=np.unique(y))
        model.score(X, y)
def test_replace_scoring(estimator, fit_kwargs, scoring, xy_classification, mocker):
    """score() must route through dask-ml's get_scorer exactly once."""
    X, y = xy_classification
    inc = Incremental(estimator(max_iter=1000, random_state=0, tol=1e-3))
    inc.fit(X, y, **fit_kwargs)
    patched = mocker.patch.object(dask_ml.wrappers, "get_scorer")
    with patched:
        inc.score(X, y)
    assert patched.call_count == 1
    # The dask-aware scorer is requested with eager computation enabled.
    patched.assert_called_with(scoring, compute=True)
def run():
    """Train an Incremental MLP on isHealth.csv, then run a hyperparameter search.

    Loads and cleans the CSV, splits train/test, fits the wrapped MLP via
    repeated partial_fit, saves predictions, then runs IncrementalSearchCV
    over alpha on the bare estimator.
    """
    client = Client()  # start a local dask distributed client
    from dask_ml.datasets import make_classification
    df = dd.read_csv("isHealth.csv", assume_missing=True, sample=640000000, blocksize="10MB")
    df = df.fillna(0).fillna(0)  # NOTE(review): second fillna(0) is redundant
    # Drop pandas-mangled duplicate columns ('name.1', ...).
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y = df['acquired']
    X = df.drop('acquired', axis=1)
    from dask_ml.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    # X_train,X_train2,y_train,y_train2 = train_test_split(X_train,y_train)
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    # NOTE(review): this writes BOTH tickers and dates into x_test_tickers.csv,
    # while dates are also saved separately on the next line — the sibling
    # script saves only tickers here; confirm which is intended.
    np.savetxt("x_test_tickers.csv", [x_test_tickers, x_test_dates], delimiter=",", fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    # Train stays lazy (dask array); test is materialized for predict/score.
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit
    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    inc = Incremental(est, scoring='neg_log_loss')
    print("WORKING")
    # Multiple passes over the training data via partial_fit.
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
    # model = MLPClassifier(solver='sgd', hidden_layer_sizes=(10,2),random_state=1)
    params = {'alpha': np.logspace(-2, 1, num=1000)}
    from dask_ml.model_selection import IncrementalSearchCV
    search = IncrementalSearchCV(est, params, n_initial_parameters=100, patience=20, max_iter=100)
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    print("SCORE")
    print("FITTED")
    # NOTE(review): the lines below reuse `inc`, not `search` — they overwrite
    # predictions.csv with the same pre-search model output; confirm intent.
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
# Fit the (already configured) logistic regression, showing dask progress.
with ProgressBar():
    lr.fit(X_train, y_train)
print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())
##### OUTPUT --------> Logistic Regression Score :  0.70025
#####################################################################################
# Fitting the Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental
nb = BernoulliNB()
# Wrap the scikit-learn estimator so it can partial_fit over dask chunks.
parallel_nb = Incremental(nb)
with ProgressBar():
    # classes must be materialized: np.unique needs the concrete labels.
    parallel_nb.fit(X_train, y_train, classes=np.unique(y_train.compute()))
print('\n\nNaive Bayes Classifier Score : ', parallel_nb.score(X_test, y_test))
##### OUTPUT --------> Naive Bayes Classifier Score : 0.65
######################################################################################
# Performing GridSearch on the Logistic Regression Classifier
from dask_ml.model_selection import GridSearchCV
parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}
lr = LogisticRegression()
def test_get_params():
    """get_params exposes nested estimator params and wrapper defaults."""
    model = Incremental(SGDClassifier())
    params = model.get_params()
    # Wrapped-estimator params appear under the estimator__ prefix.
    assert "estimator__max_iter" in params
    assert params["scoring"] is None
features = FEATURES_ARRAY # [OUTPUT_FOLDER + 'lbp' + FORMAT]: # # for feature in features: for feature in [OUTPUT_FOLDER + 'lbp' + FORMAT]: # features: print(""" ---------------------------------- getting feature: {} """.format(feature)) X = np.load(feature, allow_pickle=True) X = to_arr(X) np.save('lbp_arr', X) X = da.from_array(X, chunks=X.shape) X = transformer_pipe.fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) classes = da.unique(y_train).compute() "One model for all" inc = Incremental(SVC(random_state=RANDOM_STATE), scoring='accuracy') for _ in range(10): inc.partial_fit(X_train, y_train, classes=classes) print('Score:', inc.score(X_test, y_test)) score = inc.score(X_test, y_test) print(score) np.save('lbp_svm', score) time_it()
def _test_basic(c, s, a, b):
    """Core HyperbandSearchCV integration check (gen_cluster coroutine).

    NOTE(review): relies on ``array_type``, ``library``, ``max_iter``,
    ``ConstantFunction``, etc. from an enclosing scope or parametrization
    not visible here — confirm before reuse.
    """
    rng = da.random.RandomState(42)
    n, d = (50, 2)
    # create observations we know linear models can fit
    X = rng.normal(size=(n, d), chunks=n // 2)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    if array_type == "numpy":
        # Materialize on the client when testing the NumPy code path.
        X, y = yield c.compute((X, y))
    params = {
        "loss": ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
        "average": [True, False],
        "learning_rate": ["constant", "invscaling", "optimal"],
        "eta0": np.logspace(-2, 0, num=1000),
    }
    model = SGDClassifier(tol=-np.inf, penalty="elasticnet", random_state=42, eta0=0.1)
    if library == "dask-ml":
        model = Incremental(model)
        # Nested params are addressed through the wrapper's prefix.
        params = {"estimator__" + k: v for k, v in params.items()}
    elif library == "ConstantFunction":
        model = ConstantFunction()
        params = {"value": np.linspace(0, 1, num=1000)}
    search = HyperbandSearchCV(model, params, max_iter=max_iter, random_state=42)
    # NOTE(review): c.compute returns a future here (not yielded) —
    # presumably resolved inside fit; confirm.
    classes = c.compute(da.unique(y))
    yield search.fit(X, y, classes=classes)
    if library == "dask-ml":
        X, y = yield c.compute((X, y))
    score = search.best_estimator_.score(X, y)
    assert score == search.score(X, y)
    assert 0 <= score <= 1
    if library == "ConstantFunction":
        assert score == search.best_score_
    else:
        # These are not equal because IncrementalSearchCV uses a train/test
        # split and we're testing on the entire train dataset, not only the
        # validation/test set.
        assert abs(score - search.best_score_) < 0.1
    assert type(search.best_estimator_) == type(model)
    assert isinstance(search.best_params_, dict)
    num_fit_models = len(set(search.cv_results_["model_id"]))
    num_pf_calls = sum([
        v[-1]["partial_fit_calls"] for v in search.model_history_.values()
    ])
    # Expected model / partial_fit counts for each supported max_iter value.
    models = {9: 17, 15: 17, 20: 17, 27: 49, 30: 49, 81: 143}
    pf_calls = {9: 69, 15: 101, 20: 144, 27: 357, 30: 379, 81: 1581}
    assert num_fit_models == models[max_iter]
    assert num_pf_calls == pf_calls[max_iter]
    best_idx = search.best_index_
    if isinstance(model, ConstantFunction):
        assert search.cv_results_["test_score"][best_idx] == max(
            search.cv_results_["test_score"])
    model_ids = {h["model_id"] for h in search.history_}
    if math.log(max_iter, 3) % 1.0 == 0:
        # log(max_iter, 3) % 1.0 == 0 is the good case when max_iter is a
        # power of search.aggressiveness
        # In this case, assert that more models are tried then the max_iter
        assert len(model_ids) > max_iter
    else:
        # Otherwise, give some padding "almost as many estimators are tried
        # as max_iter". 3 is a fudge number chosen to be the minimum; when
        # max_iter=20, len(model_ids) == 17.
        assert len(model_ids) + 3 >= max_iter
    assert all("bracket" in id_ for id_ in model_ids)
#Create one single learner instance to be used throughout this code block instead of refitting everytime mlpPD = MLPClassifier(hidden_layer_sizes=(5, 5), max_iter=300, activation='relu', solver='adam', learning_rate_init=0.001, beta_1=0.5, alpha=0.01, shuffle=True) #rfPD = BernoulliNB(alpha=0.5, binarize=0.0, fit_prior=True, class_prior=None) #rfPD = RandomForestClassifier(n_estimators=100,random_state=RSEED,max_features='sqrt', # n_jobs=-1, warm_start=True) ##Wrap learner in Incremental. Use this from now on as model. Will help with batching learnermlpPD = Incremental(mlpPD) #learnerrfPD = Incremental(rfPD) #Need to get encoded fit for the first set of data and apply to all other months. #Refitting each time causes errors df = pd.read_csv(filenames[0]) #df['FL_DATE'] = df['FL_DATE'].astype(str) df['OP_UNIQUE_CARRIER'] = df['OP_UNIQUE_CARRIER'].astype(str) df['ORIGIN'] = df['ORIGIN'].astype(str) #df['DEP_TIME'] = df['DEP_TIME'].astype(str) x_PD = df[['FL_DATE', 'DEP_TIME', 'OP_UNIQUE_CARRIER', 'ORIGIN']] #'DATE_INT','DEP_HOUR' y_PD = df['DEP_DELAY_IND'] x_trainPD, x_testPD, y_trainPD, y_testPD = train_test_split(x_PD, y_PD, test_size=0.10,