def test_incremental_basic(scheduler, xy_classification):
    """Incremental.fit over chunks must match manual chunk-wise partial_fit."""
    # NOTE(review): this name is defined more than once in this file; only the
    # last definition is collected by pytest -- consider renaming. TODO confirm.
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        base = SGDClassifier(random_state=0, tol=1e-3)
        reference = clone(base)
        clf = Incremental(base)

        fitted = clf.fit(X, y, classes=[0, 1])
        # Replay the same fit, chunk by chunk, on a plain sklearn clone.
        for blk in da.core.slices_from_chunks(X.chunks):
            reference.partial_fit(X[blk], y[blk[0]], classes=[0, 1])

        assert fitted is clf
        assert isinstance(fitted.estimator.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(fitted.estimator.coef_, reference.coef_)
        assert_estimator_equal(clf.estimator, reference, exclude=['loss_function_'])

        # Predict
        predicted = clf.predict(X)
        expected = reference.predict(X)
        assert isinstance(predicted, da.Array)
        assert_eq(predicted, expected)

        # score
        got = clf.score(X, y)
        expected = reference.score(X, y)
        # assert isinstance(got, da.Array)
        assert_eq(got, expected)

        # partial_fit alone should reach the same fitted state as fit().
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        clf.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(clf.estimator, reference, exclude=['loss_function_'])
def run3():
    """Train an incremental MLP on the isHealthTrain/isHealthTest CSV pair.

    Reads both CSVs with dask, zero-fills NaNs, drops the dotted duplicate
    columns and the ticker/date identifier columns, fits an
    ``Incremental``-wrapped ``MLPClassifier`` with repeated ``partial_fit``
    passes, and writes identifiers, test labels, and predicted probabilities
    to CSV files as side effects.
    """
    client = Client()  # NOTE(review): never closed -- consider `with Client():`

    def _load(path):
        """Read one CSV, zero-fill NaNs, drop pandas' dotted duplicate columns."""
        frame = dd.read_csv(path, assume_missing=True, sample=640000000,
                            blocksize="10MB")
        # Original chained .fillna(0).fillna(0); the second call was a no-op.
        frame = frame.fillna(0)
        # Drop all dotted (duplicated-header) columns in one call instead of
        # one .drop() per column.
        return frame.drop(columns=[c for c in frame.columns if '.' in c])

    train = _load("isHealthTrain.csv")
    y_train = train['acquired']
    X_train = train.drop('acquired', axis=1)

    test = _load("isHealthTest.csv")
    y_test = test['acquired']
    X_test = test.drop('acquired', axis=1)

    # Persist the identifier columns before they are removed from the features.
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", x_test_tickers, delimiter=",", fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")

    # Identifier columns are features in neither set; drop them from both.
    id_columns = [c for c in X_train.columns
                  if 'ticker' in c or 'date' in c]
    X_train = X_train.drop(columns=id_columns)
    X_test = X_test.drop(columns=id_columns)

    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")

    from dask_ml.wrappers import Incremental
    from sklearn.neural_network import MLPClassifier

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    print(est)
    inc = Incremental(est, scoring='f1')
    print("WORKING")
    for _ in range(10):  # each partial_fit pass is one incremental epoch
        inc.partial_fit(X_train, y_train, classes=[0, 1])
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
def test_incremental_basic(scheduler, dataframes):
    """Incremental should roughly match a manually chunk-fitted sklearn clone."""
    # NOTE(review): this name is defined more than once in this file; only the
    # last definition is collected by pytest -- consider renaming. TODO confirm.
    # Synthetic, linearly separable observations a linear model can recover.
    n_samples, n_features = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n_samples, n_features), chunks=30)
    true_coef = rng.uniform(size=n_features, chunks=n_features)
    y = (da.sign(X.dot(true_coef)) + 1) / 2
    if dataframes:
        X = dd.from_array(X)
        y = dd.from_array(y)

    with scheduler() as (s, [_, _]):
        base = SGDClassifier(random_state=0, tol=1e-3, average=True)
        reference = clone(base)
        clf = Incremental(base, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        assert result is clf

        # reference is a plain sklearn estimator; replay the chunk-wise fit.
        if dataframes:
            X = X.to_dask_array(lengths=True)
            y = y.to_dask_array(lengths=True)
        for blk in da.core.slices_from_chunks(X.chunks):
            reference.partial_fit(X[blk].compute(), y[blk[0]].compute(),
                                  classes=[0, 1])

        assert isinstance(result.estimator_.coef_, np.ndarray)
        rel_error = (np.linalg.norm(clf.coef_ - reference.coef_)
                     / np.linalg.norm(clf.coef_))
        assert rel_error < 0.9
        assert set(dir(clf.estimator_)) == set(dir(reference))

        # Predict
        predicted = clf.predict(X)
        expected = reference.predict(X)
        assert isinstance(predicted, da.Array)
        if dataframes:
            # Chunk sizes of this array are unknown; materialise it first.
            predicted = predicted.compute()
        rel_error = (np.linalg.norm(predicted - expected)
                     / np.linalg.norm(expected))
        assert rel_error < 0.3

        # score
        got = clf.score(X, y)
        expected = reference.score(*dask.compute(X, y))
        assert abs(got - expected) < 0.1

        # partial_fit alone should expose the same fitted attributes.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3, average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(reference))
def test_fit_ndarrays():
    """partial_fit on plain ndarrays keeps the wrapped estimator in sync."""
    features = np.ones((10, 5))
    labels = np.repeat([0.0, 1.0], 5)

    sgd = SGDClassifier(tol=1e-3)
    wrapper = Incremental(sgd)
    wrapper.partial_fit(features, labels, classes=[0, 1])
    sgd.fit(features, labels)

    # The wrapper holds the very estimator it was given, and its coef_
    # proxies the fitted estimator's coef_.
    assert wrapper.estimator is sgd
    assert_eq(wrapper.coef_, wrapper.estimator_.coef_)
def test_score_ndarrays():
    """score accepts both numpy and dask inputs and honours the scorer."""
    features = np.ones((10, 5))
    labels = np.ones(10)

    model = Incremental(SGDClassifier(tol=1e-3), scoring="accuracy")
    model.partial_fit(features, labels, classes=[0, 1])
    model.fit(features, labels, classes=[0, 1])
    # Single-class data: accuracy is trivially perfect.
    assert model.score(features, labels) == 1

    # Same check through chunked dask inputs.
    dX = da.from_array(features, chunks=(2, 5))
    dy = da.from_array(labels, chunks=2)
    assert model.score(dX, dy) == 1
def test_incremental_basic(scheduler):
    """Incremental on dask arrays should roughly match chunk-wise partial_fit."""
    # NOTE(review): this name is defined more than once in this file; only the
    # last definition is collected by pytest -- consider renaming. TODO confirm.
    # Synthetic, linearly separable observations a linear model can recover.
    n_samples, n_features = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n_samples, n_features), chunks=30)
    true_coef = rng.uniform(size=n_features, chunks=n_features)
    y = (da.sign(X.dot(true_coef)) + 1) / 2

    with scheduler() as (s, [_, _]):
        base = SGDClassifier(random_state=0, tol=1e-3, average=True)
        reference = clone(base)
        clf = Incremental(base, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])

        # Replay the same chunk-wise fit on a plain sklearn clone.
        for blk in da.core.slices_from_chunks(X.chunks):
            reference.partial_fit(X[blk], y[blk[0]], classes=[0, 1])

        assert result is clf
        assert isinstance(result.estimator_.coef_, np.ndarray)
        rel_error = (np.linalg.norm(clf.coef_ - reference.coef_)
                     / np.linalg.norm(clf.coef_))
        assert rel_error < 0.9
        assert set(dir(clf.estimator_)) == set(dir(reference))

        # Predict
        predicted = clf.predict(X)
        expected = reference.predict(X)
        assert isinstance(predicted, da.Array)
        rel_error = (np.linalg.norm(predicted - expected)
                     / np.linalg.norm(expected))
        assert rel_error < 0.2

        # score
        got = clf.score(X, y)
        expected = reference.score(X, y)
        assert abs(got - expected) < 0.1

        # partial_fit alone should expose the same fitted attributes.
        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3, average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(reference))
def run():
    """Train and evaluate an incremental MLP on isHealth.csv.

    Loads the CSV with dask, drops dotted duplicate columns and the
    ticker/date identifier columns, splits train/test, fits an
    ``Incremental``-wrapped ``MLPClassifier`` via repeated ``partial_fit``,
    then runs an ``IncrementalSearchCV`` over ``alpha``.  Writes identifiers,
    test labels, and predicted probabilities to CSV files as side effects.
    """
    client = Client()  # NOTE(review): never closed -- consider `with Client():`
    from dask_ml.model_selection import train_test_split, IncrementalSearchCV
    from dask_ml.wrappers import Incremental
    from sklearn.neural_network import MLPClassifier

    df = dd.read_csv("isHealth.csv", assume_missing=True, sample=640000000,
                     blocksize="10MB")
    # Original chained .fillna(0).fillna(0); the second call was a no-op.
    df = df.fillna(0)
    # Drop all dotted (duplicated-header) columns in one call.
    df = df.drop(columns=[c for c in df.columns if '.' in c])

    y = df['acquired']
    X = df.drop('acquired', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)

    # Persist the identifier columns before they are removed from the features.
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    # BUG FIX: the tickers file previously received [tickers, dates] stacked
    # together; write only the tickers (dates get their own file below).
    np.savetxt("x_test_tickers.csv", x_test_tickers, delimiter=",", fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")

    id_columns = [c for c in X_train.columns
                  if 'ticker' in c or 'date' in c]
    X_train = X_train.drop(columns=id_columns)
    X_test = X_test.drop(columns=id_columns)

    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    inc = Incremental(est, scoring='neg_log_loss')
    print("WORKING")
    for _ in range(10):  # each partial_fit pass is one incremental epoch
        inc.partial_fit(X_train, y_train, classes=[0, 1])
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))

    # Hyper-parameter search over the regularisation strength.
    params = {'alpha': np.logspace(-2, 1, num=1000)}
    search = IncrementalSearchCV(est, params, n_initial_parameters=100,
                                 patience=20, max_iter=100)
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    print("SCORE")
    print("FITTED")
    # NOTE(review): these reuse `inc`, not `search` -- presumably the search
    # result was meant to be evaluated here; confirm before changing.
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
# Evaluate an Incremental-wrapped SVC on each precomputed feature file.
features = FEATURES_ARRAY  # full feature list; loop below is narrowed to LBP only
# for feature in features:
for feature in [OUTPUT_FOLDER + 'lbp' + FORMAT]:  # features:
    print("""
----------------------------------
getting feature: {}
""".format(feature))
    X = np.load(feature, allow_pickle=True)
    X = to_arr(X)
    np.save('lbp_arr', X)
    # Single-chunk dask array so the transformer pipeline sees the whole matrix.
    X = da.from_array(X, chunks=X.shape)
    X = transformer_pipe.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    classes = da.unique(y_train).compute()

    # One model for all features.
    # NOTE(review): sklearn's SVC does not implement partial_fit, which
    # Incremental requires -- presumably an incremental estimator such as
    # SGDClassifier was intended; confirm this actually runs.
    inc = Incremental(SVC(random_state=RANDOM_STATE), scoring='accuracy')
    for _ in range(10):  # repeated passes over the training data
        inc.partial_fit(X_train, y_train, classes=classes)

    # FIX: the score was previously computed twice (once for each print);
    # compute it once and reuse the value.
    score = inc.score(X_test, y_test)
    print('Score:', score)
    print(score)
    np.save('lbp_svm', score)
    time_it()