def objective(trial):
    """Optuna objective: tune a dask-ml ``LogisticRegression`` on iris.

    Parameters
    ----------
    trial : optuna.Trial
        The trial object used to sample hyper-parameters.

    Returns
    -------
    float
        Hold-out accuracy for the sampled solver / C / penalty combination.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    # Split each array into 5 chunks so dask can work on partitions in parallel.
    X, y = da.from_array(X, chunks=len(X) // 5), da.from_array(y, chunks=len(y) // 5)
    solver = trial.suggest_categorical(
        'solver', ['admm', 'gradient_descent', 'proximal_grad'])
    # suggest_uniform is deprecated in Optuna; suggest_float(low, high) is the
    # drop-in replacement for a uniform float distribution.
    C = trial.suggest_float('C', 0.0, 1.0)
    if solver == 'admm' or solver == 'proximal_grad':
        penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elastic_net'])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = 'l2'
    classifier = LogisticRegression(max_iter=200, solver=solver, C=C,
                                    penalty=penalty)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    return score
def objective(trial):
    """Return validation accuracy of a dask-ml LogisticRegression on iris."""
    iris = load_iris()
    features, labels = iris.data, iris.target
    # Chunk both arrays five ways so dask operates on partitions.
    features = da.from_array(features, chunks=len(features) // 5)
    labels = da.from_array(labels, chunks=len(labels) // 5)
    solver = trial.suggest_categorical(
        "solver", ["admm", "gradient_descent", "proximal_grad"])
    C = trial.suggest_float("C", 0.0, 1.0)
    if solver in ("admm", "proximal_grad"):
        penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elastic_net"])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = "l2"
    classifier = LogisticRegression(max_iter=200, solver=solver, C=C,
                                    penalty=penalty)
    X_train, X_valid, y_train, y_valid = train_test_split(features, labels)
    classifier.fit(X_train, y_train)
    return classifier.score(X_valid, y_valid)
def test_big(fit_intercept):
    """Smoke-test fit/predict/predict_proba on a chunked classification set."""
    X, y = make_classification(chunks=50)
    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(X, y)
    model.predict(X)
    model.predict_proba(X)
    if fit_intercept:
        # An intercept must have been learned when one was requested.
        assert model.intercept_ is not None
def test_fit_solver(solver):
    """Check that LogisticRegression.fit works for each supported solver."""
    import dask_glm
    import packaging.version

    # distutils.version.LooseVersion is deprecated (distutils was removed in
    # Python 3.12) and does not implement PEP 440 ordering; compare mixed
    # LooseVersion/str only worked by accident.  packaging.version.parse is
    # the supported replacement and matches the sibling test in this file.
    if packaging.version.parse(
            dask_glm.__version__) <= packaging.version.parse("0.2.0"):
        pytest.skip("FutureWarning for dask config.")
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
def test_fit_solver(solver):
    """LogisticRegression should fit cleanly with each supported solver."""
    import dask_glm
    import packaging.version

    glm_version = packaging.version.parse(dask_glm.__version__)
    if glm_version <= packaging.version.parse("0.2.0"):
        pytest.skip("FutureWarning for dask config.")
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    model = LogisticRegression(solver=solver)
    model.fit(X, y)
def simple_example():
    """Fit a LogisticRegression on synthetic dask data and print results."""
    X, y = make_classification(n_samples=10000, n_features=2, chunks=50)
    X = dd.from_dask_array(X, columns=["a", "b"])
    y = dd.from_array(y)
    model = LogisticRegression()
    # The estimator needs dask arrays, hence .values on the dataframes.
    model.fit(X.values, y.values)
    print('Predictions =', model.predict(X.values).compute())
    print('Probabilities =', model.predict_proba(X.values).compute())
    print('Scores =', model.score(X.values, y.values).compute())
def train(X_train, y_train, out_model):
    """Fit a logistic regression, pickle it to ``out_model``, and print
    the training-set confusion matrix.

    Parameters
    ----------
    X_train, y_train :
        Training features / labels; presumably dask dataframes, since the
        code converts them with ``.values`` before fitting — TODO confirm.
    out_model : str or path-like
        Destination file for the pickled model.
    """
    lr = LogisticRegression(penalty='l2', solver='lbfgs', n_jobs=64, max_iter=10)
    # If leave just the dataframe, will throw an error saying "This estimator
    # does not support dask dataframes."
    lr.fit(X_train.values, y_train.values)
    # Saving model for later prediction.  Use a context manager so the file
    # handle is deterministically closed (the original left it dangling).
    with open(out_model, "wb") as model_file:
        pickle.dump(lr, model_file)
    # Outputing some statistics
    y_train_pred = lr.predict(X_train.values)
    TN, FP, FN, TP = confusion_matrix_dask(y_train.values, y_train_pred)
    print("Read like \n[[TN, FP], \n[FN, TP]]\n", np.array([[TN, FP], [FN, TP]]))
def test_dataframe_warns_about_chunks(fit_intercept):
    """Fitting on dask dataframes must raise; array forms must succeed."""
    rng = np.random.RandomState(42)
    n_rows, n_cols = 20, 5
    partition_kwargs = dict(npartitions=4)
    X = dd.from_pandas(pd.DataFrame(rng.uniform(size=(n_rows, n_cols))),
                       **partition_kwargs)
    y = dd.from_pandas(pd.Series(rng.choice(2, size=n_rows)), **partition_kwargs)
    clf = LogisticRegression(fit_intercept=fit_intercept)
    msg = "does not support dask dataframes.*might be resolved with"
    with pytest.raises(TypeError, match=msg):
        clf.fit(X, y)
    # Every array-like view of the same data should fit without error.
    clf.fit(X.values, y.values)
    clf.fit(X.to_dask_array(), y.to_dask_array())
    clf.fit(X.to_dask_array(lengths=True), y.to_dask_array(lengths=True))
def test_fit(fit_intercept, solver):
    """fit/predict/predict_proba run without error for each configuration."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(X, y)
    model.predict(X)
    model.predict_proba(X)
def test_logistic_predict_proba_shape():
    """predict_proba returns one column per class: (n_samples, 2)."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    model = LogisticRegression()
    model.fit(X, y)
    probabilities = model.predict_proba(X)
    assert probabilities.shape == (100, 2)
import dask.dataframe as dd
import dask.datasets as ds
import time

from dask_ml.linear_model import LogisticRegression
from dask_glm.datasets import make_classification

# --- Time a plain dask-ml fit/predict cycle --------------------------------
X, y = make_classification(n_samples=1000)

t = time.time()
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)
lr.predict_proba(X)
# est.score(X, y)
print('\nTime dask_ml: ' + str(time.time() - t))

# --- Parallelize Scikit-Learn Directly -------------------------------------
from dask.distributed import Client
# sklearn.externals.joblib was removed in scikit-learn 0.23; import
# parallel_backend from joblib itself (a hard dependency of scikit-learn).
from joblib import parallel_backend

client = Client('localhost:8786')  # Connect to a Dask Cluster
print(client)

with parallel_backend('dask', scatter=[X, y]):
    # Your normal scikit-learn code here
    t = time.time()
    lr = LogisticRegression()
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    # est.score(X, y)
    print('\nTime dask_ml distributed: ' + str(time.time() - t))
def test_fit_solver(solver):
    """A LogisticRegression parametrized by solver fits without error."""
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    model = LogisticRegression(solver=solver)
    model.fit(X, y)
# Test-train split from dask_ml.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(to_dask_array(X), to_dask_array(y), random_state=99) ################################################################################### # Fitting the Logistic Regression Classifier from dask_ml.linear_model import LogisticRegression lr = LogisticRegression() with ProgressBar(): lr.fit(X_train, y_train) print('Logistic Regression Score : ', lr.score(X_test, y_test).compute()) ##### OUTPUT --------> Logistic Regression Score : 0.70025 ##################################################################################### # Fitting the Naive Bayes Classifier from sklearn.naive_bayes import BernoulliNB from dask_ml.wrappers import Incremental nb = BernoulliNB() parallel_nb = Incremental(nb) with ProgressBar():