Example #1
import dask.array as da

from sklearn.datasets import load_iris
from dask_ml.linear_model import LogisticRegression
from dask_ml.model_selection import train_test_split


def objective(trial):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = da.from_array(X,
                         chunks=len(X) // 5), da.from_array(y,
                                                            chunks=len(y) // 5)

    solver = trial.suggest_categorical(
        'solver', ['admm', 'gradient_descent', 'proximal_grad'])
    C = trial.suggest_uniform('C', 0.0, 1.0)

    if solver == 'admm' or solver == 'proximal_grad':
        penalty = trial.suggest_categorical('penalty',
                                            ['l1', 'l2', 'elastic_net'])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = 'l2'

    classifier = LogisticRegression(max_iter=200,
                                    solver=solver,
                                    C=C,
                                    penalty=penalty)

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier.fit(X_train, y_train)

    score = classifier.score(X_test, y_test)
    return score
Example #2

import dask.array as da

from sklearn.datasets import load_iris
from dask_ml.linear_model import LogisticRegression
from dask_ml.model_selection import train_test_split


def objective(trial):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = da.from_array(X,
                         chunks=len(X) // 5), da.from_array(y,
                                                            chunks=len(y) // 5)

    solver = trial.suggest_categorical(
        "solver", ["admm", "gradient_descent", "proximal_grad"])
    C = trial.suggest_float("C", 0.0, 1.0)

    if solver == "admm" or solver == "proximal_grad":
        penalty = trial.suggest_categorical("penalty",
                                            ["l1", "l2", "elastic_net"])
    else:
        # 'penalty' parameter isn't relevant for this solver,
        # so we always specify 'l2' as the dummy value.
        penalty = "l2"

    classifier = LogisticRegression(max_iter=200,
                                    solver=solver,
                                    C=C,
                                    penalty=penalty)

    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    classifier.fit(X_train, y_train)

    score = classifier.score(X_valid, y_valid)
    return score
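Both variants of the objective are consumed the same way; a minimal driver sketch, assuming only Optuna's standard study API (the trial count here is arbitrary):

import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params, study.best_value)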
Example #3
import pytest

from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_big(fit_intercept):
    X, y = make_classification(chunks=50)
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    if fit_intercept:
        assert lr.intercept_ is not None
Example #4
import pytest

from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression


# 'solver' is supplied by pytest parametrization in the source test suite.
def test_fit_solver(solver):
    import dask_glm
    from distutils.version import LooseVersion

    if LooseVersion(dask_glm.__version__) <= "0.2.0":
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
Example #5
import pytest

from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression


# 'solver' is supplied by pytest parametrization in the source test suite.
def test_fit_solver(solver):
    import dask_glm
    import packaging.version

    if packaging.version.parse(
            dask_glm.__version__) <= packaging.version.parse("0.2.0"):
        pytest.skip("FutureWarning for dask config.")

    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
Example #6
import dask.dataframe as dd

# make_classification assumed to come from dask_ml.datasets (dask_glm.datasets
# provides one as well, with a slightly different signature).
from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression


def simple_example():
    X, y = make_classification(n_samples=10000, n_features=2, chunks=50)

    X = dd.from_dask_array(X, columns=["a", "b"])
    y = dd.from_array(y)

    lr = LogisticRegression()
    lr.fit(X.values, y.values)

    print('Predictions =', lr.predict(X.values).compute())
    print('Probabilities =', lr.predict_proba(X.values).compute())
    print('Scores =', lr.score(X.values, y.values).compute())
Example #7
import pickle

import numpy as np

from dask_ml.linear_model import LogisticRegression


def train(X_train, y_train, out_model):
    lr = LogisticRegression(penalty='l2',
                            solver='lbfgs',
                            n_jobs=64,
                            max_iter=10)
    # Passing the dask dataframe directly raises "This estimator does not
    # support dask dataframes", so we convert to dask arrays via .values.
    lr.fit(X_train.values, y_train.values)

    # Saving model for later prediction
    pickle.dump(lr, open(out_model, "wb"))

    # Outputting some statistics
    y_train_pred = lr.predict(X_train.values)

    TN, FP, FN, TP = confusion_matrix_dask(y_train.values, y_train_pred)
    print("Read like \n[[TN, FP], \n[FN, TP]]\n", np.array([[TN, FP], [FN,
                                                                       TP]]))
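confusion_matrix_dask is a helper defined elsewhere in the source project. A plausible minimal implementation for binary 0/1 labels on dask arrays, sketched here as an assumption (only the name and return order are taken from the call above):

import dask.array as da


def confusion_matrix_dask(y_true, y_pred):
    # Build one boolean mask per confusion-matrix cell, then evaluate
    # all four dask scalars in a single pass over the data.
    tn = ((y_true == 0) & (y_pred == 0)).sum()
    fp = ((y_true == 0) & (y_pred == 1)).sum()
    fn = ((y_true == 1) & (y_pred == 0)).sum()
    tp = ((y_true == 1) & (y_pred == 1)).sum()
    return da.compute(tn, fp, fn, tp)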
Example #8
import numpy as np
import pandas as pd
import pytest

import dask.dataframe as dd
from dask_ml.linear_model import LogisticRegression


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_dataframe_warns_about_chunks(fit_intercept):
    rng = np.random.RandomState(42)
    n, d = 20, 5
    kwargs = dict(npartitions=4)
    X = dd.from_pandas(pd.DataFrame(rng.uniform(size=(n, d))), **kwargs)
    y = dd.from_pandas(pd.Series(rng.choice(2, size=n)), **kwargs)
    clf = LogisticRegression(fit_intercept=fit_intercept)
    msg = "does not support dask dataframes.*might be resolved with"
    with pytest.raises(TypeError, match=msg):
        clf.fit(X, y)
    clf.fit(X.values, y.values)
    clf.fit(X.to_dask_array(), y.to_dask_array())
    clf.fit(X.to_dask_array(lengths=True), y.to_dask_array(lengths=True))
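The dataframe itself is rejected while its dask-array views are accepted; to_dask_array(lengths=True) additionally computes the partition lengths up front, so the resulting array has known chunk sizes. A minimal sketch of the difference in chunk metadata, assuming only pandas and dask:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": range(8)}), npartitions=2)
print(ddf.to_dask_array().chunks)              # ((nan, nan), (1,)) - unknown
print(ddf.to_dask_array(lengths=True).chunks)  # ((4, 4), (1,))     - known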
Example #9
from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression


# 'fit_intercept' and 'solver' come from pytest parametrization in the
# source test suite; this snippet exercises only 'fit_intercept'.
def test_fit(fit_intercept, solver):
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(fit_intercept=fit_intercept)
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
Example #10
from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression


def test_logistic_predict_proba_shape():
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression()
    lr.fit(X, y)
    prob = lr.predict_proba(X)
    assert prob.shape == (100, 2)
Example #11

import time

from dask_ml.linear_model import LogisticRegression
from dask_glm.datasets import make_classification

X, y = make_classification(n_samples=1000)

t = time.time()
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)
lr.predict_proba(X)
# lr.score(X, y)
print('\nTime dask_ml: ' + str(time.time() - t))

# Parallelize Scikit-Learn Directly
from dask.distributed import Client
from joblib import parallel_backend  # sklearn.externals.joblib was removed from modern scikit-learn

client = Client('localhost:8786')  # Connect to a Dask Cluster
print(client)
with parallel_backend('dask', scatter=[X, y]):
    # Your normal scikit-learn code here
    t = time.time()
    lr = LogisticRegression()
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    # lr.score(X, y)
    print('\nTime dask_ml distributed: ' + str(time.time() - t))
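Note that joblib's 'dask' backend only parallelizes estimators that use joblib internally (i.e. scikit-learn estimators with n_jobs); the dask-ml LogisticRegression above already operates on dask collections itself. A minimal sketch of the intended pattern, assuming scikit-learn and a local cluster (RandomForestClassifier is chosen purely for illustration):

from dask.distributed import Client
from joblib import parallel_backend
from sklearn.datasets import make_classification as sk_make_classification
from sklearn.ensemble import RandomForestClassifier

client = Client()  # local in-process cluster for the sketch
Xs, ys = sk_make_classification(n_samples=1000, n_features=20)
with parallel_backend('dask'):
    # n_jobs=-1 lets joblib fan the 200 tree fits out to the dask workers
    RandomForestClassifier(n_estimators=200, n_jobs=-1).fit(Xs, ys)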
Example #12
from dask_ml.datasets import make_classification
from dask_ml.linear_model import LogisticRegression

# 'solver' is supplied by pytest parametrization in the source test suite.
def test_fit_solver(solver):
    X, y = make_classification(n_samples=100, n_features=5, chunks=50)
    lr = LogisticRegression(solver=solver)
    lr.fit(X, y)
Example #13

# Train-test split
from dask_ml.model_selection import train_test_split

# to_dask_array() is a helper defined earlier in the source script.
X_train, X_test, y_train, y_test = train_test_split(to_dask_array(X),
                                                    to_dask_array(y),
                                                    random_state=99)

###################################################################################

# Fitting the Logistic Regression Classifier
from dask.diagnostics import ProgressBar
from dask_ml.linear_model import LogisticRegression

lr = LogisticRegression()

with ProgressBar():
    lr.fit(X_train, y_train)

print('Logistic Regression Score : ', lr.score(X_test, y_test).compute())
##### OUTPUT --------> Logistic Regression Score :  0.70025

#####################################################################################

# Fitting the Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental

nb = BernoulliNB()

parallel_nb = Incremental(nb)

with ProgressBar():