Ejemplo n.º 1
0
def test_multiclass():
    """Wrapped 3-class LogisticRegression matches the bare estimator lazily."""
    X, y = sklearn.datasets.make_classification(n_classes=3, n_informative=4)
    X = da.from_array(X, chunks=50)
    y = da.from_array(y, chunks=50)

    # Older scikit-learn releases do not accept multi_class="auto".
    kwargs = {"multi_class": "auto"} if SK_GE_020 else {}
    clf = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs", **kwargs)
    )
    clf.fit(X, y)

    # Both prediction methods must stay lazy and agree with the estimator.
    for name in ("predict", "predict_proba"):
        got = getattr(clf, name)(X)
        want = getattr(clf.estimator, name)(X)
        assert isinstance(got, da.Array)
        assert_eq_ar(got, want)
Ejemplo n.º 2
0
def test_multiclass():
    """Fitting on computed (in-memory) data still yields lazy dask predictions."""
    X, y = sklearn.datasets.make_classification(n_classes=3, n_informative=4)
    X = da.from_array(X, chunks=50)
    y = da.from_array(y, chunks=50)

    estimator = LogisticRegression(
        random_state=0, n_jobs=1, solver="lbfgs", multi_class="auto"
    )
    clf = ParallelPostFit(estimator)

    # Fit on concrete NumPy arrays; predict on the dask collections.
    clf.fit(*dask.compute(X, y))

    for name in ("predict", "predict_proba"):
        got = getattr(clf, name)(X)
        want = getattr(clf.estimator, name)(X)
        assert isinstance(got, da.Array)
        assert_eq_ar(got, want)

    # predict_log_proba: only value equality is checked here.
    assert_eq_ar(clf.predict_log_proba(X), clf.estimator.predict_log_proba(X))
Ejemplo n.º 3
0
def test_predict(kind):
    """wrap and base agree on predict/predict_proba/predict_log_proba for every input kind."""
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    # Identical hyper-parameters for the bare and wrapped estimators.
    params = dict(random_state=0, n_jobs=1, solver="lbfgs")
    base = LogisticRegression(**params)
    wrap = ParallelPostFit(LogisticRegression(**params))

    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    for name in ("predict", "predict_proba", "predict_log_proba"):
        assert_eq_ar(getattr(wrap, name)(X), getattr(base, name)(X))
Ejemplo n.º 4
0
def test_laziness():
    """score(..., compute=False) returns an unevaluated dask collection."""
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    lazy_score = clf.score(X, y, compute=False)
    assert dask.is_dask_collection(lazy_score)
    # Evaluating the lazy score gives a sensible value.
    assert 0 < lazy_score.compute() < 1
Ejemplo n.º 5
0
def test_it_works():
    """Predictions from a fitted ParallelPostFit come back as dask arrays."""
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, chunks=100)
    clf.fit(X, y)

    for method in (clf.predict, clf.predict_proba):
        assert isinstance(method(X), da.Array)
Ejemplo n.º 6
0
def test_no_method_raises():
    """Calling predict_proba on a regressor raises a descriptive AttributeError."""
    clf = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    clf.fit(X, y)

    with pytest.raises(AttributeError) as excinfo:
        clf.predict_proba(X)

    # (.|\n)* lets the pattern span the multi-line error message.
    assert excinfo.match("The wrapped estimator (.|\n)* 'predict_proba' method.")
Ejemplo n.º 7
0
def test_auto_rechunk():
    """Feature-axis chunking is handled transparently by the wrapper."""
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, n_features=20, chunks=100)
    # Split the feature axis into several chunks as well.
    X = X.rechunk({0: 100, 1: 10})
    clf.fit(X, y)

    predictions = clf.predict(X).compute()
    assert predictions.shape == (1000,)

    probabilities = clf.predict_proba(X).compute()
    assert probabilities.shape == (1000, 2)

    # Lazy and eager scoring must agree exactly.
    assert clf.score(X, y) == clf.score(X.compute(), y.compute())
Ejemplo n.º 8
0
def test_it_works():
    """Fit on NumPy data, then predict and score on the dask collections."""
    clf = ParallelPostFit(GradientBoostingClassifier())

    X, y = make_classification(n_samples=1000, chunks=100)
    X_local, y_local = dask.compute(X, y)
    clf.fit(X_local, y_local)

    for method in (clf.predict, clf.predict_proba):
        assert isinstance(method(X), da.Array)

    # Wrapper score on dask input equals the estimator's score on NumPy input.
    assert clf.score(X, y) == clf.estimator.score(X_local, y_local)
Ejemplo n.º 9
0
def test_multiclass():
    """Wrapped multiclass LogisticRegression mirrors the bare estimator."""
    X, y = make_classification(chunks=50, n_classes=3, n_informative=4)
    clf = ParallelPostFit(LogisticRegression(random_state=0))
    clf.fit(X, y)

    for name in ("predict", "predict_proba"):
        got = getattr(clf, name)(X)
        want = getattr(clf.estimator, name)(X)
        assert isinstance(got, da.Array)
        assert_eq_ar(got, want)
Ejemplo n.º 10
0
    def train(self,
              X_train: np.ndarray,
              y_train: np.ndarray,
              X_test: np.ndarray,
              y_test: np.ndarray,
              verbose: bool = True,
              optimize: bool = False):
        """Fit the wrapped classifier, evaluate on the test split, optionally optimise.

        The fitted ParallelPostFit wrapper replaces ``self.classifier``.
        When ``optimize`` is set, the optimiser's classifier replaces it again.
        """
        prepared_train = self._preprocess_dataset(X_train)

        wrapper = ParallelPostFit(self.classifier, scoring='accuracy')
        self.classifier = wrapper.fit(prepared_train, y_train)

        prepared_test = self._preprocess_dataset(X_test)
        probabilities = self.classifier.predict_proba(prepared_test)

        # Map per-sample probability vectors to label values.
        predicted_labels = [
            self._predict_proba_to_label(proba).value for proba in probabilities
        ]

        if verbose:
            self.evaluate(predicted_labels,
                          y_test,
                          classes=self.classifier.classes_)

        if optimize:
            optimised = ImageModelOptimiser(self).optimize(X_train, y_train)
            self.classifier = optimised.classifier
Ejemplo n.º 11
0
def test_transform(kind):
    """PCA transform through ParallelPostFit matches plain scikit-learn."""
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    # NOTE(review): the original named base's output `result` and wrap's
    # `expected` (opposite of the sibling tests); assert_eq_ar is symmetric,
    # so the check itself is unaffected.
    assert_eq_ar(base.transform(*dask.compute(X)), wrap.transform(X))
Ejemplo n.º 12
0
def test_predict(kind):
    """Dask wrapper and plain estimator give identical predictions."""
    X, y = make_classification(chunks=100)

    if kind == 'numpy':
        X, y = dask.compute(X, y)
    elif kind == 'dask.dataframe':
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = LogisticRegression(random_state=0)
    wrap = ParallelPostFit(LogisticRegression(random_state=0))

    # Unlike the lbfgs variant above, both estimators fit the (possibly
    # dask-backed) collections directly.
    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    for name in ('predict', 'predict_proba'):
        assert_eq_ar(getattr(wrap, name)(X), getattr(base, name)(X))
Ejemplo n.º 13
0
def test_warning_on_dask_array_without_array_function():
    """Predicting on arrays whose meta lacks __array_function__ warns about *_meta."""
    X, y = make_classification(n_samples=10, n_features=2, chunks=10)
    clf = ParallelPostFit(GradientBoostingClassifier())
    clf = clf.fit(X, y)

    class FakeArray:
        """Duck-typed array exposing just enough attributes for dask."""

        def __init__(self, value):
            self.value = value

        @property
        def ndim(self):
            return self.value.ndim

        @property
        def len(self):
            return self.value.len

        @property
        def dtype(self):
            return self.value.dtype

        @property
        def shape(self):
            return self.value.shape

    wrapped = FakeArray(np.zeros(shape=(2, 2)))
    fake_dask_ar = da.from_array(wrapped)
    # Force a meta object that does not implement __array_function__.
    fake_dask_ar._meta = FakeArray(np.zeros(shape=(0, 0)))

    with pytest.warns(
            UserWarning,
            match="provide explicit `predict_meta` to the dask_ml.wrapper"):
        clf.predict(fake_dask_ar)

    with pytest.warns(
            UserWarning,
            match="provide explicit `predict_proba_meta` to the dask_ml.wrapper",
    ):
        clf.predict_proba(fake_dask_ar)
Ejemplo n.º 14
0
def train_model(x_train, y_train):
    """Fit a GaussianNB wrapped for parallel post-fit scoring and return it."""
    model = ParallelPostFit(estimator=GaussianNB(), scoring='accuracy')
    model.fit(x_train, y_train)
    return model
Ejemplo n.º 15
0
# Scale up: connect to your own cluster with more resources
# see http://dask.pydata.org/en/latest/setup.html
client = Client(processes=False, threads_per_worker=4,
                n_workers=1, memory_limit='2GB')

print(client)

# Column dtypes for the training CSV (read everything with a fixed schema).
dtype = {
    'total': np.float64,
    'temperature': np.int32,
    'humidity': np.float64,
    'solar': np.float64,
    'car_connected': np.int32,
    'car_energy': np.int32,
    'battery_energy': np.int32,
    'current_temperature': np.int32,
    'b': np.int32,
    'c': np.int32,
    'air': np.int32,
    'cost': np.int32,
}
x = pd.read_csv('train_data.csv', dtype=dtype)

# 'cost' is the regression target; pop() also removes it from the features.
y = x.pop('cost').values

mlp = ParallelPostFit(
    neural_network.MLPRegressor(hidden_layer_sizes=(16,), solver='adam'),
    scoring="r2",
)

print('Training')
mlp.fit(x, y)
print('Finished')

While only predict is demonstrated here, wrappers.ParallelPostFit is equally
useful for predict_proba and transform.
"""
from timeit import default_timer as tic

import pandas as pd
import seaborn as sns
import sklearn.datasets
from sklearn.svm import SVC

import dask_ml.datasets
from dask_ml.wrappers import ParallelPostFit

# Fit once on a small in-memory dataset; the fitted model is reused below
# to predict on progressively larger dask arrays.
X, y = sklearn.datasets.make_classification(n_samples=1000)
clf = ParallelPostFit(SVC(gamma='scale'))
clf.fit(X, y)

# Doubling sample sizes: 100k, 200k, 400k, 800k.
Ns = [100_000 * 2 ** i for i in range(4)]
timings = []

# Time serial vs. parallel prediction for each dataset size.
# NOTE(review): this loop is truncated by the extraction — the parallel
# branch after the second tic() is missing from this view.
for n in Ns:
    # Fresh dask dataset of n samples, split into 20 chunks.
    X, y = dask_ml.datasets.make_classification(n_samples=n,
                                                random_state=n,
                                                chunks=n // 20)
    t1 = tic()
    # Serial scikit-learn version
    clf.estimator.predict(X)
    timings.append(('Scikit-Learn', n, tic() - t1))

    t1 = tic()
    # Parallelized scikit-learn version
Ejemplo n.º 17
0
def test_sklearn():
    """End-to-end smoke test mixing scikit-learn with dask.

    Exercises: a text-classification grid search (serial and via the dask
    joblib backend), a grid search over a ParallelPostFit-wrapped SVC, and
    out-of-core prediction/scoring on large concatenated dask arrays.
    """
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    # NOTE(review): sklearn.externals.joblib was deprecated in 0.21 and
    # removed in 0.23; plain `import joblib` is the modern spelling —
    # confirm the pinned scikit-learn version before changing.
    from sklearn.externals import joblib
    from sklearn.datasets import make_classification, load_digits, fetch_20newsgroups

    from dask_ml.wrappers import ParallelPostFit

    # Two-category subset keeps the newsgroups download and the fits small.
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    # Bag-of-words -> tf-idf -> linear SGD classifier.
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])

    # Deliberately small grid; the commented entries show how it could grow.
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        # 'clf__alpha': (0.00001, 0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    # NOTE(review): the `iid` parameter was removed in scikit-learn 0.24;
    # this call assumes an older release.
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               cv=3,
                               refit=False,
                               iid=False)
    grid_search.fit(data.data, data.target)

    # Same search, but joblib tasks are scheduled on the dask cluster.
    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))

    param_grid = {
        # use estimator__param instead of param
        'estimator__C': [0.01, 1.0, 10],
    }

    grid_search = GridSearchCV(svc, param_grid, iid=False, cv=3)
    grid_search.fit(X, y)

    # Tile the digits data 10x into a dask array and predict lazily.
    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    #
    X_train, y_train = make_classification(n_features=2,
                                           n_redundant=0,
                                           n_informative=2,
                                           random_state=1,
                                           n_clusters_per_class=1,
                                           n_samples=1000)

    # Replicate the small training set N times to simulate a large dataset.
    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])
    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)

    # est.partial_fit(X_train_1, y_train_1)

    # from tpot import TPOTClassifier
    pass