Example no. 1
def test_scoring_string(scheduler, xy_classification, scoring):
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        assert callable(check_scoring(clf, scoring=scoring))
        clf.fit(X, y, classes=np.unique(y))
        clf.score(X, y)
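Taken out of the pytest fixtures, the same pattern works standalone. A minimal sketch, assuming only dask, dask-ml and scikit-learn are installed (the synthetic data and chunk sizes below are illustrative):

import dask.array as da
from dask_ml.wrappers import Incremental
from sklearn.linear_model import SGDClassifier

# Two-class toy data as chunked dask arrays.
X = da.random.random((100, 5), chunks=(20, 5))
y = (da.random.random(100, chunks=20) > 0.5).astype(int)

clf = Incremental(SGDClassifier(tol=1e-3), scoring="accuracy")
clf.fit(X, y, classes=[0, 1])  # one partial_fit pass per chunk
print(clf.score(X, y))         # dispatches to the dask-aware "accuracy" scorer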
Example no. 2
def test_scoring_string(scheduler, xy_classification, scoring):
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(), scoring=scoring)
        if scoring:
            assert (check_scoring(clf, scoring=scoring) ==
                    dask_ml.metrics.scorer.SCORERS[scoring])
        clf.fit(X, y, classes=np.unique(y))
        clf.score(X, y)
        clf.estimator.score(X, y)
Example no. 3
def test_scoring_string(scheduler, xy_classification, scoring):
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        clf = Incremental(SGDClassifier(tol=1e-3), scoring=scoring)
        if scoring:
            assert (dask_ml.metrics.scorer.SCORERS[scoring] == check_scoring(
                clf, scoring=scoring))
        assert callable(check_scoring(clf, scoring=scoring))
        clf.fit(X, y, classes=np.unique(y))
        clf.score(X, y)
Example no. 4
def test_replace_scoring(estimator, fit_kwargs, scoring, xy_classification, mocker):
    X, y = xy_classification
    inc = Incremental(estimator(max_iter=1000, random_state=0, tol=1e-3))
    inc.fit(X, y, **fit_kwargs)

    patch = mocker.patch.object(dask_ml.wrappers, "get_scorer")
    with patch:
        inc.score(X, y)

    assert patch.call_count == 1
    patch.assert_called_with(scoring, compute=True)
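Example no. 4 pins down where the scorer comes from: Incremental.score resolves the scoring string through get_scorer with compute=True. Doing that lookup by hand is a one-liner; a sketch, assuming get_scorer is exposed from dask_ml.metrics as in current dask-ml:

from dask_ml.metrics import get_scorer

# Resolves the string to a dask-aware scorer: scorer(estimator, X, y) -> float,
# where X and y may be dask collections; compute=True returns a concrete number.
scorer = get_scorer("accuracy", compute=True)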
Example no. 5
def test_score_ndarrays():
    X = np.ones((10, 5))
    y = np.ones(10)

    sgd = SGDClassifier(tol=1e-3)
    inc = Incremental(sgd, scoring="accuracy")

    inc.partial_fit(X, y, classes=[0, 1])
    inc.fit(X, y, classes=[0, 1])

    assert inc.score(X, y) == 1

    dX = da.from_array(X, chunks=(2, 5))
    dy = da.from_array(y, chunks=2)
    assert inc.score(dX, dy) == 1
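The reason one fitted wrapper scores both NumPy and dask inputs is that dask-ml ships dask-aware metrics behind its scorers. A sketch of the metric the "accuracy" scorer resolves to, assuming a current dask-ml:

import numpy as np
import dask.array as da
from dask_ml.metrics import accuracy_score

y_true = da.from_array(np.array([0, 1, 1, 0]), chunks=2)
y_pred = da.from_array(np.array([0, 1, 0, 0]), chunks=2)
print(accuracy_score(y_true, y_pred))  # 0.75, reduced across chunks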
Example no. 6
def test_incremental_basic(scheduler, xy_classification):
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        est1 = SGDClassifier(random_state=0, tol=1e-3)
        est2 = clone(est1)

        clf = Incremental(est1)
        result = clf.fit(X, y, classes=[0, 1])
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        assert result is clf

        assert isinstance(result.estimator.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(result.estimator.coef_,
                                             est2.coef_)

        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        assert_eq(result, expected)

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        # assert isinstance(result, da.Array)
        assert_eq(result, expected)

        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        clf.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(clf.estimator, est2, exclude=['loss_function_'])
Example no. 7
def run3():
    from dask.distributed import Client
    import dask.dataframe as dd
    import numpy as np
    client = Client()
    df = dd.read_csv("isHealthTrain.csv",
                     assume_missing=True,
                     sample=640000000,
                     blocksize="10MB")
    df = df.fillna(0)
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y_train = df['acquired']
    X_train = df.drop('acquired', axis=1)
    df2 = dd.read_csv("isHealthTest.csv",
                      assume_missing=True,
                      sample=640000000,
                      blocksize="10MB")
    df2 = df2.fillna(0)
    for column in df2.columns:
        if '.' in column:
            df2 = df2.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y_test = df2['acquired']
    X_test = df2.drop('acquired', axis=1)
    # X_train,X_train2,y_train,y_train2 = train_test_split(X_train,y_train)
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", x_test_tickers, delimiter=",", fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    print(est)
    inc = Incremental(est, scoring='f1')
    print("WORKING")
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
        print("FITTED")
        np.savetxt("predictions.csv", inc.predict_proba(X_test))
        print('Score:', inc.score(X_test, y_test))
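Both run() and run3() import ParallelPostFit without using it. It is the complementary wrapper: train once in memory, then fan predict/predict_proba out over a dask array chunk by chunk. A minimal sketch (data and sizes are illustrative):

import numpy as np
import dask.array as da
from sklearn.linear_model import LogisticRegression
from dask_ml.wrappers import ParallelPostFit

rng = np.random.RandomState(0)
X_small = rng.normal(size=(100, 5))
y_small = (X_small[:, 0] > 0).astype(int)

clf = ParallelPostFit(LogisticRegression())
clf.fit(X_small, y_small)                 # ordinary in-memory fit

X_big = da.random.random((10000, 5), chunks=(1000, 5))
proba = clf.predict_proba(X_big)          # lazy, evaluated chunk-wise
print(type(proba), proba.shape)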
Example no. 8
def test_incremental_basic(scheduler, dataframes):
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2
    if dataframes:
        X = dd.from_array(X)
        y = dd.from_array(y)

    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        assert result is clf

        # est2 is the plain scikit-learn estimator, fit chunk by chunk as a baseline
        if dataframes:
            X = X.to_dask_array(lengths=True)
            y = y.to_dask_array(lengths=True)

        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_].compute(),
                             y[slice_[0]].compute(),
                             classes=[0, 1])

        assert isinstance(result.estimator_.coef_, np.ndarray)
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9

        assert set(dir(clf.estimator_)) == set(dir(est2))

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        if dataframes:
            # Compute is needed because chunk sizes of this array are unknown
            result = result.compute()
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.3

        # score
        result = clf.score(X, y)
        expected = est2.score(*dask.compute(X, y))
        assert abs(result - expected) < 0.1

        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
Example no. 9
def test_score(xy_classification):
    distributed = pytest.importorskip("distributed")
    client = distributed.Client(n_workers=2)

    X, y = xy_classification
    inc = Incremental(SGDClassifier(max_iter=1000, random_state=0), scoring="accuracy")

    with client:
        inc.fit(X, y, classes=[0, 1])
        result = inc.score(X, y)
        expected = inc.estimator_.score(X, y)

    assert result == expected
Example no. 10
def test_incremental_basic(scheduler):
    # Create observations that we know linear models can recover
    n, d = 100, 3
    rng = da.random.RandomState(42)
    X = rng.normal(size=(n, d), chunks=30)
    coef_star = rng.uniform(size=d, chunks=d)
    y = da.sign(X.dot(coef_star))
    y = (y + 1) / 2

    with scheduler() as (s, [_, _]):
        est1 = SGDClassifier(random_state=0, tol=1e-3, average=True)
        est2 = clone(est1)

        clf = Incremental(est1, random_state=0)
        result = clf.fit(X, y, classes=[0, 1])
        for slice_ in da.core.slices_from_chunks(X.chunks):
            est2.partial_fit(X[slice_], y[slice_[0]], classes=[0, 1])

        assert result is clf

        assert isinstance(result.estimator_.coef_, np.ndarray)
        rel_error = np.linalg.norm(clf.coef_ - est2.coef_)
        rel_error /= np.linalg.norm(clf.coef_)
        assert rel_error < 0.9

        assert set(dir(clf.estimator_)) == set(dir(est2))

        #  Predict
        result = clf.predict(X)
        expected = est2.predict(X)
        assert isinstance(result, da.Array)
        rel_error = np.linalg.norm(result - expected)
        rel_error /= np.linalg.norm(expected)
        assert rel_error < 0.2

        # score
        result = clf.score(X, y)
        expected = est2.score(X, y)
        assert abs(result - expected) < 0.1

        clf = Incremental(SGDClassifier(random_state=0, tol=1e-3,
                                        average=True))
        clf.partial_fit(X, y, classes=[0, 1])
        assert set(dir(clf.estimator_)) == set(dir(est2))
Example no. 11
def run():
    from dask.distributed import Client
    import dask.dataframe as dd
    import numpy as np
    client = Client()
    df = dd.read_csv("isHealth.csv",
                     assume_missing=True,
                     sample=640000000,
                     blocksize="10MB")
    df = df.fillna(0)
    for column in df.columns:
        if '.' in column:
            df = df.drop(column, axis=1)
    # for column in droppedColumns:
    #     df = df.drop(column, axis=1)
    y = df['acquired']
    X = df.drop('acquired', axis=1)
    from dask_ml.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
    # X_train,X_train2,y_train,y_train2 = train_test_split(X_train,y_train)
    x_test_tickers = X_test['ticker'].values.compute()
    x_test_dates = X_test['date'].values.compute()
    print(x_test_tickers[0])
    np.savetxt("x_test_tickers.csv", [x_test_tickers, x_test_dates],
               delimiter=",",
               fmt='%s')
    np.savetxt("x_test_dates.csv", x_test_dates, delimiter=",", fmt='%s')
    print("GOOD")
    for column in X_train.columns:
        if 'ticker' in column or 'date' in column:
            X_train = X_train.drop(column, axis=1)
            X_test = X_test.drop(column, axis=1)
    X_train = X_train.to_dask_array()
    X_test = X_test.values.compute()
    y_train = y_train.to_dask_array()
    y_test = y_test.values.compute()
    np.savetxt("y_test.csv", y_test, delimiter=",")
    from dask_ml.wrappers import Incremental
    from sklearn.linear_model import SGDClassifier
    from sklearn.neural_network import MLPClassifier
    from dask_ml.wrappers import ParallelPostFit

    est = MLPClassifier(solver='adam', activation='relu', random_state=0)
    inc = Incremental(est, scoring='neg_log_loss')
    print("WORKING")
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=[0, 1])
        print("FITTED")
        np.savetxt("predictions.csv", inc.predict_proba(X_test))
        print('Score:', inc.score(X_test, y_test))

    # model = MLPClassifier(solver='sgd', hidden_layer_sizes=(10,2),random_state=1)
    params = {'alpha': np.logspace(-2, 1, num=1000)}
    from dask_ml.model_selection import IncrementalSearchCV
    search = IncrementalSearchCV(est,
                                 params,
                                 n_initial_parameters=100,
                                 patience=20,
                                 max_iter=100)
    search.fit(X_train, y_train, classes=[0, 1])
    print(search)
    print("SCORE")
    print("FITTED")
    np.savetxt("predictions.csv", inc.predict_proba(X_test))
    print('Score:', inc.score(X_test, y_test))
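run() ends by fitting an IncrementalSearchCV whose results are never read back (the final predictions still come from inc). For reference, a self-contained version of the adaptive search might look like this; it needs a distributed scheduler, and all names and sizes below are illustrative:

import numpy as np
import dask.array as da
from dask.distributed import Client
from sklearn.linear_model import SGDClassifier
from dask_ml.model_selection import IncrementalSearchCV

client = Client(processes=False)  # the adaptive searches require distributed

X = da.random.random((1000, 10), chunks=(100, 10))
y = (da.random.random(1000, chunks=100) > 0.5).astype(int)

params = {'alpha': np.logspace(-4, 0, num=50)}
search = IncrementalSearchCV(SGDClassifier(tol=1e-3), params,
                             n_initial_parameters=10, patience=5, max_iter=100)
search.fit(X, y, classes=[0, 1])
print(search.best_params_, search.best_score_)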
Example no. 12
features = FEATURES_ARRAY

# Restricted to the LBP feature file for now; iterate over `features` to run them all.
for feature in [OUTPUT_FOLDER + 'lbp' + FORMAT]:
    print("""
    ----------------------------------
    getting feature: {}
    """.format(feature))
    X = np.load(feature, allow_pickle=True)
    X = to_arr(X)
    np.save('lbp_arr', X)
    X = da.from_array(X, chunks=X.shape)

    X = transformer_pipe.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    classes = da.unique(y_train).compute()

    "One model for all"
    inc = Incremental(SVC(random_state=RANDOM_STATE), scoring='accuracy')
    for _ in range(10):
        inc.partial_fit(X_train, y_train, classes=classes)
        print('Score:', inc.score(X_test, y_test))

    score = inc.score(X_test, y_test)
    print(score)

    np.save('lbp_svm', score)

time_it()
##### OUTPUT --------> Logistic Regression Score :  0.70025

#####################################################################################

# Fitting the Naive Bayes Classifier
from sklearn.naive_bayes import BernoulliNB
from dask_ml.wrappers import Incremental

nb = BernoulliNB()

parallel_nb = Incremental(nb)

with ProgressBar():
    parallel_nb.fit(X_train, y_train, classes=np.unique(y_train.compute()))

print('\n\nNaive Bayes Classifier Score : ', parallel_nb.score(X_test, y_test))
##### OUTPUT --------> Naive Bayes Classifier Score :  0.65

######################################################################################

# Performing GridSearch on the Logistic Regression Classifier
from dask_ml.model_selection import GridSearchCV

parameters = {'penalty': ['l1', 'l2'], 'C': [0.5, 1, 2]}

# scikit-learn's default lbfgs solver rejects the l1 penalty;
# liblinear supports both options in the grid.
lr = LogisticRegression(solver='liblinear')

tuned_lr = GridSearchCV(lr, parameters)

with ProgressBar():
    tuned_lr.fit(X_train, y_train)
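
The listing stops right after the fit. dask-ml's GridSearchCV follows the scikit-learn estimator API, so (as a sketch, assuming the fit above succeeds) the tuned model would typically be inspected like this:

print('Best parameters : ', tuned_lr.best_params_)
print('Tuned Logistic Regression Score : ', tuned_lr.score(X_test, y_test))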