Example #1
def test_auto_rechunk():
    clf = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, n_features=20, chunks=100)
    X = X.rechunk({0: 100, 1: 10})
    clf.fit(X, y)

    assert clf.predict(X).compute().shape == (1000, )
    assert clf.predict_proba(X).compute().shape == (1000, 2)
    assert clf.score(X, y) == clf.score(X.compute(), y.compute())

    X, y = make_classification(n_samples=1000, n_features=20, chunks=100)
    X = X.rechunk({0: 100, 1: 10})
    # Simulate unknown chunk sizes along axis 0; predict should handle them.
    X._chunks = (tuple(np.nan for _ in X.chunks[0]), X.chunks[1])
    clf.predict(X)
Example #2
def test_predict(kind):
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    wrap = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs"))

    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    result = wrap.predict(X)
    expected = base.predict(X)
    assert_eq_ar(result, expected)

    result = wrap.predict_proba(X)
    expected = base.predict_proba(X)
    assert_eq_ar(result, expected)

    result = wrap.predict_log_proba(X)
    expected = base.predict_log_proba(X)
    assert_eq_ar(result, expected)
Example #3
def test_multiclass():
    X, y = sklearn.datasets.make_classification(n_classes=3, n_informative=4)
    X = da.from_array(X, chunks=50)
    y = da.from_array(y, chunks=50)

    clf = ParallelPostFit(
        LogisticRegression(random_state=0,
                           n_jobs=1,
                           solver="lbfgs",
                           multi_class="auto"))

    clf.fit(*dask.compute(X, y))
    result = clf.predict(X)
    expected = clf.estimator.predict(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_proba(X)
    expected = clf.estimator.predict_proba(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_log_proba(X)
    expected = clf.estimator.predict_log_proba(X)
    assert_eq_ar(result, expected)
Example #4
def test_multiclass():
    X, y = sklearn.datasets.make_classification(n_classes=3, n_informative=4)
    X = da.from_array(X, chunks=50)
    y = da.from_array(y, chunks=50)

    if SK_GE_020:
        kwargs = {"multi_class": "auto"}
    else:
        kwargs = {}
    clf = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs", **kwargs)
    )

    clf.fit(X, y)
    result = clf.predict(X)
    expected = clf.estimator.predict(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_proba(X)
    expected = clf.estimator.predict_proba(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)
Example #5
def test_regressor_sg_train_mg_predict(datatype, keys, data_size,
                                       fit_intercept, client):

    # Just testing for basic compatibility w/ dask-ml's ParallelPostFit.
    # Refer to test_pickle.py for more extensive testing of single-GPU
    # model serialization.

    nrows, ncols, n_info = data_size
    X_train, y_train, _ = make_dataset(datatype, nrows, ncols, n_info)

    X_train_local = X_train.compute()
    y_train_local = y_train.compute()

    local_model = cuml.linear_model.LinearRegression(
        fit_intercept=fit_intercept)
    local_model.fit(X_train_local, y_train_local)

    dist_model = ParallelPostFit(estimator=local_model)

    predictions = dist_model.predict(X_train).compute()

    assert isinstance(predictions, cupy.ndarray)

    # Dataset should be fairly linear already so the predictions should
    # be very close to the training data.
    np.testing.assert_allclose(predictions.get(),
                               y_train.compute().get(),
                               atol=1e-3,
                               rtol=1e-3)
Example #6
def test_it_works():
    clf = ParallelPostFit(GradientBoostingClassifier())

    X, y = make_classification(n_samples=1000, chunks=100)
    clf.fit(X, y)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)
Example #7
def test_predict_meta_override():
    X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    y = np.array([1, 2, 3, 4])

    base = CategoricalNB()
    base.fit(pd.DataFrame(X), y)

    dd_X = dd.from_pandas(X, npartitions=2)
    dd_X._meta = pd.DataFrame({"c_0": [5]})

    # Failure when not providing predict_meta
    # because of the value-dependent model
    wrap = ParallelPostFit(base)
    with pytest.raises(ValueError):
        wrap.predict(dd_X)

    # Success when providing a meta override
    wrap = ParallelPostFit(base, predict_meta=np.array([1]))
    result = wrap.predict(dd_X)
    expected = base.predict(X)
    assert_eq_ar(result, expected)
Example #8
def test_warning_on_dask_array_without_array_function():
    X, y = make_classification(n_samples=10, n_features=2, chunks=10)
    clf = ParallelPostFit(GradientBoostingClassifier())
    clf = clf.fit(X, y)

    class FakeArray:
        def __init__(self, value):
            self.value = value

        @property
        def ndim(self):
            return self.value.ndim

        @property
        def len(self):
            return self.value.len

        @property
        def dtype(self):
            return self.value.dtype

        @property
        def shape(self):
            return self.value.shape

    ar = FakeArray(np.zeros(shape=(2, 2)))
    fake_dask_ar = da.from_array(ar)
    fake_dask_ar._meta = FakeArray(np.zeros(shape=(0, 0)))

    with pytest.warns(
            UserWarning,
            match="provide explicit `predict_meta` to the dask_ml.wrapper"):
        clf.predict(fake_dask_ar)

    with pytest.warns(
            UserWarning,
            match=
            "provide explicit `predict_proba_meta` to the dask_ml.wrapper",
    ):
        clf.predict_proba(fake_dask_ar)
Example #9
def test_it_works():
    clf = ParallelPostFit(GradientBoostingClassifier())

    X, y = make_classification(n_samples=1000, chunks=100)
    X_, y_ = dask.compute(X, y)
    clf.fit(X_, y_)

    assert isinstance(clf.predict(X), da.Array)
    assert isinstance(clf.predict_proba(X), da.Array)

    result = clf.score(X, y)
    expected = clf.estimator.score(X_, y_)
    assert result == expected
Example #10
def test_predict_correct_output_dtype():
    X, y = make_classification(chunks=100)
    X_ddf = dd.from_dask_array(X)

    base = LinearRegression(n_jobs=1)
    base.fit(X, y)

    wrap = ParallelPostFit(base)

    base_output = base.predict(X_ddf.compute())
    wrap_output = wrap.predict(X_ddf)

    assert wrap_output.dtype == base_output.dtype
Example #11
def test_sparse_inputs():
    X = csr_matrix((3, 4))
    y = np.asarray([0, 0, 1], dtype=np.int32)

    base = SGDClassifier(tol=1e-3)
    base = base.fit(X, y)

    wrap = ParallelPostFit(base)
    X_da = da.from_array(X, chunks=(1, 4))

    result = wrap.predict(X_da).compute()
    expected = base.predict(X)

    assert_eq_ar(result, expected)
Example #12
def test_multiclass():
    X, y = make_classification(chunks=50, n_classes=3, n_informative=4)
    clf = ParallelPostFit(LogisticRegression(random_state=0))

    clf.fit(X, y)
    result = clf.predict(X)
    expected = clf.estimator.predict(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)

    result = clf.predict_proba(X)
    expected = clf.estimator.predict_proba(X)

    assert isinstance(result, da.Array)
    assert_eq_ar(result, expected)
Example #13
def test_predict(kind):
    X, y = make_classification(chunks=100)

    if kind == 'numpy':
        X, y = dask.compute(X, y)
    elif kind == 'dask.dataframe':
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = LogisticRegression(random_state=0)
    wrap = ParallelPostFit(LogisticRegression(random_state=0))

    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    result = wrap.predict(X)
    expected = base.predict(X)
    assert_eq_ar(result, expected)

    result = wrap.predict_proba(X)
    expected = base.predict_proba(X)
    assert_eq_ar(result, expected)
Example #14
def predict_xr(
    model,
    input_xr,
    chunk_size=None,
    persist=True,
    proba=False,
    clean=False,
    return_input=False,
):
    """
    Using dask-ml ParallelPostfit(), runs  the parallel
    predict and predict_proba methods of sklearn
    estimators. Useful for running predictions
    on a larger-than-RAM datasets.

    Last modified: September 2020

    Parameters
    ----------
    model : scikit-learn model or compatible object
        Must have a .predict() method that takes numpy arrays.
    input_xr : xarray.DataArray or xarray.Dataset.
        Must have dimensions 'x' and 'y'
    chunk_size : int
        The dask chunk size to use on the flattened array. If this
        is left as None, then the chunks size is inferred from the
        .chunks() method on the `input_xr`
    persist : bool
        If True, and proba=True, then 'input_xr' data will be
        loaded into distributed memory. This will ensure data
        is not loaded twice for the prediction of probabilities,
        but this will only work if the data is not larger than RAM.
    proba : bool
        If True, predict probabilities. This only applies if the
        model has a .predict_proba() method
    clean : bool
        If True, remove Infs and NaNs from input and output arrays
    return_input : bool
        If True, then the data variables in the 'input_xr' dataset will
        be appended to the output xarray dataset.

    Returns
    ----------
    output_xr : xarray.Dataset
        An xarray.Dataset containing the prediction output from model
        with input_xr as input, if proba=True then dataset will also contain
        the prediciton probabilities. Has the same spatiotemporal structure
        as input_xr.

    """
    if chunk_size is None:
        chunk_size = int(input_xr.chunks["x"][0]) * int(
            input_xr.chunks["y"][0])

    # convert model to dask predict
    model = ParallelPostFit(model)

    # with joblib.parallel_backend("dask"):
    x, y, crs = input_xr.x, input_xr.y, input_xr.geobox.crs

    input_data = []

    for var_name in input_xr.data_vars:
        input_data.append(input_xr[var_name])

    input_data_flattened = []
    # TODO: transfer to dask dataframe
    for arr in input_data:
        data = arr.data.flatten().rechunk(chunk_size)
        input_data_flattened.append(data)

    # reshape for prediction
    input_data_flattened = da.array(input_data_flattened).transpose()

    if clean:
        input_data_flattened = da.where(da.isfinite(input_data_flattened),
                                        input_data_flattened, 0)

    if proba and persist:
        # persisting data so we don't require loading all the data twice
        input_data_flattened = input_data_flattened.persist()

    # apply the classification
    print("   predicting...")
    out_class = model.predict(input_data_flattened)

    # Mask out NaN or Inf values in results
    if clean:
        out_class = da.where(da.isfinite(out_class), out_class, 0)

    # Reshape when writing out
    out_class = out_class.reshape(len(y), len(x))

    # stack back into xarray
    output_xr = xr.DataArray(out_class,
                             coords={
                                 "x": x,
                                 "y": y
                             },
                             dims=["y", "x"])

    output_xr = output_xr.to_dataset(name="Predictions")

    if proba:
        print("   probabilities...")
        out_proba = model.predict_proba(input_data_flattened)

        # convert to %
        out_proba = da.max(out_proba, axis=1) * 100.0

        if clean:
            out_proba = da.where(da.isfinite(out_proba), out_proba, 0)

        out_proba = out_proba.reshape(len(y), len(x))

        out_proba = xr.DataArray(out_proba,
                                 coords={
                                     "x": x,
                                     "y": y
                                 },
                                 dims=["y", "x"])
        output_xr["Probabilities"] = out_proba

    if return_input:
        print("   input features...")
        # unflatten the input_data_flattened array and append
        # to the output_xr containing the predictions
        arr = input_xr.to_array()
        stacked = arr.stack(z=["y", "x"])
        # handle multivariable output
        output_px_shape = ()
        if len(input_data_flattened.shape[1:]):
            output_px_shape = input_data_flattened.shape[1:]

        output_features = input_data_flattened.reshape(
            (len(stacked.z), *output_px_shape))

        # set the stacked coordinate to match the input
        output_features = xr.DataArray(
            output_features,
            coords={
                "z": stacked["z"]
            },
            dims=[
                "z",
                *[
                    "output_dim_" + str(idx)
                    for idx in range(len(output_px_shape))
                ],
            ],
        ).unstack()

        # convert to dataset and rename arrays
        output_features = output_features.to_dataset(dim="output_dim_0")
        data_vars = list(input_xr.data_vars)
        output_features = output_features.rename(
            {i: j
             for i, j in zip(output_features.data_vars, data_vars)}  # noqa pylint: disable=unnecessary-comprehension
        )

        # merge with predictions
        output_xr = xr.merge([output_xr, output_features], compat="override")

    return assign_crs(output_xr, str(crs))
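The predict_xr function above is only defined, never called. Below is a minimal, hypothetical usage sketch: it assumes a dask-backed xarray.Dataset of feature layers named `ds` (with 'x' and 'y' dimensions and a `.geobox`, e.g. provided by an Open Data Cube / odc-geo aware loader) and pre-existing training arrays `X_train` and `y_train`; none of these names, nor the file "features.nc", come from the original code.

# Hypothetical usage of predict_xr; "features.nc", X_train and y_train are placeholders.
import xarray as xr
from sklearn.ensemble import RandomForestClassifier

# A dask-backed Dataset of feature layers with 'x'/'y' dims and a CRS/geobox
# (for example, one loaded through an ODC/odc-geo-aware reader).
ds = xr.open_dataset("features.nc", chunks={"x": 1000, "y": 1000})

# Fit an ordinary scikit-learn classifier on a tabular sample of the same features.
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Run the chunked, parallel prediction over the whole Dataset.
predicted = predict_xr(
    clf,
    ds,
    proba=True,          # also return per-pixel class probabilities
    clean=True,          # zero out NaNs/Infs in inputs and outputs
    return_input=True,   # append the input feature layers to the output
)

predicted.Predictions      # classified map, same y/x shape as ds
predicted.Probabilities    # maximum class probability, as a percentage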
Example #15
from timeit import default_timer as tic  # simple timer used for the benchmarks below

import pandas as pd
import seaborn as sns
import sklearn.datasets
from sklearn.svm import SVC

import dask_ml.datasets
from dask_ml.wrappers import ParallelPostFit

X, y = sklearn.datasets.make_classification(n_samples=1000)
clf = ParallelPostFit(SVC(gamma='scale'))
clf.fit(X, y)

Ns = [100_000, 200_000, 400_000, 800_000]
timings = []

for n in Ns:
    X, y = dask_ml.datasets.make_classification(n_samples=n,
                                                random_state=n,
                                                chunks=n // 20)
    t1 = tic()
    # Serial scikit-learn version
    clf.estimator.predict(X)
    timings.append(('Scikit-Learn', n, tic() - t1))

    t1 = tic()
    # Parallelized scikit-learn version
    clf.predict(X).compute()
    timings.append(('dask-ml', n, tic() - t1))

df = pd.DataFrame(timings,
                  columns=['method', 'Number of Samples', 'Predict Time'])
ax = sns.factorplot(x='Number of Samples',
                    y='Predict Time',
                    hue='method',
                    data=df,
                    aspect=1.5)
Example #16
import datetime

import sklearn.datasets
from sklearn.svm import SVC

import dask_ml.datasets
from dask.distributed import Client
from dask_ml.wrappers import ParallelPostFit

# Problem sizes.
n_samples_classification = 1000
n_samples = 100000000
chunks = n_samples // 20

name = 'parallelizing_svm_800k_40k'

client = Client('10.255.23.115:8786')

X, y = sklearn.datasets.make_classification(n_samples=n_samples_classification)
clf = ParallelPostFit(SVC(gamma='scale'))
clf.fit(X, y)

X, y = dask_ml.datasets.make_classification(n_samples=n_samples,
                                            random_state=n_samples,
                                            chunks=chunks)

# Start the computation.
start = datetime.datetime.now()
results = clf.predict(X).compute(scheduler='distributed')
end = datetime.datetime.now()

print(f'Parallelizing svm is done in {end - start}')

clf.predict(X).visualize(filename=f'{name}.png')
Example #17
def test_sklearn():
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.externals import joblib
    from sklearn.datasets import make_classification, load_digits, fetch_20newsgroups

    from dask_ml.wrappers import ParallelPostFit

    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]

    print("Loading 20 newsgroups dataset for categories:")
    print(categories)

    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])

    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        # 'clf__alpha': (0.00001, 0.000001),
        # 'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               cv=3,
                               refit=False,
                               iid=False)
    grid_search.fit(data.data, data.target)

    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))

    param_grid = {
        # use estimator__param instead of param
        'estimator__C': [0.01, 1.0, 10],
    }

    grid_search = GridSearchCV(svc, param_grid, iid=False, cv=3)
    grid_search.fit(X, y)

    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    #
    X_train, y_train = make_classification(n_features=2,
                                           n_redundant=0,
                                           n_informative=2,
                                           random_state=1,
                                           n_clusters_per_class=1,
                                           n_samples=1000)

    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])
    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)

    # est.partial_fit(X_train_1, y_train_1)

    # from tpot import TPOTClassifier
    pass
Example #18
import numpy as np
import argparse
from joblib import load

parser = argparse.ArgumentParser()
parser.add_argument('--filename', type=str, dest='filename', help='path to the dataset to be scored')
args = parser.parse_args()

if __name__ == "__main__":
    from distributed import Client, LocalCluster
    from dask_ml.wrappers import ParallelPostFit
    import dask.dataframe as dd

    cluster = LocalCluster()
    client = Client(cluster)

    clf = load('wta-matches-model.joblib')
    clf = ParallelPostFit(clf)

    matches = dd.read_csv(args.filename, assume_missing=True)
    point_diff = (matches.winner_rank_points - matches.loser_rank_points).dropna()
    X_test = point_diff.compute().values[:, np.newaxis]

    y_test_pred = clf.predict(X_test)
    np.save("predictions.npy", y_test_pred)
Example #19
def function_2(file_name, rows_to_parse):
    data = pd.read_csv(os.path.join('data', 'nycflights', file_name + '.csv'),
                       nrows=int(rows_to_parse))  # rows_to_parse is user-supplied
    data = data.fillna(0)
    print(data)
    
    train, test = train_test_split(data,train_size=0.5, test_size=0.5)
    
    train_x = train.drop(['DepDelay','UniqueCarrier','Origin','Dest'], axis=1)
    train_y = train['DepDelay']
    test_x = test.drop(['DepDelay','UniqueCarrier','Origin','Dest'], axis=1)
    test_y = test['DepDelay']
    
    # Support Vector Machines
    GridSearch.support_vector_machine(train_x,train_y,test_x,test_y)
    
    
        
    import time
    start_time = time.time()
    GridSearch.sklearn_grid_search(train_x, train_y)
    print("--- %s seconds ---" % (time.time() - start_time))
    
    
    
    #____DASK____
    
    
    
    client = Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='2GB')
    print(client)
    
    
    import time
    start_time = time.time()
    GridSearch.dask_grid_search(train_x, train_y)
    print(f"--- {time.time() - start_time}seconds ---")
    
    
    #DASK DELAY
    
    
    
    
    output = []
    #for x in data:
    a = dask.delayed(GridSearch.support_vector_machine)(train_x,train_y,test_x,test_y)
    print(a)
    start_time = time.time()
    a.compute()
    print("--- %s seconds ---" % (time.time() - start_time))
    output.append(a)
    b = dask.delayed(GridSearch.sklearn_grid_search)(train_x, train_y)
    print(b)
    output.append(b)
    start_time = time.time()
    b.compute()
    print("--- %s seconds ---" % (time.time() - start_time))
    c = dask.delayed(GridSearch.dask_grid_search)(train_x, train_y)
    print(c)
    output.append(c)
    start_time = time.time()
    c.compute()
    print("--- %s seconds ---" % (time.time() - start_time))
    
    total = dask.delayed(sum)(output)
    # Visualize the task graph
    total.visualize()
    
    
    
    # Other code: normalize the training features
    clean_dataset(train_x)
    train_x = train_x.values
    train_y = train_y.values

    from sklearn.preprocessing import Normalizer
    transformer = Normalizer().fit(train_x)
    train_x = transformer.transform(train_x)
    train_x = train_x.round(decimals=2)
    
    train_x, train_y = make_classification(
        n_features=2, n_redundant=0, n_informative=2,
        random_state=1, n_clusters_per_class=1, n_samples=1000)
    
    
    # Scale up: increase N, the number of times we replicate the data.
    N = 2
    X_large = da.concatenate([da.from_array(train_x, chunks=train_x.shape) for _ in range(N)])
    y_large = da.concatenate([da.from_array(train_y, chunks=train_y.shape) for _ in range(N)])
    print(X_large)
    
    clf = ParallelPostFit(LogisticRegressionCV(cv=3), scoring="r2")
    clf.fit(train_x, train_y)  # the wrapper must be fitted before predicting
    y_pred = clf.predict(X_large)
    print(y_pred)