Ejemplo n.º 1
0
    def test_basic(self):
        """Fit ``dpp`` and ``spp`` ``PolynomialFeatures`` and compare them.

        The ``dpp`` estimator is fitted on ``X`` as-is, the ``spp`` one on
        ``X.compute()``.  The comparison uses the ``_transformer`` attribute
        of the ``dpp`` estimator rather than the estimator itself.
        """
        dpp_poly = dpp.PolynomialFeatures()
        spp_poly = spp.PolynomialFeatures()

        dpp_poly.fit(X)
        spp_poly.fit(X.compute())

        assert_estimator_equal(dpp_poly._transformer, spp_poly)
Ejemplo n.º 2
0
    def test_basic(self):
        """Fit ``dpp`` and ``spp`` ``MinMaxScaler`` and compare fitted state.

        ``n_samples_seen_`` is left out of the comparison.
        """
        dpp_scaler = dpp.MinMaxScaler()
        spp_scaler = spp.MinMaxScaler()

        dpp_scaler.fit(X)
        spp_scaler.fit(X.compute())

        assert_estimator_equal(
            dpp_scaler,
            spp_scaler,
            exclude='n_samples_seen_',
        )
Ejemplo n.º 3
0
    def test_basic(self):
        """Fit ``dpp`` and ``spp`` ``StandardScaler`` and compare fitted state.

        The ``dpp`` scaler sees ``X`` directly; the ``spp`` scaler sees
        ``X.compute()``.
        """
        dpp_scaler = dpp.StandardScaler()
        spp_scaler = spp.StandardScaler()

        dpp_scaler.fit(X)
        spp_scaler.fit(X.compute())

        assert_estimator_equal(dpp_scaler, spp_scaler)
Ejemplo n.º 4
0
 def test_basic(self, single_chunk_classification):
     """Compare ``nn.ParitalMLPClassifier.fit`` with ``nn_.MLPClassifier.partial_fit``.

     Both estimators use ``random_state=0`` and the classes ``[0, 1]``; the
     former receives them at construction, the latter in ``partial_fit``.
     """
     X, y = single_chunk_classification
     known_classes = [0, 1]

     partial_clf = nn.ParitalMLPClassifier(classes=known_classes, random_state=0)
     reference_clf = nn_.MLPClassifier(random_state=0)

     partial_clf.fit(X, y)
     reference_clf.partial_fit(X, y, classes=[0, 1])

     assert_estimator_equal(partial_clf, reference_clf)
Ejemplo n.º 5
0
 def test_basic(self, single_chunk_classification):
     """Compare ``nn.ParitalMLPRegressor.fit`` with ``nn_.MLPRegressor.partial_fit``.

     Both estimators are built with ``random_state=0`` and trained on the
     same ``(X, y)`` pair unpacked from the fixture.
     """
     X, y = single_chunk_classification

     partial_reg = nn.ParitalMLPRegressor(random_state=0)
     reference_reg = nn_.MLPRegressor(random_state=0)

     partial_reg.fit(X, y)
     reference_reg.partial_fit(X, y)

     assert_estimator_equal(partial_reg, reference_reg)
Ejemplo n.º 6
0
    def test_basic(self):
        """Fit ``dpp`` and ``spp`` ``LabelEncoder`` on ``y`` and compare them.

        The ``spp`` encoder is fitted on ``y.compute()``.
        """
        dpp_encoder = dpp.LabelEncoder()
        spp_encoder = spp.LabelEncoder()

        dpp_encoder.fit(y)
        spp_encoder.fit(y.compute())

        assert_estimator_equal(dpp_encoder, spp_encoder)
Ejemplo n.º 7
0
def test_hashing_vectorizer(container):
    """Compare the dask-ml and scikit-learn ``HashingVectorizer`` outputs.

    ``JUNK_FOOD_DOCS`` is loaded into a two-partition bag.  When
    ``container`` is ``"series"`` the bag is converted to the ``"text"``
    column of a dataframe; when it is ``"array"`` the ``.values`` of that
    column are used; any other value leaves the bag untouched.
    """
    docs = db.from_sequence(JUNK_FOOD_DOCS, npartitions=2)
    if container in ("series", "array"):
        docs = docs.to_dataframe(columns=["text"])["text"]
        if container == "array":
            docs = docs.values

    reference = sklearn.feature_extraction.text.HashingVectorizer()
    vectorizer = dask_ml.feature_extraction.text.HashingVectorizer()

    X_ref = reference.fit_transform(docs.compute())
    X_da = vectorizer.fit_transform(docs)

    assert_estimator_equal(reference, vectorizer)

    # The transformed output is a dask array whose first block is CSR.
    assert isinstance(X_da, da.Array)
    first_block = X_da.blocks[0].compute()
    assert isinstance(first_block, scipy.sparse.csr_matrix)

    densified = X_da.map_blocks(lambda blk: blk.toarray(), dtype=X_da.dtype)
    reference_dense = X_ref.toarray()
    # TODO: use dask.utils.assert_eq
    # Currently this fails chk_dask, as we end up with an integer key in the
    # dask graph.

    np.testing.assert_array_equal(densified, reference_dense)
Ejemplo n.º 8
0
    def test_basic(self, Xl_blobs_easy):
        """Compare ``DKKMeans`` and ``SKKMeans`` fitted on the same ``X``.

        Both estimators use ``n_clusters=3, random_state=0``.  ``n_iter_``,
        ``inertia_``, ``cluster_centers_`` and ``labels_`` are excluded from
        the generic estimator comparison and are checked individually below,
        after the cluster ordering of the two fits has been aligned.
        """
        X, _ = Xl_blobs_easy

        # make it super easy to cluster
        a = DKKMeans(n_clusters=3, random_state=0)
        b = SKKMeans(n_clusters=3, random_state=0)
        a.fit(X)
        b.fit(X)
        assert_estimator_equal(
            a,
            b,
            exclude=["n_iter_", "inertia_", "cluster_centers_", "labels_"])
        # Inertia only has to agree to within an absolute tolerance of 0.01.
        assert abs(a.inertia_ - b.inertia_) < 0.01
        # order is arbitrary, so align first
        # argsort along axis 0, first column only: the row order that sorts
        # each estimator's centers by their first coordinate.
        a_order = np.argsort(a.cluster_centers_, 0)[:, 0]
        b_order = np.argsort(b.cluster_centers_, 0)[:, 0]
        a_centers = a.cluster_centers_[a_order]
        b_centers = b.cluster_centers_[b_order]
        np.testing.assert_allclose(a_centers, b_centers, rtol=1e-3)
        # NOTE(review): `replace` is defined outside this block; presumably it
        # maps label values [0, 1, 2] in b.labels_ to a's numbering -- confirm.
        b_labels = replace(b.labels_, [0, 1, 2],
                           a_order[b_order]).astype(b.labels_.dtype)
        assert_eq(a.labels_.compute(), b_labels)
        # Only checks that n_iter_ is truthy, not that it matches b.n_iter_.
        assert a.n_iter_
        # this is hacky
        # Overwrite both estimators' centers with the sorted copies so the
        # transform/predict calls below run against identically ordered
        # centers.
        b.cluster_centers_ = b_centers
        a.cluster_centers_ = a_centers
        assert_eq(a.transform(X), b.transform(X), rtol=1e-3)

        yhat_a = a.predict(X)
        yhat_b = b.predict(X)
        assert_eq(yhat_a.compute(), yhat_b)
Ejemplo n.º 9
0
 def test_basic(self, single_chunk_regression):
     """Compare ``lm.PartialPassiveAggressiveRegressor`` with ``lm_.PassiveAggressiveRegressor``.

     Both share ``random_state=0, max_iter=100, tol=1e-3``.  The reference
     estimator is trained via ``partial_fit`` on the result of
     ``dask.compute(X, y)``.  ``loss_function_`` is not compared.
     """
     X, y = single_chunk_regression

     partial_reg = lm.PartialPassiveAggressiveRegressor(
         random_state=0, max_iter=100, tol=1e-3
     )
     reference_reg = lm_.PassiveAggressiveRegressor(
         random_state=0, max_iter=100, tol=1e-3
     )

     partial_reg.fit(X, y)
     reference_reg.partial_fit(*dask.compute(X, y))

     assert_estimator_equal(
         partial_reg, reference_reg, exclude=["loss_function_"]
     )
Ejemplo n.º 10
0
def test_predict(kind):
    """Check ``ParallelPostFit`` predictions against the bare estimator.

    ``kind`` selects the input container: ``"numpy"`` computes ``X`` and
    ``y`` up front, ``"dask.dataframe"`` converts both with
    ``dd.from_dask_array``, and any other value leaves them as produced by
    ``make_classification(chunks=100)``.  Both estimators are fitted on
    computed data; ``predict``, ``predict_proba`` and ``predict_log_proba``
    are then compared on ``X``.
    """
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X, y = dd.from_dask_array(X), dd.from_dask_array(y)

    base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    wrap = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    )

    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    # Same check for each prediction method, wrapped result first.
    for method_name in ("predict", "predict_proba", "predict_log_proba"):
        result = getattr(wrap, method_name)(X)
        expected = getattr(base, method_name)(X)
        assert_eq_ar(result, expected)
Ejemplo n.º 11
0
 def test_basic(self, single_chunk_blobs):
     """Compare ``cluster.PartialMiniBatchKMeans`` with ``cluster_.MiniBatchKMeans``.

     Both use ``n_clusters=3, random_state=0`` and only the first element of
     the fixture is used for training.  ``random_state_`` is not compared.
     """
     X, _ = single_chunk_blobs

     partial_km = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
     reference_km = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)

     partial_km.fit(X)
     reference_km.partial_fit(X)

     assert_estimator_equal(partial_km, reference_km, exclude=['random_state_'])
Ejemplo n.º 12
0
    def test_basic(self):
        """Fit ``dpp`` and ``spp`` ``StandardScaler`` and compare fitted state.

        ``n_samples_seen_`` is left out of the comparison.
        """
        dpp_scaler = dpp.StandardScaler()
        spp_scaler = spp.StandardScaler()

        dpp_scaler.fit(X)
        spp_scaler.fit(X.compute())

        assert_estimator_equal(
            dpp_scaler,
            spp_scaler,
            exclude="n_samples_seen_",
        )
Ejemplo n.º 13
0
def test_incremental_basic(scheduler, xy_classification):
    """Check ``Incremental`` against manual chunk-by-chunk ``partial_fit``.

    An ``SGDClassifier`` wrapped in ``Incremental`` is fitted on ``(X, y)``,
    while a clone of it is fed each chunk of ``X`` (as given by
    ``da.core.slices_from_chunks``) through ``partial_fit``.  Coefficients,
    fitted attributes (minus ``loss_function_``), predictions and score are
    then compared.  Finally a fresh ``Incremental`` trained through
    ``partial_fit`` is compared with the same manually trained clone.
    """
    X, y = xy_classification
    with scheduler() as (s, [a, b]):
        wrapped_est = SGDClassifier(random_state=0, tol=1e-3)
        manual_est = clone(wrapped_est)

        inc = Incremental(wrapped_est)
        fitted = inc.fit(X, y, classes=[0, 1])
        for chunk_slice in da.core.slices_from_chunks(X.chunks):
            row_slice = chunk_slice[0]
            manual_est.partial_fit(X[chunk_slice], y[row_slice], classes=[0, 1])

        # fit returns the wrapper itself
        assert fitted is inc

        assert isinstance(fitted.estimator.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(
            fitted.estimator.coef_, manual_est.coef_
        )

        assert_estimator_equal(
            inc.estimator, manual_est, exclude=['loss_function_']
        )

        # predict
        predicted = inc.predict(X)
        expected_predicted = manual_est.predict(X)
        assert isinstance(predicted, da.Array)
        assert_eq(predicted, expected_predicted)

        # score (the result type is not asserted here)
        scored = inc.score(X, y)
        expected_score = manual_est.score(X, y)
        assert_eq(scored, expected_score)

        # partial_fit on a fresh wrapper
        inc_partial = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        inc_partial.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(
            inc_partial.estimator, manual_est, exclude=['loss_function_']
        )
Ejemplo n.º 14
0
def test_basic_dataframe(sparse, method, dask_data, dtype):
    """Compare dask-ml and scikit-learn ``OneHotEncoder`` on dataframe input.

    With ``method == "fit"`` the encoders are fitted and then asked to
    transform; otherwise ``fit_transform`` is used.  The scikit-learn encoder
    works on ``df``, the dask-ml one on ``dask_data``.  The result must have
    the same type as ``dask_data``, one column per expected output column,
    and a uniform dtype (a ``pd.SparseDtype`` when ``sparse`` is set and
    pandas is at least 0.24.0).
    """
    sk_encoder = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
    dask_encoder = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)

    if method == "fit":
        sk_encoder.fit(df)
        dask_encoder.fit(dask_data)
        expected = sk_encoder.transform(df)
        result = dask_encoder.transform(dask_data)
    else:
        expected = sk_encoder.fit_transform(df)
        result = dask_encoder.fit_transform(dask_data)

    skipped_attributes = {
        "n_values_",
        "feature_indices_",
        "active_features_",
        "dtypes_",
        "drop_idx_",
    }
    assert_estimator_equal(sk_encoder, dask_encoder, exclude=skipped_attributes)

    assert isinstance(result, type(dask_data))
    assert len(result.columns) == expected.shape[1]

    expected_dtype = dtype
    if sparse and PANDAS_VERSION >= packaging.version.parse("0.24.0"):
        # pandas sparse ExtensionDtype interface
        expected_dtype = pd.SparseDtype(dtype, dtype(0))
    assert (result.dtypes == expected_dtype).all()

    da.utils.assert_eq(result.values, expected)
Ejemplo n.º 15
0
def test_basic_array(sparse, method, categories):
    """Compare dask-ml and scikit-learn ``OneHotEncoder`` on array input.

    The scikit-learn encoder works on ``X``, the dask-ml one on ``dX``.  With
    ``method == "fit"`` the encoders are fitted and then transform;
    otherwise ``fit_transform`` is used.  The dask result must be a
    ``da.Array`` with the expected shape and dtype; when ``sparse`` is set
    its first block must be a scipy sparse matrix, and blocks are densified
    before the value comparison.
    """
    sk_encoder = sklearn.preprocessing.OneHotEncoder(
        categories=categories, sparse=sparse
    )
    dask_encoder = dask_ml.preprocessing.OneHotEncoder(
        categories=categories, sparse=sparse
    )

    if method == "fit":
        sk_encoder.fit(X)
        dask_encoder.fit(dX)
        expected = sk_encoder.transform(X)
        result = dask_encoder.transform(dX)
    else:
        expected = sk_encoder.fit_transform(X)
        result = dask_encoder.fit_transform(dX)

    assert_estimator_equal(
        sk_encoder,
        dask_encoder,
        exclude={"n_values_", "feature_indices_", "active_features_", "dtypes_"},
    )

    assert isinstance(result, da.Array)

    # can't use assert_eq since we're apparently making bad graphs
    # See TODO in `transform`.
    assert result.shape == expected.shape
    assert result.dtype == expected.dtype

    if sparse:
        assert scipy.sparse.issparse(result.blocks[0].compute())
        result = result.map_blocks(lambda blk: blk.toarray(), dtype="f8")

    # Compare concrete values only, after computing the dask array.
    computed = result.compute()
    da.utils.assert_eq(computed, expected)
Ejemplo n.º 16
0
 def test_basic(self, single_chunk_classification):
     """Compare ``PartialPerceptron.fit`` with ``Perceptron.partial_fit``.

     Both use ``max_iter=1000, tol=1e-3`` and classes ``[0, 1]``.  Only the
     ``coef_`` attributes of the two estimators are handed to the comparison
     helper.
     """
     X, y = single_chunk_classification

     partial_clf = PartialPerceptron(classes=[0, 1], max_iter=1000, tol=1e-3)
     reference_clf = Perceptron(max_iter=1000, tol=1e-3)

     partial_clf.fit(X, y)
     reference_clf.partial_fit(X, y, classes=[0, 1])

     # NOTE(review): assert_estimator_equal is given the coef_ values rather
     # than the estimators themselves -- confirm the helper actually compares
     # the contents in that case.
     assert_estimator_equal(partial_clf.coef_, reference_clf.coef_)
Ejemplo n.º 17
0
def test_basic_dataframe(sparse, method, dask_data, dtype):
    """Compare dask-ml and scikit-learn ``OneHotEncoder`` on dataframe input.

    With ``method == "fit"`` the encoders are fitted and then asked to
    transform; otherwise ``fit_transform`` is used.  The scikit-learn encoder
    works on ``df``, the dask-ml one on ``dask_data``.  The result must have
    the same type as ``dask_data``, one column per expected output column,
    and a uniform dtype (a ``pd.SparseDtype`` when ``sparse`` is set).
    """
    sk_encoder = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
    dask_encoder = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)

    if method == "fit":
        sk_encoder.fit(df)
        dask_encoder.fit(dask_data)
        expected = sk_encoder.transform(df)
        result = dask_encoder.transform(dask_data)
    else:
        expected = sk_encoder.fit_transform(df)
        result = dask_encoder.fit_transform(dask_data)

    skipped_attributes = {
        "n_values_",
        "feature_indices_",
        "active_features_",
        "dtypes_",
    }
    assert_estimator_equal(sk_encoder, dask_encoder, exclude=skipped_attributes)

    assert isinstance(result, type(dask_data))
    assert len(result.columns) == expected.shape[1]

    expected_dtype = dtype
    if sparse:
        expected_dtype = pd.SparseDtype(dtype, dtype(0))
    assert (result.dtypes == expected_dtype).all()

    da.utils.assert_eq(result.values, expected)
Ejemplo n.º 18
0
def test_fit(data):
    """Fit scikit-learn's ``SimpleImputer`` on ``X`` and dask-ml's on ``data``.

    The two fitted imputers are then compared with ``assert_estimator_equal``.
    """
    sk_imputer = sklearn.impute.SimpleImputer()
    dask_imputer = dask_ml.impute.SimpleImputer()

    sk_imputer.fit(X)
    dask_imputer.fit(data)

    assert_estimator_equal(sk_imputer, dask_imputer)
Ejemplo n.º 19
0
    def test_basic(self):
        """Fit ``dpp`` and ``spp`` ``LabelEncoder`` on ``y`` and compare them.

        The ``spp`` encoder is fitted on ``y.compute()``; ``dtype_`` is left
        out of the comparison.
        """
        dpp_encoder = dpp.LabelEncoder()
        spp_encoder = spp.LabelEncoder()

        dpp_encoder.fit(y)
        spp_encoder.fit(y.compute())

        assert_estimator_equal(dpp_encoder, spp_encoder, exclude={"dtype_"})
Ejemplo n.º 20
0
    def test_basic(self, single_chunk_regression):
        """Compare ``lm.PartialSGDRegressor.fit`` with ``lm_.SGDRegressor.partial_fit``.

        Both estimators share ``random_state=0, max_iter=1000, tol=1e-3`` and
        are trained on the same ``(X, y)`` pair from the fixture.
        """
        X, y = single_chunk_regression
        shared_params = dict(random_state=0, max_iter=1000, tol=1e-3)

        partial_reg = lm.PartialSGDRegressor(**shared_params)
        reference_reg = lm_.SGDRegressor(**shared_params)

        partial_reg.fit(X, y)
        reference_reg.partial_fit(X, y)

        assert_estimator_equal(partial_reg, reference_reg)
Ejemplo n.º 21
0
    def test_array_transform(self):
        """Check ``fit_transform`` of ``dpp`` vs ``spp`` ``PolynomialFeatures``.

        The ``dpp`` output must be a dask collection and must equal the
        ``spp`` output computed from ``X.compute()``.
        """
        dpp_poly = dpp.PolynomialFeatures()
        spp_poly = spp.PolynomialFeatures()

        dpp_out = dpp_poly.fit_transform(X)
        spp_out = spp_poly.fit_transform(X.compute())

        assert_estimator_equal(dpp_poly, spp_poly)
        assert dask.is_dask_collection(dpp_out)
        assert_eq_ar(dpp_out, spp_out)
Ejemplo n.º 22
0
def test_basic():
    """Fit ``dd.PCA`` on ``dX`` and ``sd.PCA`` on ``X`` and compare them.

    ``components_`` is excluded from the generic comparison and compared
    separately after both sides go through ``flip_vector_signs(..., 1)``.
    """
    dd_pca = dd.PCA()
    sd_pca = sd.PCA()

    dd_pca.fit(dX)
    sd_pca.fit(X)

    assert_estimator_equal(dd_pca, sd_pca, exclude=["components_"])

    dd_components = flip_vector_signs(dd_pca.components_, 1)
    sd_components = flip_vector_signs(sd_pca.components_, 1)
    np.testing.assert_allclose(dd_components, sd_components)
Ejemplo n.º 23
0
def test_transforms_other():
    """Run both ``HashingVectorizer`` implementations on ``JUNK_FOOD_DOCS``.

    The same input object is passed to the scikit-learn and the dask-ml
    vectorizer; the fitted estimators and the densified outputs must match.
    """
    sk_vectorizer = sklearn.feature_extraction.text.HashingVectorizer()
    dask_vectorizer = dask_ml.feature_extraction.text.HashingVectorizer()

    sk_out = sk_vectorizer.fit_transform(JUNK_FOOD_DOCS)
    dask_out = dask_vectorizer.fit_transform(JUNK_FOOD_DOCS)

    assert_estimator_equal(sk_vectorizer, dask_vectorizer)
    np.testing.assert_array_equal(sk_out.toarray(), dask_out.toarray())
Ejemplo n.º 24
0
    def test_input_types(self):
        """Fit ``PolynomialFeatures`` on several representations of ``df``.

        The inputs used are ``df``, ``df.values``, ``df.compute()`` and
        ``df.compute().values``.
        """
        dpp_poly = dpp.PolynomialFeatures()
        spp_poly = spp.PolynomialFeatures()

        # NOTE(review): the first three checks pass the results of two fit
        # calls on the same object; if fit returns that object (usual
        # estimator convention -- confirm) they compare it with itself.
        assert_estimator_equal(
            dpp_poly.fit(df),
            dpp_poly.fit(df.compute()),
        )
        assert_estimator_equal(
            dpp_poly.fit(df),
            dpp_poly.fit(df.compute().values),
        )
        assert_estimator_equal(
            dpp_poly.fit(df.values),
            dpp_poly.fit(df.compute().values),
        )

        # dpp estimator versus spp estimator
        assert_estimator_equal(
            dpp_poly.fit(df),
            spp_poly.fit(df.compute()),
        )
        assert_estimator_equal(
            dpp_poly.fit(df),
            spp_poly.fit(df.compute().values),
        )
Ejemplo n.º 25
0
def test_fit(data):
    """Fit scikit-learn's ``SimpleImputer`` on ``X`` and dask-ml's on ``data``.

    ``statistics_`` is excluded from the generic comparison and checked
    separately with an almost-equal comparison after ``np.asarray`` is
    applied to the dask-ml value.
    """
    sk_imputer = sklearn.impute.SimpleImputer()
    dask_imputer = dask_ml.impute.SimpleImputer()

    sk_imputer.fit(X)
    dask_imputer.fit(data)

    assert_estimator_equal(sk_imputer, dask_imputer, exclude=["statistics_"])

    dask_statistics = np.asarray(dask_imputer.statistics_)
    np.testing.assert_array_almost_equal(sk_imputer.statistics_, dask_statistics)
Ejemplo n.º 26
0
    def test_fit(self):
        """Fit ``dpp`` and ``spp`` ``RobustScaler`` and compare with ``rtol=0.2``.

        The data comes from ``make_classification(n_samples=1000,
        chunks=200, random_state=0)`` and is shifted by 3.
        """
        dpp_scaler = dpp.RobustScaler()
        spp_scaler = spp.RobustScaler()

        # bigger data to make percentile more reliable
        # and not centered around 0 to make rtol work
        features, _ = make_classification(
            n_samples=1000, chunks=200, random_state=0
        )
        shifted = features + 3

        dpp_scaler.fit(shifted)
        spp_scaler.fit(shifted.compute())

        assert_estimator_equal(dpp_scaler, spp_scaler, rtol=0.2)
Ejemplo n.º 27
0
    def test_basic(self, single_chunk_classification):
        """Compare ``lm.PartialSGDClassifier.fit`` with ``lm_.SGDClassifier.partial_fit``.

        Both share ``random_state=0, max_iter=1000, tol=1e-3`` and classes
        ``[0, 1]``.  The reference estimator is trained on the result of
        ``dask.compute(X, y)``.
        """
        X, y = single_chunk_classification

        partial_clf = lm.PartialSGDClassifier(
            classes=[0, 1], random_state=0, max_iter=1000, tol=1e-3
        )
        reference_clf = lm_.SGDClassifier(random_state=0, max_iter=1000, tol=1e-3)

        partial_clf.fit(X, y)
        reference_clf.partial_fit(*dask.compute(X, y), classes=[0, 1])

        # `exclude` is not defined in this method; it is resolved from an
        # enclosing scope.
        assert_estimator_equal(partial_clf, reference_clf, exclude=exclude)
Ejemplo n.º 28
0
 def test_basic(self, single_chunk_classification):
     """Compare ``lm.PartialPassiveAggressiveClassifier`` with ``lm_.PassiveAggressiveClassifier``.

     Both share ``random_state=0, max_iter=100, tol=1e-3`` and classes
     ``[0, 1]``.  ``loss_function_`` is not compared.
     """
     X, y = single_chunk_classification

     partial_clf = lm.PartialPassiveAggressiveClassifier(
         classes=[0, 1], random_state=0, max_iter=100, tol=1e-3
     )
     reference_clf = lm_.PassiveAggressiveClassifier(
         random_state=0, max_iter=100, tol=1e-3
     )

     partial_clf.fit(X, y)
     reference_clf.partial_fit(X, y, classes=[0, 1])

     assert_estimator_equal(
         partial_clf, reference_clf, exclude=['loss_function_']
     )
Ejemplo n.º 29
0
def test_fit_convert(data):
    """Check ``SimpleImputer.fit_transform`` on string-converted object input.

    Both ``X`` and ``data`` are cast with ``.astype(str).astype(object)``
    before being passed to the scikit-learn and dask-ml imputers.  The
    dask-ml result must have the same type as ``data``; for pandas or dask
    dataframes its ``.values`` are used in the final comparison.
    """
    sk_imputer = sklearn.impute.SimpleImputer()
    dask_imputer = dask_ml.impute.SimpleImputer()

    expected = sk_imputer.fit_transform(X.astype(str).astype(object))
    result = dask_imputer.fit_transform(data.astype(str).astype(object))

    assert_estimator_equal(sk_imputer, dask_imputer)
    assert isinstance(result, type(data))

    is_frame = isinstance(data, (pd.DataFrame, dd.DataFrame))
    comparable = result.values if is_frame else result

    da.utils.assert_eq(comparable, expected)
Ejemplo n.º 30
0
def test_fit_constant(data):
    """Check ``SimpleImputer`` with ``strategy="constant"`` and ``fill_value=-999.0``.

    The scikit-learn imputer transforms ``X`` and the dask-ml one transforms
    ``data``.  The dask-ml result must have the same type as ``data``; for
    pandas or dask dataframes its ``.values`` are used in the final
    comparison.
    """
    imputer_kwargs = {"strategy": "constant", "fill_value": -999.0}
    sk_imputer = sklearn.impute.SimpleImputer(**imputer_kwargs)
    dask_imputer = dask_ml.impute.SimpleImputer(**imputer_kwargs)

    expected = sk_imputer.fit_transform(X)
    result = dask_imputer.fit_transform(data)

    assert_estimator_equal(sk_imputer, dask_imputer)
    assert isinstance(result, type(data))

    is_frame = isinstance(data, (pd.DataFrame, dd.DataFrame))
    comparable = result.values if is_frame else result

    da.utils.assert_eq(comparable, expected)