Example #1
0
def test_handle_zeros_in_scale():
    """Zeros in a scale vector are replaced by 1 across all container types."""
    dask_series_result = handle_zeros_in_scale(s)
    dask_array_result = handle_zeros_in_scale(a)

    assert list(dask_series_result.compute()) == [1, 1, 2, 3, 1]
    assert list(dask_array_result.compute()) == [1, 1, 2, 3, 1]

    # plain numpy array
    values = np.array([1, 2, 3, 0], dtype="f8")
    want = np.array([1, 2, 3, 1], dtype="f8")
    np.testing.assert_array_equal(handle_zeros_in_scale(values), want)

    # pandas Series
    values = pd.Series(values)
    want = pd.Series(want)
    tm.assert_series_equal(handle_zeros_in_scale(values), want)

    # dask array
    values = da.from_array(values.values, chunks=2)
    want = want.values
    assert_eq_ar(handle_zeros_in_scale(values), want)

    # dask series
    values = dd.from_dask_array(values)
    want = pd.Series(want)
    assert_eq_df(handle_zeros_in_scale(values), want)
Example #2
0
    def test_basic(self, daskify, values):
        """OrdinalEncoder round-trips fit/transform/inverse_transform."""
        encoder = dpp.OrdinalEncoder()
        frame = dummy[["A", "D"]]
        if daskify:
            frame = dd.from_pandas(frame, 2)
        encoder = encoder.fit(frame)
        transformed = encoder.transform(frame)

        expected = pd.DataFrame(
            {
                "A": np.array([0, 1, 2, 0], dtype="int8"),
                "D": np.array([1, 2, 3, 4]),
            },
            columns=["A", "D"],
        )
        assert_eq_df(transformed, expected)

        # optionally round-trip through the raw array representation
        if values:
            transformed = transformed.values

        roundtrip = encoder.inverse_transform(transformed)

        if daskify:
            frame = frame.compute()
            roundtrip = roundtrip.compute()

        tm.assert_frame_equal(roundtrip, frame)
Example #3
0
def test_to_dataframe_optimize_graph():
    """Bag.to_dataframe fuses map layers by default; optimize_graph=False keeps them."""
    pytest.importorskip("dask.dataframe")
    from dask.dataframe.utils import assert_eq as assert_eq_df

    bag = db.from_sequence(
        [{"name": "test1", "v1": 1}, {"name": "test2", "v1": 2}], npartitions=2
    )

    # Three chained map layers; linear maps are candidates for fusion.
    with dask.annotate(foo=True):
        mapped = bag.map(lambda a: dict(**a, v2=a["v1"] + 1))
        mapped = mapped.map(lambda a: dict(**a, v3=a["v2"] + 1))
        mapped = mapped.map(lambda a: dict(**a, v4=a["v3"] + 1))

    # Nothing has been fused before conversion.
    assert len(mapped.dask) == mapped.npartitions * 4

    # Default path optimizes the graph, fusing all the map tasks.
    fused = mapped.to_dataframe()
    assert len(fused.dask) < len(mapped.dask)

    # Opting out of optimization keeps every original task,
    # plus one extra layer converting to DataFrame.
    unfused = mapped.to_dataframe(optimize_graph=False)
    assert len(unfused.dask) == len(mapped.dask) + fused.npartitions

    # Annotations survive when the graph is not optimized.
    assert hlg_layer_topological(unfused.dask, 1).annotations == {"foo": True}

    assert_eq_df(fused, unfused)
Example #4
0
    def test_encode_subset_of_columns(self, daskify):
        """DummyEncoder with columns=['B'] encodes only B and round-trips."""
        encoder = dpp.DummyEncoder(columns=["B"])
        frame = dummy[["A", "B"]]
        if daskify:
            frame = dd.from_pandas(frame, 2)
        encoder = encoder.fit(frame)
        transformed = encoder.transform(frame)

        expected = pd.DataFrame(
            {
                "A": pd.Categorical(["a", "b", "c", "a"], ordered=True),
                "B_a": np.array([1, 0, 0, 1], dtype="uint8"),
                "B_b": np.array([0, 1, 0, 0], dtype="uint8"),
                "B_c": np.array([0, 0, 1, 0], dtype="uint8"),
            },
            columns=["A", "B_a", "B_b", "B_c"],
        )
        assert_eq_df(transformed, expected)

        roundtrip = encoder.inverse_transform(transformed)

        if daskify:
            frame = frame.compute()
            roundtrip = roundtrip.compute()

        tm.assert_frame_equal(roundtrip, frame)
Example #5
0
def test_slice_columns():
    """slice_columns selects columns on frames; the test expects arrays unchanged."""
    wanted = [2, 3]
    sliced_frame = slice_columns(df, wanted)
    sliced_array = slice_columns(X, wanted)

    assert list(sliced_frame.columns) == wanted
    assert_eq_df(df[wanted].compute(), sliced_frame.compute())
    # NOTE(review): the assertion expects the array input back unchanged —
    # confirm slice_columns is a no-op for array inputs.
    assert_eq_ar(X.compute(), sliced_array.compute())
Example #6
0
 def test_df_transform_index(self, daskify):
     """PolynomialFeatures(degree=1) keeps output aligned with a shuffled index."""
     frame = copy(df)
     if not daskify:
         frame = frame.compute()
     # shuffle the rows so index alignment is actually exercised
     frame = frame.sample(frac=1.0)
     transformer = dpp.PolynomialFeatures(preserve_dataframe=True, degree=1)
     out = transformer.fit_transform(frame)
     # the first output column is dropped before comparing to the input
     # (presumably the bias column — confirm against PolynomialFeatures defaults)
     assert_eq_df(out.iloc[:, 1:], frame, check_dtype=False)
Example #7
0
 def test_inverse_transform(self):
     """DummyEncoder inverse_transform undoes transform for frame and array inputs."""
     encoder = dpp.DummyEncoder()
     frame = dd.from_pandas(
         pd.DataFrame(
             {"A": np.arange(10), "B": pd.Categorical(["a"] * 4 + ["b"] * 6)}
         ),
         npartitions=2,
     )
     encoder.fit(frame)
     # round-trip through both the dataframe and the raw-array representation
     for encoded in (encoder.transform(frame), encoder.transform(frame).values):
         assert_eq_df(frame, encoder.inverse_transform(encoded))
Example #8
0
    def test_df_column_slice(self):
        """MinMaxScaler(columns=...) scales only the named columns."""
        selected = ["3", "4"]
        selected_pos = [selected.index(name) for name in selected]
        dask_scaler = dpp.MinMaxScaler(columns=selected)
        sk_scaler = spp.MinMaxScaler()

        scaled_frame = dask_scaler.fit_transform(df2).compute()
        scaled_array = sk_scaler.fit_transform(df2.compute())

        assert isinstance(scaled_frame, pd.DataFrame)
        # the selected columns match sklearn's scaling...
        assert_eq_ar(scaled_frame[selected].values, scaled_array[:, selected_pos])
        # ...and every other column passes through untouched
        assert_eq_df(
            scaled_frame.drop(selected, axis=1),
            df2.drop(selected, axis=1).compute(),
        )
Example #9
0
def _assert_eq(l, r, **kwargs):
    """Assert equality of *l* and *r*, dispatching on the type of *l*.

    Arrays are compared with ``assert_eq_ar``, frames with ``assert_eq_df``,
    sequences containing arrays/frames are compared element-wise (recursively),
    and anything else falls back to plain ``==``. Extra keyword arguments are
    forwarded to the underlying comparison helper.
    """
    array_types = (np.ndarray, da.Array)
    frame_types = (pd.core.generic.NDFrame, dd._Frame)
    if isinstance(l, array_types):
        assert_eq_ar(l, r, **kwargs)
    elif isinstance(l, frame_types):
        assert_eq_df(l, r, **kwargs)
    elif isinstance(l, Sequence) and any(
            isinstance(x, array_types + frame_types) for x in l):
        # zip() alone would silently ignore trailing elements when the two
        # sequences have different lengths, letting unequal inputs pass.
        assert len(l) == len(r)
        for a, b in zip(l, r):
            _assert_eq(a, b, **kwargs)
    else:
        assert l == r
Example #10
0
    def test_df_transform(self, daskify):
        """PolynomialFeatures output agrees across dask, dask-array and sklearn."""
        frame = df if daskify else df.compute()
        keep_frame = dpp.PolynomialFeatures(preserve_dataframe=True)
        as_array = dpp.PolynomialFeatures()
        reference = spp.PolynomialFeatures()

        res_frame = keep_frame.fit_transform(frame)
        res_array = as_array.fit_transform(frame)
        res_reference = reference.fit_transform(frame)

        if daskify:
            # results stay lazy and match the eager-pandas run
            res_pandas = keep_frame.fit_transform(frame.compute())
            assert dask.is_dask_collection(res_frame)
            assert dask.is_dask_collection(res_array)
            assert_eq_df(res_frame.compute().reset_index(drop=True), res_pandas)

        assert_eq_ar(res_frame.values, res_reference)
        assert_eq_ar(res_frame.values, res_array)
Example #11
0
 def test_inverse_transform(self):
     """OrdinalEncoder inverse_transform round-trips every output representation."""
     encoder = dpp.OrdinalEncoder()
     frame = dd.from_pandas(
         pd.DataFrame(
             {"A": np.arange(10), "B": pd.Categorical(['a'] * 4 + ['b'] * 6)}
         ),
         npartitions=2,
     )
     encoder.fit(frame)
     # lazy frame, computed frame, lazy array and computed array inputs
     assert_eq_df(frame, encoder.inverse_transform(encoder.transform(frame)))
     assert_eq_df(
         frame, encoder.inverse_transform(encoder.transform(frame).compute())
     )
     assert_eq_df(frame, encoder.inverse_transform(encoder.transform(frame).values))
     assert_eq_df(
         frame,
         encoder.inverse_transform(encoder.transform(frame).values.compute()),
     )
Example #12
0
    def test_inverse_transform(self):
        """OrdinalEncoder inverse_transform stays lazy and round-trips repeatedly."""
        encoder = dpp.OrdinalEncoder()
        frame = dd.from_pandas(
            pd.DataFrame(
                {"A": np.arange(10), "B": pd.Categorical(["a"] * 4 + ["b"] * 6)}
            ),
            npartitions=2,
        )
        encoder.fit(frame)

        # the inverse stays a lazy dask collection for array and frame inputs
        assert dask.is_dask_collection(
            encoder.inverse_transform(encoder.transform(frame).values)
        )
        assert dask.is_dask_collection(
            encoder.inverse_transform(encoder.transform(frame))
        )

        # each round-trip is exercised twice, as in the original test
        for _ in range(2):
            assert_eq_df(frame, encoder.inverse_transform(encoder.transform(frame)))
        for _ in range(2):
            assert_eq_df(
                frame, encoder.inverse_transform(encoder.transform(frame).values)
            )
Example #13
0
 def test_df_inverse_transform(self):
     """MinMaxScaler inverse_transform undoes fit_transform on the named columns."""
     scaler = dpp.MinMaxScaler(columns=["3", "4"])
     roundtrip = scaler.inverse_transform(scaler.fit_transform(df2))
     assert_eq_df(roundtrip.compute(), df2.compute())
Example #14
0
 def test_df_inverse_transform(self):
     """MinMaxScaler inverse_transform round-trips lazily on a dask frame."""
     mask = ["3", "4"]
     a = dpp.MinMaxScaler(columns=mask)
     result = a.inverse_transform(a.fit_transform(df2))
     # fix: was `is_dask_colelction` (typo), which raises AttributeError;
     # the correct name is used elsewhere in this file.
     assert dask.is_dask_collection(result)
     assert_eq_df(result, df2)