def test_basic(self, daskify, values):
    """Encode columns ``A`` and ``D`` of ``dummy`` and round-trip them.

    The transform output is compared against a frame holding ``D`` followed
    by the ``uint8`` indicator columns ``A_a``, ``A_b`` and ``A_c``; the
    result of ``inverse_transform`` must then equal the input frame.

    Parameters
    ----------
    daskify
        When truthy, the input is wrapped with ``dd.from_pandas(df, 2)`` and
        both sides are computed before the final comparison.
    values
        When truthy, ``inverse_transform`` receives ``trn.values`` instead
        of the transformed frame itself.
    """
    frame = dummy[["A", "D"]]
    if daskify:
        frame = dd.from_pandas(frame, 2)

    encoder = dpp.DummyEncoder().fit(frame)
    encoded = encoder.transform(frame)

    # One row of 0/1 flags per indicator column, in output column order.
    indicator_bits = {
        "A_a": [1, 0, 0, 1],
        "A_b": [0, 1, 0, 0],
        "A_c": [0, 0, 1, 0],
    }
    expected_data = {"D": np.array([1, 2, 3, 4])}
    for column, bits in indicator_bits.items():
        expected_data[column] = np.array(bits, dtype="uint8")
    expected = pd.DataFrame(expected_data, columns=["D", "A_a", "A_b", "A_c"])

    assert_eq(encoded, expected)

    decoder_input = encoded.values if values else encoded
    restored = encoder.inverse_transform(decoder_input)

    if daskify:
        frame, restored = frame.compute(), restored.compute()

    tm.assert_frame_equal(restored, frame)
def test_encode_subset_of_columns(self, daskify):
    """Encode only column ``B`` while column ``A`` is passed through.

    The encoder is built with ``columns=["B"]`` and fitted on
    ``dummy[["A", "B"]]``. The expected transform output keeps ``A`` as an
    ordered categorical and adds ``uint8`` indicators ``B_a``, ``B_b`` and
    ``B_c``; ``inverse_transform`` must then reproduce the input frame.

    Parameters
    ----------
    daskify
        When truthy, the input is wrapped with ``dd.from_pandas(df, 2)`` and
        both sides are computed before the final comparison.
    """
    frame = dummy[["A", "B"]]
    if daskify:
        frame = dd.from_pandas(frame, 2)

    encoder = dpp.DummyEncoder(columns=["B"]).fit(frame)
    encoded = encoder.transform(frame)

    # One row of 0/1 flags per indicator column, in output column order.
    indicator_bits = {
        "B_a": [1, 0, 0, 1],
        "B_b": [0, 1, 0, 0],
        "B_c": [0, 0, 1, 0],
    }
    expected_data = {"A": pd.Categorical(["a", "b", "c", "a"], ordered=True)}
    for column, bits in indicator_bits.items():
        expected_data[column] = np.array(bits, dtype="uint8")
    expected = pd.DataFrame(expected_data, columns=["A", "B_a", "B_b", "B_c"])

    assert_eq_df(encoded, expected)

    restored = encoder.inverse_transform(encoded)

    if daskify:
        frame, restored = frame.compute(), restored.compute()

    tm.assert_frame_equal(restored, frame)
def test_inverse_transform(self):
    """Check ``inverse_transform`` on both frame and ``.values`` input.

    A two-partition dask DataFrame with an integer column ``A`` and a
    categorical column ``B`` is encoded. The original frame must be
    recovered whether ``inverse_transform`` is handed the transformed frame
    or its ``.values``.
    """
    source = pd.DataFrame(
        {
            "A": np.arange(10),
            "B": pd.Categorical(["a"] * 4 + ["b"] * 6),
        }
    )
    ddf = dd.from_pandas(source, npartitions=2)

    encoder = dpp.DummyEncoder()
    encoder.fit(ddf)

    # First pass feeds the transformed frame, second pass its ``.values``.
    for use_values in (False, True):
        encoded = encoder.transform(ddf)
        if use_values:
            encoded = encoded.values
        assert_eq_df(ddf, encoder.inverse_transform(encoded))
def test_drop_first(self, daskify):
    """Round-trip ``dummy`` through an encoder built with ``drop_first=True``.

    The transformed output is expected to have exactly 8 columns, and
    ``inverse_transform`` must give back a frame equal to ``dummy``.

    Parameters
    ----------
    daskify
        When truthy, ``dummy`` is wrapped with ``dd.from_pandas(dummy, 2)``
        and the result is computed before the final comparison.
    """
    frame = dd.from_pandas(dummy, 2) if daskify else dummy

    encoder = dpp.DummyEncoder(drop_first=True)
    encoded = encoder.fit_transform(frame)
    assert len(encoded.columns) == 8

    restored = encoder.inverse_transform(encoded)
    if daskify:
        # The input is computed alongside the result, as before; only the
        # result is used below because the comparison is against ``dummy``.
        restored, _ = compute(restored, frame)

    tm.assert_frame_equal(restored, dummy)
def test_transform_raises(self):
    """Transforming a frame that lacks a fitted column raises ``ValueError``.

    The encoder is fitted on ``dummy`` and then asked to transform ``dummy``
    with column ``B`` dropped; the error message must match the
    column-mismatch text.
    """
    encoder = dpp.DummyEncoder()
    encoder.fit(dummy)

    expected_message = "Columns of 'X' do not match the training"
    # ``match=`` applies the same regex search to the error text that
    # ``ExceptionInfo.match`` does.
    with pytest.raises(ValueError, match=expected_message):
        encoder.transform(dummy.drop("B", axis="columns"))
def test_da(self):
    """``fit_transform`` on a dask DataFrame returns a dask DataFrame."""
    ddf = dd.from_pandas(dummy, npartitions=2)
    encoded = dpp.DummyEncoder().fit_transform(ddf)
    assert isinstance(encoded, dd.DataFrame)
def test_transform_explicit_columns(self):
    """A missing column raises ``ValueError`` with explicit ``columns`` too.

    The encoder is built with ``columns=["A", "B", "C"]``, fitted on
    ``dummy``, and then asked to transform ``dummy`` with column ``B``
    dropped; the error message must match the column-mismatch text.
    """
    encoder = dpp.DummyEncoder(columns=["A", "B", "C"])
    encoder.fit(dummy)

    expected_message = "Columns of 'X' do not match the training"
    # ``match=`` applies the same regex search to the error text that
    # ``ExceptionInfo.match`` does.
    with pytest.raises(ValueError, match=expected_message):
        encoder.transform(dummy.drop("B", axis="columns"))