def test_labelencoder_unfitted(client):
    """
    Calling `.transform()` on an encoder that was never fitted must raise
    `NotFittedError`.
    """
    values = cudf.Series(np.random.choice(10, (10, )))
    n_parts = len(client.has_what())
    ddf = dask_cudf.from_cudf(values, npartitions=n_parts)

    encoder = LabelEncoder()
    with pytest.raises(NotFittedError):
        encoder.transform(ddf).compute()
def test_unfitted_inverse_transform(client):
    """
    Try calling `.inverse_transform()` without fitting first
    """
    tmp = cudf.Series(np.random.choice(10, (10, )))
    df = dask_cudf.from_cudf(tmp, npartitions=len(client.has_what()))
    le = LabelEncoder()

    # This test targets inverse_transform(); the unfitted transform() path
    # is already exercised by test_labelencoder_unfitted.
    with pytest.raises(NotFittedError):
        le.inverse_transform(df)
def test_inverse_transform(orig_label, ord_label, expected_reverted,
                           bad_ord_label, use_fit_transform, client):
    """
    Fit a LabelEncoder on `orig_label` (via `fit` or `fit_transform`), then
    check that `inverse_transform(ord_label)` matches `expected_reverted` and
    that `inverse_transform(bad_ord_label)` raises `ValueError`.
    """
    n_parts = len(client.has_what())

    def _distribute(series):
        # Spread a cudf object over one partition per entry in has_what().
        return dask_cudf.from_cudf(series, npartitions=n_parts)

    orig_label = _distribute(orig_label)
    ord_label = _distribute(ord_label)
    expected_reverted = _distribute(expected_reverted)
    bad_ord_label = _distribute(bad_ord_label)

    # Prepare the encoder through whichever fitting entry point was requested.
    encoder = LabelEncoder()
    if not use_fit_transform:
        encoder.fit(orig_label)
    else:
        encoder.fit_transform(orig_label)
    assert encoder._fitted is True

    # Check that inverse_transform gives back the expected labels.
    decoded = encoder.inverse_transform(ord_label).compute()
    decoded = decoded.reset_index(drop=True)
    expected = expected_reverted.compute()
    assert len(decoded) == len(expected)
    matching = decoded[decoded == expected]
    assert len(decoded) == len(matching)

    # Check that unknown ordinals correctly raise ValueError.
    with pytest.raises(ValueError,
                       match='y contains previously unseen label'):
        encoder.inverse_transform(bad_ord_label).compute()
def test_labelencoder_unseen(client):
    """
    Transforming values that were not present during fitting must raise
    `KeyError`.
    """
    seen = cudf.Series(np.random.choice(10, (10, )))
    train = dask_cudf.from_cudf(seen, npartitions=len(client.has_what()))
    encoder = LabelEncoder().fit(train)
    assert encoder._fitted

    with pytest.raises(KeyError):
        # Negative values cannot come out of np.random.choice(10, ...).
        unseen = dask_cudf.from_cudf(
            cudf.Series([-100, -120]),
            npartitions=len(client.has_what()),
        )
        encoder.transform(unseen).compute()
def test_labelencoder_transform(length, cardinality, client):
    """
    Fit on a random series of `length` values drawn from `cardinality`
    choices, encode that same series, and compare the input and the encoding
    through `_arr_to_similarity_mat`.
    """
    raw = cudf.Series(np.random.choice(cardinality, (length, )))
    ddf = dask_cudf.from_cudf(raw, npartitions=len(client.has_what()))
    encoder = LabelEncoder().fit(ddf)
    assert encoder._fitted

    encoded = encoder.transform(ddf)

    source_sim = _arr_to_similarity_mat(ddf.compute().to_numpy())
    encoded_host = cp.asnumpy(encoded.compute().to_numpy())
    encoded_sim = _arr_to_similarity_mat(encoded_host)

    # The symmetry pattern of both matrices must agree element-wise.
    # NOTE(review): presumably this checks that equal inputs share a code and
    # distinct inputs do not; confirm against _arr_to_similarity_mat.
    encoded_pattern = encoded_sim == encoded_sim.T
    source_pattern = source_sim == source_sim.T
    assert (encoded_pattern == source_pattern).all()
def test_masked_encode(client):
    """
    Encode `cat_col` once on the rows where `filter_col == 1` and once on the
    whole frame, then check both encodings agree on the filtered rows.
    """
    n_parts = len(client.has_what())
    frame = cudf.DataFrame({
        "filter_col": [1, 1, 2, 3, 1, 1, 1, 1, 6, 5],
        "cat_col": ['a', 'b', 'c', 'd', 'a', 'a', 'a', 'c', 'b', 'c']
    })
    ddf = dask_cudf.from_cudf(frame, npartitions=n_parts)

    # Path 1: filter first, then fit/encode only the surviving rows.
    masked = ddf[ddf["filter_col"] == 1]
    masked_codes = LabelEncoder().fit_transform(masked["cat_col"])
    masked = masked.assign(filter_encoded=masked_codes.values)

    # Path 2: fit/encode every row, then apply the same filter.
    full_codes = LabelEncoder().fit_transform(ddf["cat_col"])
    ddf = ddf.assign(encoded_filter=full_codes.values)
    ddf = ddf[ddf.filter_col == 1]

    agreement = ddf.encoded_filter == masked.filter_encoded
    assert agreement.compute().all()
def test_empty_input(empty, ord_label, client):
    """
    Fit a LabelEncoder on empty input: `inverse_transform(ord_label)` must
    raise `ValueError`, and `fit_transform` must return an empty result.
    """
    n_parts = len(client.has_what())
    empty = dask_cudf.from_cudf(empty, npartitions=n_parts)
    ord_label = dask_cudf.from_cudf(ord_label, npartitions=n_parts)

    # Fitting on empty data still marks the encoder as fitted.
    encoder = LabelEncoder()
    encoder.fit(empty)
    assert encoder._fitted is True

    # Check that ValueError is raised for ord_label after the empty fit.
    with pytest.raises(ValueError,
                       match='y contains previously unseen label'):
        encoder.inverse_transform(ord_label).compute()

    # Check fit_transform() with a fresh encoder.
    fresh_encoder = LabelEncoder()
    result = fresh_encoder.fit_transform(empty).compute()
    assert fresh_encoder._fitted is True
    assert len(result) == 0