Beispiel #1
0
def test_labelencoder_unfitted(client):
    """ Expect NotFittedError from `.transform()` on an unfitted encoder

    The error is expected once the transformed collection is computed.
    """
    values = cudf.Series(np.random.choice(10, (10, )))
    n_parts = len(client.has_what())
    ddf_series = dask_cudf.from_cudf(values, npartitions=n_parts)

    encoder = LabelEncoder()
    with pytest.raises(NotFittedError):
        encoder.transform(ddf_series).compute()
Beispiel #2
0
def test_unfitted_inverse_transform(client):
    """ Expect NotFittedError when an unfitted LabelEncoder is used

    NOTE(review): despite the test name, the call exercised below is
    `.transform()`, not `.inverse_transform()` -- confirm which one was
    intended before changing it.
    """
    values = cudf.Series(np.random.choice(10, (10, )))
    n_parts = len(client.has_what())
    ddf_series = dask_cudf.from_cudf(values, npartitions=n_parts)

    encoder = LabelEncoder()
    with pytest.raises(NotFittedError):
        encoder.transform(ddf_series)
Beispiel #3
0
def test_inverse_transform(orig_label, ord_label, expected_reverted,
                           bad_ord_label, use_fit_transform, client):
    """ Fit on `orig_label`, then check what `.inverse_transform()` returns

    `ord_label` has to decode element-wise to `expected_reverted`, while
    decoding `bad_ord_label` has to raise a ValueError. The encoder is fitted
    through `fit_transform()` or `fit()` depending on `use_fit_transform`.
    """
    n_parts = len(client.has_what())

    def distribute(series):
        # one partition per worker reported by the client
        return dask_cudf.from_cudf(series, npartitions=n_parts)

    orig_ddf = distribute(orig_label)
    ord_ddf = distribute(ord_label)
    expected_ddf = distribute(expected_reverted)
    bad_ddf = distribute(bad_ord_label)

    # fit the encoder through the requested entry point
    encoder = LabelEncoder()
    fit = encoder.fit_transform if use_fit_transform else encoder.fit
    fit(orig_ddf)
    assert encoder._fitted is True

    # decoded labels must match the expectation element by element
    decoded = encoder.inverse_transform(ord_ddf).compute()
    decoded = decoded.reset_index(drop=True)
    expected = expected_ddf.compute()

    assert len(decoded) == len(expected)
    matching = decoded[decoded == expected]
    assert len(decoded) == len(matching)

    # ordinals that were never produced by the encoder must be rejected
    with pytest.raises(ValueError, match='y contains previously unseen label'):
        encoder.inverse_transform(bad_ddf).compute()
Beispiel #4
0
def test_labelencoder_unseen(client):
    """ Try encoding a value that was not present during fitting

    The encoder is fitted on values drawn from ``range(10)``; transforming
    the labels ``-100`` and ``-120`` is expected to raise a KeyError.
    """
    n_workers = len(client.has_what())
    df = dask_cudf.from_cudf(cudf.Series(np.random.choice(10, (10, ))),
                             npartitions=n_workers)
    le = LabelEncoder().fit(df)
    assert le._fitted

    # Build the unseen input outside the `raises` block so that only the
    # transform itself can satisfy the expected KeyError.
    unseen = dask_cudf.from_cudf(cudf.Series([-100, -120]),
                                 npartitions=n_workers)
    with pytest.raises(KeyError):
        le.transform(unseen).compute()
Beispiel #5
0
def test_labelencoder_transform(length, cardinality, client):
    """ Fit on a random series, then encode that same series

    The check compares `m == m.T` for the matrices that
    `_arr_to_similarity_mat` builds from the raw and from the encoded values;
    both comparisons have to agree everywhere.
    NOTE(review): `_arr_to_similarity_mat` is defined outside this block.
    """
    raw = cudf.Series(np.random.choice(cardinality, (length, )))
    n_parts = len(client.has_what())
    ddf_series = dask_cudf.from_cudf(raw, npartitions=n_parts)

    encoder = LabelEncoder().fit(ddf_series)
    assert encoder._fitted

    encoded = encoder.transform(ddf_series)

    raw_mat = _arr_to_similarity_mat(ddf_series.compute().to_numpy())
    encoded_host = cp.asnumpy(encoded.compute().to_numpy())
    encoded_mat = _arr_to_similarity_mat(encoded_host)

    encoded_pattern = encoded_mat == encoded_mat.T
    raw_pattern = raw_mat == raw_mat.T
    assert (encoded_pattern == raw_pattern).all()
Beispiel #6
0
def test_masked_encode(client):
    """ Compare encoding a filtered column with filtering an encoded column

    One encoder is fitted on `cat_col` of the rows where `filter_col == 1`,
    another on `cat_col` of every row; on the filtered rows both have to
    produce the same codes.
    """
    n_parts = len(client.has_what())
    frame = cudf.DataFrame({
        "filter_col": [1, 1, 2, 3, 1, 1, 1, 1, 6, 5],
        "cat_col": ['a', 'b', 'c', 'd', 'a', 'a', 'a', 'c', 'b', 'c']
    })
    ddf = dask_cudf.from_cudf(frame, npartitions=n_parts)

    # path 1: filter first, then encode
    masked = ddf[ddf["filter_col"] == 1]
    masked_codes = LabelEncoder().fit_transform(masked["cat_col"])
    masked = masked.assign(filter_encoded=masked_codes.values)

    # path 2: encode every row, then filter
    full_codes = LabelEncoder().fit_transform(ddf["cat_col"])
    ddf = ddf.assign(encoded_filter=full_codes.values)
    ddf = ddf[ddf["filter_col"] == 1]

    same_codes = ddf["encoded_filter"] == masked["filter_encoded"]
    assert same_codes.compute().all()
0
def test_empty_input(empty, ord_label, client):
    """ Exercise the encoder with an empty input series

    After `fit(empty)` the encoder reports itself as fitted, and decoding
    `ord_label` must raise a ValueError. `fit_transform(empty)` must also
    mark the encoder as fitted and yield a result of length zero.
    """
    n_parts = len(client.has_what())
    empty_ddf = dask_cudf.from_cudf(empty, npartitions=n_parts)
    ord_ddf = dask_cudf.from_cudf(ord_label, npartitions=n_parts)

    # fit() on empty data
    encoder = LabelEncoder()
    encoder.fit(empty_ddf)
    assert encoder._fitted is True

    # no label was seen, so every ordinal has to be rejected
    with pytest.raises(ValueError, match='y contains previously unseen label'):
        encoder.inverse_transform(ord_ddf).compute()

    # fit_transform() on empty data, using a fresh encoder
    fresh_encoder = LabelEncoder()
    result = fresh_encoder.fit_transform(empty_ddf).compute()
    assert fresh_encoder._fitted is True
    assert len(result) == 0