def test_predict_proba(dataset, datatype, n_neighbors,
                       n_parts, batch_size, client):
    """Compare distributed ``predict_proba`` output against the local model.

    A local ``lKNNClf`` and a distributed ``dKNNClf`` are fitted on the same
    training split. Their ``predict_proba`` results on the test split are
    then handed to ``check_probabilities``.

    Parameters
    ----------
    dataset
        Four-element sequence ``(X_train, X_test, y_train, y_test)``;
        ``y_test`` is not needed by this test.
    datatype
        When equal to ``'dask_cudf'`` the dask arrays are converted with
        ``to_dask_cudf`` before fitting and the computed probabilities are
        converted back to arrays afterwards.
    n_neighbors
        Forwarded to both the local and the distributed model.
    n_parts
        Forwarded to ``generate_dask_array`` for every input.
    batch_size
        Forwarded to the distributed model.
    client
        Forwarded to ``to_dask_cudf`` and to ``dKNNClf``.
    """
    X_train, X_test, y_train, _ = dataset

    # Reference result from the local model.
    l_model = lKNNClf(n_neighbors=n_neighbors)
    l_model.fit(X_train, y_train)
    l_probas = l_model.predict_proba(X_test)

    X_train = generate_dask_array(X_train, n_parts)
    X_test = generate_dask_array(X_test, n_parts)
    y_train = generate_dask_array(y_train, n_parts)

    if datatype == 'dask_cudf':
        X_train = to_dask_cudf(X_train, client)
        X_test = to_dask_cudf(X_test, client)
        y_train = to_dask_cudf(y_train, client)

    # batch_size is passed through so the test actually exercises the
    # value it receives, as the other tests for this model do.
    d_model = dKNNClf(client=client,
                      n_neighbors=n_neighbors,
                      batch_size=batch_size)
    d_model.fit(X_train, y_train)
    d_probas = d_model.predict_proba(X_test, convert_dtype=True)
    d_probas = da.compute(d_probas)[0]

    if datatype == 'dask_cudf':
        # DataFrames become matrices; anything else is converted with
        # to_array() and given a trailing axis.
        d_probas = [
            o.as_matrix() if isinstance(o, DataFrame)
            else o.to_array()[..., np.newaxis]
            for o in d_probas
        ]

    check_probabilities(l_probas, d_probas)
def test_predict_and_score(dataset, datatype, parameters, client):
    """Check distributed predictions and score against the local model.

    ``parameters`` unpacks to ``(n_neighbors, n_parts, batch_size)`` and
    ``dataset`` to ``(X_train, X_test, y_train, y_test)``.

    The local model's labels, indices and distances are compared to the
    distributed model's computed outputs with ``exact_match``. The score is
    then compared to a hand-computed local accuracy (rounded to 3 decimals)
    with an absolute tolerance of ``1e-2``: for ``'dask_array'`` the
    distributed model's own ``score`` is used, otherwise the accuracy is
    computed by hand from the distributed labels.
    """
    n_neighbors, n_parts, batch_size = parameters
    X_train, X_test, y_train, y_test = dataset
    # Keep the original y_test for the hand-computed distributed score.
    host_y_test = y_test
    uses_dask_array = datatype == 'dask_array'

    reference = lKNNClf(n_neighbors=n_neighbors)
    reference.fit(X_train, y_train)
    ref_distances, ref_indices = reference.kneighbors(X_test)
    ref_labels = reference.predict(X_test)
    expected = (ref_labels, ref_indices, ref_distances)
    expected_score = round(np.mean(y_test == ref_labels), 3)

    X_train, X_test, y_train, y_test = (
        generate_dask_array(part, n_parts)
        for part in (X_train, X_test, y_train, y_test))

    if datatype == 'dask_cudf':
        X_train, X_test, y_train, y_test = (
            to_dask_cudf(part, client)
            for part in (X_train, X_test, y_train, y_test))

    distributed = dKNNClf(client=client,
                          n_neighbors=n_neighbors,
                          batch_size=batch_size)
    distributed.fit(X_train, y_train)
    lazy_labels, lazy_indices, lazy_distances = distributed.predict(
        X_test, convert_dtype=True)
    computed = da.compute(lazy_labels, lazy_indices, lazy_distances)

    if uses_dask_array:
        model_score = round(distributed.score(X_test, y_test), 3)

    if datatype == 'dask_cudf':
        def _to_host(obj):
            # DataFrames become matrices; anything else is converted with
            # to_array() and given a trailing axis.
            if isinstance(obj, DataFrame):
                return obj.as_matrix()
            return obj.to_array()[..., np.newaxis]

        computed = [_to_host(obj) for obj in computed]

    exact_match(expected, computed)

    if uses_dask_array:
        assert model_score == pytest.approx(expected_score, abs=1e-2)
        return

    # No model-side score here: derive accuracy from the predicted labels.
    manual_score = round(np.mean(host_y_test == computed[0]), 3)
    assert manual_score == pytest.approx(expected_score, abs=1e-2)
def test_predict(dataset, datatype, n_neighbors, n_parts, batch_size, client):
    """Check distributed ``predict`` outputs against the local model.

    The local model's labels, indices and distances are compared to the
    distributed model's computed outputs with ``match_test``. The
    distributed labels must additionally reach an ``accuracy_score`` above
    0.12 against ``y_test``.

    When ``n_parts`` is falsy, the number of entries returned by
    ``client.has_what()`` is used as the partition count.
    """
    X_train, X_test, y_train, y_test = dataset

    reference = lKNNClf(n_neighbors=n_neighbors)
    reference.fit(X_train, y_train)
    ref_distances, ref_indices = reference.kneighbors(X_test)
    ref_labels = reference.predict(X_test)
    expected = (ref_labels, ref_indices, ref_distances)

    # Fall back to one partition per entry reported by the client.
    n_parts = n_parts or len(client.has_what())

    X_train, X_test, y_train = (
        generate_dask_array(part, n_parts)
        for part in (X_train, X_test, y_train))

    if datatype == 'dask_cudf':
        X_train, X_test, y_train = (
            to_dask_cudf(part, client)
            for part in (X_train, X_test, y_train))

    distributed = dKNNClf(client=client,
                          n_neighbors=n_neighbors,
                          batch_size=batch_size)
    distributed.fit(X_train, y_train)
    lazy_labels, lazy_indices, lazy_distances = distributed.predict(
        X_test, convert_dtype=True)
    computed = da.compute(lazy_labels, lazy_indices, lazy_distances)

    if datatype == 'dask_cudf':
        def _to_host(obj):
            # DataFrames become matrices; anything else is converted with
            # to_array() and given a trailing axis.
            if isinstance(obj, DataFrame):
                return obj.as_matrix()
            return obj.to_array()[..., np.newaxis]

        computed = [_to_host(obj) for obj in computed]

    match_test(expected, computed)
    assert accuracy_score(y_test, computed[0]) > 0.12
# Example #4
# 0
def test_predict_and_score(dataset, datatype, parameters, client):
    """Check distributed predictions and ``score`` against the local model.

    ``parameters`` unpacks to ``(n_neighbors, n_parts, batch_size)`` and
    ``dataset`` to ``(X_train, X_test, y_train, y_test)``.

    The local predictions are compared to the computed distributed
    predictions with ``exact_match``. The distributed model's ``score``,
    rounded to 3 decimals, must then match the hand-computed local accuracy
    within an absolute tolerance of ``1e-2``.
    """
    n_neighbors, n_parts, batch_size = parameters
    X_train, X_test, y_train, y_test = dataset

    reference = lKNNClf(n_neighbors=n_neighbors)
    reference.fit(X_train, y_train)
    expected = reference.predict(X_test)
    expected_score = round(np.mean(y_test == expected), 3)

    X_train, X_test, y_train, y_test = (
        generate_dask_array(part, n_parts)
        for part in (X_train, X_test, y_train, y_test))

    if datatype == 'dask_cudf':
        X_train, X_test, y_train, y_test = (
            to_dask_cudf(part, client)
            for part in (X_train, X_test, y_train, y_test))

    distributed = dKNNClf(client=client,
                          n_neighbors=n_neighbors,
                          batch_size=batch_size)
    distributed.fit(X_train, y_train)
    actual = distributed.predict(X_test, convert_dtype=True).compute()

    # DataFrame results are converted to an array before comparison.
    if isinstance(actual, DataFrame):
        actual = actual.to_numpy()

    exact_match(expected, actual)

    model_score = round(distributed.score(X_test, y_test), 3)
    assert model_score == pytest.approx(expected_score, abs=1e-2)