Esempio n. 1
0
def test_basic_fit_predict(client):
    """Fit MultinomialNB on the text corpus and check training accuracy.

    The model is trained and evaluated on the same data returned by
    ``load_text_corpus(client)``; the test passes when the accuracy of the
    predictions against the labels is strictly greater than 0.97.
    """
    features, labels = load_text_corpus(client)

    classifier = MultinomialNB()
    classifier.fit(features, labels)

    # Both the predictions and the labels are lazy collections; materialize
    # them before scoring.
    predicted = classifier.predict(features).compute()
    expected = labels.compute()

    # .get() is applied to the computed predictions only (presumably a
    # device-to-host copy -- confirm against the array type in use).
    accuracy = accuracy_score(predicted.get(), expected)
    assert accuracy > .97
Esempio n. 2
0
def post_etl_processing(client, train_data, test_data):
    """Train a distributed MultinomialNB, score it, and attach predictions.

    Steps performed:

    1. Build features and labels for both splits via ``build_features`` and
       ``build_labels``.
    2. Fit ``cuml.dask.naive_bayes.MultinomialNB`` (``alpha=0.001``) on the
       training split and predict on the test split.
    3. Compute and print accuracy, macro-averaged precision and the
       confusion matrix of the test predictions.
    4. Store the predictions in ``test_data["prediction"]`` and return the
       review key, rating and prediction columns sorted by
       ``pr_review_sk``.

    Parameters
    ----------
    client
        Dask distributed client; passed to the model, to the metric
        functions, and used to submit the per-partition conversions.
    train_data
        Training split, consumed by ``build_features`` / ``build_labels``.
    test_data
        Test split. NOTE: this object is modified in place -- a
        ``"prediction"`` column is added to it. It must also provide the
        ``"pr_review_sk"`` and ``"pr_review_rating"`` columns.

    Returns
    -------
    tuple
        ``(final_data, acc, prec, cmat)`` where ``final_data`` holds the
        ``pr_review_sk``, ``pr_review_rating`` and ``prediction`` columns
        sorted by ``pr_review_sk`` with a fresh index, and the remaining
        items are the accuracy, precision and confusion matrix.
    """
    import cudf
    from cuml.dask.naive_bayes import MultinomialNB as DistMNB
    from cuml.dask.common import to_dask_cudf
    from cuml.dask.common.input_utils import DistributedDataHandler

    # Feature engineering
    X_train = build_features(train_data)
    X_test = build_features(test_data)

    y_train = build_labels(train_data)
    y_test = build_labels(test_data)

    # Perform ML
    model = DistMNB(client=client, alpha=0.001)
    model.fit(X_train, y_train)

    # Persist the predictions: they are reused by all three metrics below
    # and again when they are written back into test_data.
    y_hat = model.predict(X_test).persist()

    # Compute distributed performance metrics
    acc = accuracy_score(client, y_test, y_hat)
    print("Accuracy: " + str(acc))

    prec = precision_score(client, y_test, y_hat, average="macro")
    print("Precision: " + str(prec))

    cmat = confusion_matrix(client, y_test, y_hat)
    print("Confusion Matrix: " + str(cmat))

    # Place results back in original Dataframe: wrap each prediction
    # partition in a cudf.Series on the cluster, then assemble the futures
    # into a single dask_cudf collection.
    ddh = DistributedDataHandler.create(y_hat)
    test_preds = to_dask_cudf(
        [client.submit(cudf.Series, part) for _worker, part in ddh.gpu_futures])

    test_preds = test_preds.map_partitions(categoricalize)

    test_data["prediction"] = test_preds

    final_data = test_data[["pr_review_sk", "pr_review_rating",
                            "prediction"]].persist()

    # NOTE(review): the sorted frame is not persisted before wait(), so the
    # sort itself may still be lazy when this function returns -- confirm
    # whether that is intended.
    final_data = final_data.sort_values("pr_review_sk").reset_index(drop=True)
    wait(final_data)
    return final_data, acc, prec, cmat
Esempio n. 3
0
def test_score(client):
    """Check that ``model.score`` agrees with ``accuracy_score``.

    The model is fitted and evaluated on the same corpus; the value returned
    by ``model.score(X, y)`` must equal the accuracy computed from the
    model's own predictions.
    """
    features, labels = load_text_corpus(client)

    classifier = MultinomialNB()
    classifier.fit(features, labels)

    predictions = classifier.predict(features)
    reported_score = classifier.score(features, labels)

    # Materialize predictions and labels before computing the reference
    # accuracy.
    local_predictions = predictions.compute()
    local_labels = labels.compute()

    reference = accuracy_score(local_predictions.get(), local_labels)
    assert reference == reported_score
Esempio n. 4
0
def test_single_distributed_exact_results(client):
    """Check that ``MultinomialNB`` and ``SGNB`` predict identically.

    ``MultinomialNB`` is fitted on the lazy corpus and ``SGNB`` on its
    computed (materialized) counterpart; the two sets of predictions must
    agree on every sample (accuracy of one against the other is 1.0).
    """
    features, labels = load_text_corpus(client)

    # Materialized copies of the corpus for the SGNB model.
    local_features = features.compute()
    local_labels = labels.compute()

    dist_model = MultinomialNB()
    dist_model.fit(features, labels)

    local_model = SGNB()
    local_model.fit(local_features, local_labels)

    dist_predictions = dist_model.predict(features)
    local_predictions = local_model.predict(local_features).get()

    dist_predictions = dist_predictions.compute().get()

    agreement = accuracy_score(dist_predictions, local_predictions)
    assert agreement == 1.0