def test_basic_fit_predict(client):
    """Fit MultinomialNB on the text corpus and check training accuracy.

    The model is fitted and evaluated on the same data returned by
    ``load_text_corpus``; the test passes when the accuracy of the
    predictions against the labels is strictly greater than 0.97.
    """
    features, labels = load_text_corpus(client)

    classifier = MultinomialNB()
    classifier.fit(features, labels)

    # Materialize the lazy predictions first, then the labels.
    predicted = classifier.predict(features).compute()
    expected = labels.compute()

    # ``.get()`` is presumably a device-to-host copy of the predictions
    # (confirm against the array type returned by ``predict``).
    accuracy = accuracy_score(predicted.get(), expected)
    assert accuracy > .97
def post_etl_processing(client, train_data, test_data):
    """Train a distributed MultinomialNB, score it, and attach predictions.

    Builds features and labels from ``train_data`` / ``test_data``, fits the
    model on the training split, predicts on the test split, prints and
    collects accuracy, macro precision and the confusion matrix, and finally
    writes the predictions into ``test_data["prediction"]``.

    Parameters
    ----------
    client
        Client handed to the model, the metric functions and used to submit
        the per-partition ``cudf.Series`` conversions.
    train_data, test_data
        Inputs passed to ``build_features`` / ``build_labels``. ``test_data``
        is mutated: a ``"prediction"`` column is added to it.

    Returns
    -------
    tuple
        ``(final_data, acc, prec, cmat)`` where ``final_data`` holds the
        ``pr_review_sk``, ``pr_review_rating`` and ``prediction`` columns
        sorted by ``pr_review_sk`` with a fresh index.
    """
    # Kept function-local, as in the original block.
    import cudf
    from cuml.dask.naive_bayes import MultinomialNB as DistMNB
    from cuml.dask.common import to_dask_cudf
    from cuml.dask.common.input_utils import DistributedDataHandler

    # Feature engineering
    X_train = build_features(train_data)
    X_test = build_features(test_data)

    y_train = build_labels(train_data)
    y_test = build_labels(test_data)

    # Perform ML
    model = DistMNB(client=client, alpha=0.001)
    model.fit(X_train, y_train)

    # NOTE: the original author flagged a regression as seemingly coming
    # from around this prediction step.
    y_hat = model.predict(X_test).persist()

    # Compute distributed performance metrics
    acc = accuracy_score(client, y_test, y_hat)
    print("Accuracy: " + str(acc))

    prec = precision_score(client, y_test, y_hat, average="macro")
    print("Precision: " + str(prec))

    cmat = confusion_matrix(client, y_test, y_hat)
    print("Confusion Matrix: " + str(cmat))

    # Place results back in original Dataframe
    ddh = DistributedDataHandler.create(y_hat)
    # Only the second element of each ``gpu_futures`` pair is needed here.
    test_preds = to_dask_cudf(
        [client.submit(cudf.Series, part) for _, part in ddh.gpu_futures]
    )
    test_preds = test_preds.map_partitions(categoricalize)

    test_data["prediction"] = test_preds

    final_data = test_data[
        ["pr_review_sk", "pr_review_rating", "prediction"]
    ].persist()

    final_data = final_data.sort_values("pr_review_sk").reset_index(drop=True)
    wait(final_data)
    return final_data, acc, prec, cmat
def test_model_multiple_chunks(client, dtype):
    """Exercise naive Bayes on input split across more than one chunk.

    Related to https://github.com/rapidsai/cuml/issues/3150.

    This is a code coverage test: the data is too small for a meaningful
    numeric check, so ``score`` is called only to run the whole model and
    the result is merely required to lie in ``[0, 1]``.
    """
    rows = cp.array([[0, 0, 0, 1],
                     [1, 0, 0, 1],
                     [1, 0, 0, 0]])
    # One row per chunk along the first axis; second axis left unsplit.
    features = dask.array.from_array(rows, chunks=((1, 1, 1), -1))
    features = features.astype(dtype)

    labels = dask.array.from_array(
        [1, 0, 0], asarray=False, fancy=False, chunks=(1)
    ).astype(cp.int32)

    classifier = MultinomialNB()
    classifier.fit(features, labels)

    score = classifier.score(features, labels)
    assert 0 <= score <= 1
def test_single_distributed_exact_results(client):
    """Check that ``MultinomialNB`` and ``SGNB`` predict identically.

    ``MultinomialNB`` is fitted on the collections returned by
    ``load_text_corpus``; ``SGNB`` (presumably the single-GPU
    implementation -- confirm against the file's imports) is fitted on
    their computed counterparts. The two prediction vectors must agree on
    every sample, i.e. ``accuracy_score`` between them must equal 1.0.
    """
    dist_X, dist_y = load_text_corpus(client)

    # Materialize local copies of the data for the second model.
    local_X = dist_X.compute()
    local_y = dist_y.compute()

    dist_model = MultinomialNB()
    dist_model.fit(dist_X, dist_y)

    local_model = SGNB()
    local_model.fit(local_X, local_y)

    lazy_pred = dist_model.predict(dist_X)
    local_pred = local_model.predict(local_X).get()
    dist_pred = lazy_pred.compute().get()

    agreement = accuracy_score(dist_pred, local_pred)
    assert agreement == 1.0
def test_score(client):
    """Check that ``model.score`` equals accuracy computed from ``predict``.

    The model is fitted on the corpus from ``load_text_corpus``; the value
    returned by ``score(X, y)`` must be exactly equal to ``accuracy_score``
    evaluated on the computed predictions and labels.
    """
    features, labels = load_text_corpus(client)

    classifier = MultinomialNB()
    classifier.fit(features, labels)

    lazy_pred = classifier.predict(features)
    reported = classifier.score(features, labels)

    # Materialize predictions and labels for the reference computation.
    pred_local = lazy_pred.compute()
    labels_local = labels.compute()

    reference = accuracy_score(pred_local.get(), labels_local)
    assert reference == reported