def testGLMGaussianScoringHistory():
    """
    Compare GLM gaussian scoring histories produced under different scoring settings.

    Three comparisons are made on the same train/validation split:

    1. score_each_iteration=True versus score_iteration_interval=1, no cross-validation;
    2. the same pair with 3-fold cross-validation (modulo fold assignment, seed 1234);
    3. the cross-validated every-iteration model versus a cross-validated model with
       score_iteration_interval=4, through pyunit_utils.assertEqualScoringHistoryIteration.
    """
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_rmse", "validation_rmse",
                        "training_mae", "validation_mae", "training_deviance", "validation_deviance"]
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # Assigning a column to itself is a no-op; the columns listed as enums have to be
        # converted explicitly to be treated as categorical.
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    # list.remove() returns None, so the predictor list is built explicitly instead of
    # relying on x=None being passed to train().
    myX = [cname for cname in h2o_data.names if cname != myY]
    # Seeded like the other scoring-history tests in this file so the split is reproducible.
    data_frames = h2o_data.split_frame(ratios=[0.8], seed=1234)
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build gaussian model with score_each_iteration set to true
    model = glm(family="gaussian", score_each_iteration=True)
    model.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1
    model_score_each = glm(family="gaussian", score_iteration_interval=1)
    model_score_each.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model, model_score_each, col_list_compare)

    # build gaussian model with score_each_iteration set to true, with CV
    model_cv = glm(family="gaussian", score_each_iteration=True, nfolds=3, fold_assignment='modulo', seed=1234)
    model_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # build gaussian model with score_iteration_interval set to 1, with CV
    model_score_each_cv = glm(family="gaussian", score_iteration_interval=1, nfolds=3, fold_assignment='modulo',
                              seed=1234)
    model_score_each_cv.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    pyunit_utils.assert_equal_scoring_history(model_cv, model_score_each_cv, col_list_compare)

    # build gaussian model with score_iteration_interval set to 4, with CV
    model_cv_4th = glm(family="gaussian", score_iteration_interval=4, nfolds=3, fold_assignment='modulo', seed=1234)
    model_cv_4th.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    # NOTE(review): every-iteration model first, interval-4 model second, which is the argument
    # order used by the other assertEqualScoringHistoryIteration calls in this file - confirm
    # against the helper's signature.
    pyunit_utils.assertEqualScoringHistoryIteration(model_cv, model_cv_4th, col_list_compare)
def testGLMBinomialScoringHistory():
    """
    Compare GLM binomial scoring histories produced under different scoring settings.

    Models built with score_iteration_interval=1 and with score_each_iteration=True are compared,
    first without and then with 3-fold cross-validation.  A cross-validated model with
    score_iteration_interval=4 is finally compared to the interval-1 cross-validated model
    through pyunit_utils.assertEqualScoringHistoryIteration.
    """
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_logloss",
                        "validation_logloss", "training_classification_error", "validation_classification_error",
                        "training_rmse", "validation_rmse", "training_auc", "validation_auc", "training_pr_auc",
                        "validation_pr_auc", "training_lift", "validation_lift"]
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    # the first ten columns and the response are converted to categorical
    for ind in range(10):
        h2o_data[ind] = h2o_data[ind].asfactor()
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    train = splits_frames[0]
    valid = splits_frames[1]
    Y = "C21"
    X = list(range(0, 20))

    print("Building model with score_interval=1.  Should generate same model as "
          "score_each_iteration turned on.")
    h2o_model = glm(family="binomial", score_iteration_interval=1)
    h2o_model.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    print("Building model with score_each_iteration turned on.")
    h2o_model_score_each = glm(family="binomial", score_each_iteration=True)
    h2o_model_score_each.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each and h2o_model should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each, h2o_model, col_list_compare)

    print("Building model with score_each_iteration turned on, with CV.")
    h2o_model_score_each_cv = glm(family="binomial", score_each_iteration=True, nfolds=3, fold_assignment='modulo',
                                  seed=1234)
    h2o_model_score_each_cv.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # No model in this test enables lambda_search, so the message only mentions CV.
    print("Building model with score_interval=1, and CV.  Should generate same model as "
          "score_each_iteration turned on, with CV.")
    h2o_model_cv = glm(family="binomial", score_iteration_interval=1, nfolds=3, fold_assignment='modulo', seed=1234)
    h2o_model_cv.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each_cv and h2o_model_cv should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each_cv, h2o_model_cv, col_list_compare)

    # check if scoring_interval is set to 4, the output should be the same for every fourth iteration
    h2o_model_cv_4th = glm(family="binomial", score_iteration_interval=4, nfolds=3, fold_assignment='modulo',
                           seed=1234)
    h2o_model_cv_4th.train(x=X, y=Y, training_frame=train, validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(h2o_model_cv, h2o_model_cv_4th, col_list_compare)
def test_glm_scoring_history_multinomial():
    """
    Compare GLM multinomial scoring histories when lambda search is enabled.

    All models use lambda_search=True, nlambdas=10 and generate_scoring_history=True.  The
    score_each_iteration=True and score_iteration_interval=1 models are compared without and
    with 2-fold cross-validation, and a cross-validated score_iteration_interval=4 model is
    compared to the interval-1 cross-validated model.
    """

    def fit_model(**scoring_params):
        # Every model shares the family and lambda-search settings; only the scoring and
        # cross-validation arguments differ between calls.
        estimator = glm(family="multinomial", lambda_search=True, nlambdas=10, generate_scoring_history=True,
                        **scoring_params)
        estimator.train(x=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], y="C11", training_frame=train, validation_frame=valid)
        return estimator

    base_columns = ["iterations", "training_logloss", "validation_logloss", "training_classification_error",
                    "validation_classification_error", "deviance_train", "deviance_test"]

    print("Preparing dataset....")
    data_path = pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
    h2o_data = h2o.import_file(data_path)
    for factor_col in ("C1", "C2", "C3", "C4", "C5", "C11"):
        h2o_data[factor_col] = h2o_data[factor_col].asfactor()
    train, valid = h2o_data.split_frame(ratios=[.8], seed=1234)[:2]

    print("Building model with score_each_iteration turned on, with lambda search.")
    every_iteration = fit_model(score_each_iteration=True)
    print("Building model with score_interval=1.  Should generate same model as score_each_iteration turned on.")
    interval_one = fit_model(score_iteration_interval=1)
    pyunit_utils.assert_equal_scoring_history(every_iteration, interval_one, base_columns)

    # the cross-validated comparisons also check the cross-validation deviance columns
    cv_columns = base_columns + ["deviance_xval", "deviance_se"]
    cv_settings = {"nfolds": 2, "fold_assignment": 'modulo'}

    print("Building model with score_each_iteration turned on, with lambda search and CV.")
    every_iteration_cv = fit_model(score_each_iteration=True, **cv_settings)
    print("Building model with score_interval=1.  Should generate same model as score_each_iteration turned on, "
          "with lambda search and CV.")
    interval_one_cv = fit_model(score_iteration_interval=1, **cv_settings)
    pyunit_utils.assert_equal_scoring_history(every_iteration_cv, interval_one_cv, cv_columns)

    interval_four_cv = fit_model(score_iteration_interval=4, **cv_settings)
    pyunit_utils.assertEqualScoringHistoryIteration(interval_one_cv, interval_four_cv, cv_columns)
    print("Done")
def test_glm_scoring_history_multinomial():
    """
    Compare GLM multinomial scoring histories without lambda search.

    All models use generate_scoring_history=True.  The score_each_iteration=True and
    score_iteration_interval=1 models are compared without and with 2-fold cross-validation
    (modulo fold assignment, seed 1234).  A cross-validated model that scores at a coarser
    interval is then compared to the interval-1 cross-validated model through
    pyunit_utils.assertEqualScoringHistoryIteration.
    """
    col_list_compare = ["iterations", "objective", "negative_log_likelihood", "training_logloss",
                        "validation_logloss", "training_classification_error", "validation_classification_error",
                        "deviance_train", "deviance_test"]
    predictors = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    response = "C11"
    # Interval used by the coarse model.  The log message below is generated from this value so
    # that the message and the trained model always describe the same interval.
    coarse_interval = 3

    print("Preparing dataset....")
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    h2o_data["C1"] = h2o_data["C1"].asfactor()
    h2o_data["C2"] = h2o_data["C2"].asfactor()
    h2o_data["C3"] = h2o_data["C3"].asfactor()
    h2o_data["C4"] = h2o_data["C4"].asfactor()
    h2o_data["C5"] = h2o_data["C5"].asfactor()
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    train = splits_frames[0]
    valid = splits_frames[1]

    print("Building model with score_each_iteration turned on.")
    h2o_model_score_each = glm(family="multinomial", score_each_iteration=True, generate_scoring_history=True)
    h2o_model_score_each.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_interval=1.  Should generate same model as score_each_iteration turned on.")
    h2o_model = glm(family="multinomial", score_iteration_interval=1, generate_scoring_history=True)
    h2o_model.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each and h2o_model should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each, h2o_model, col_list_compare)

    print("Building model with score_each_iteration turned on and cross-validaton on.")
    h2o_model_score_each_cv = glm(family="multinomial", score_each_iteration=True, nfolds=2, seed=1234,
                                  fold_assignment="modulo", generate_scoring_history=True)
    h2o_model_score_each_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    print("Building model with score_interval=1 and cross-validation on.  Should generate same model as "
          "score_each_iteration and cv turned on.")
    h2o_model_cv = glm(family="multinomial", score_iteration_interval=1, nfolds=2, fold_assignment="modulo",
                       seed=1234, generate_scoring_history=True)
    h2o_model_cv.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    # scoring history from h2o_model_score_each_cv and h2o_model_cv should be the same
    pyunit_utils.assert_equal_scoring_history(h2o_model_score_each_cv, h2o_model_cv, col_list_compare)

    # with a coarser scoring interval, the scoring history should agree with the interval-1
    # model at the iterations where both models were scored
    print("Building model with score_interval={0} and cross-validation on.  Should generate same model as "
          "other models and same scoring history at the correct iteration.".format(coarse_interval))
    h2o_model_cv_coarse = glm(family="multinomial", score_iteration_interval=coarse_interval, nfolds=2,
                              fold_assignment="modulo", seed=1234, generate_scoring_history=True)
    h2o_model_cv_coarse.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    pyunit_utils.assertEqualScoringHistoryIteration(h2o_model_cv, h2o_model_cv_coarse, col_list_compare)
def testGLMBinomialScoringHistoryLambdaSearch():
    """
    Compare GLM binomial scoring histories when lambda search is enabled.

    All models use lambda_search=True, nlambdas=10 and generate_scoring_history=True.  The
    score_iteration_interval=1 and score_each_iteration=True models are compared without and
    with 2-fold cross-validation, and a cross-validated score_iteration_interval=4 model is
    compared to the interval-1 cross-validated model.
    """

    def fit_model(**scoring_params):
        # Family and lambda-search settings are common to all models; callers supply the
        # scoring and cross-validation arguments.
        estimator = glm(family="binomial", lambda_search=True, nlambdas=10, generate_scoring_history=True,
                        **scoring_params)
        estimator.train(x=X, y=Y, training_frame=train, validation_frame=valid)
        return estimator

    base_columns = ["iterations", "training_logloss", "validation_logloss", "training_classification_error",
                    "validation_classification_error", "training_rmse", "validation_rmse", "training_auc",
                    "validation_auc", "training_pr_auc", "validation_pr_auc", "training_lift", "validation_lift",
                    "deviance_train", "deviance_test"]

    data_path = pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv")
    h2o_data = h2o.import_file(path=data_path)
    # first ten columns by index, then the response by name, become categorical
    for column in list(range(10)) + ["C21"]:
        h2o_data[column] = h2o_data[column].asfactor()
    train, valid = h2o_data.split_frame(ratios=[.8], seed=1234)[:2]
    Y = "C21"
    X = list(range(0, 20))

    print("Building model with score_interval=1 and lambda search on.  Should generate same model as "
          "score_each_iteration turned on.")
    interval_one = fit_model(score_iteration_interval=1)
    print("Building model with score_each_iteration turned on, with lambda search.")
    every_iteration = fit_model(score_each_iteration=True)
    # scoring history from the every-iteration model and the interval-1 model should be the same
    pyunit_utils.assert_equal_scoring_history(every_iteration, interval_one, base_columns)

    cv_settings = {"nfolds": 2, "fold_assignment": 'modulo'}
    print("Building model with score_each_iteration turned on, with lambda search and CV.")
    every_iteration_cv = fit_model(score_each_iteration=True, **cv_settings)
    print("Building model with score_interval=1, lambda search on and CV.  Should generate same model as "
          "score_each_iteration turned on, with lambda search and CV.")
    interval_one_cv = fit_model(score_iteration_interval=1, **cv_settings)

    # the cross-validated comparisons also check the cross-validation deviance columns
    cv_columns = base_columns + ["deviance_xval", "deviance_se"]
    pyunit_utils.assert_equal_scoring_history(every_iteration_cv, interval_one_cv, cv_columns)

    interval_four_cv = fit_model(score_iteration_interval=4, **cv_settings)
    pyunit_utils.assertEqualScoringHistoryIteration(interval_one_cv, interval_four_cv, cv_columns)