def test_maxrglm_validation():
    """Check that a validation frame changes the reported best R2 values.

    Two maxrglm models are built with the same seed and
    ``max_predictor_number=3`` on the same training split; only the second
    one is also given a validation frame.  For every subset size at which
    both models report the same best predictors, the best R2 values reported
    by the two models must differ.
    """
    d = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10",
        "C11", "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20",
    ]
    factor_x = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factor_x:
        d[x] = d[x].asfactor()
    frames = d.split_frame(ratios=[0.8], seed=12345)
    train = frames[0]
    test = frames[1]

    # Model trained without a validation frame.
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=3)
    maxrglm_model.train(training_frame=train, x=my_x, y=my_y)
    best_r2_value = maxrglm_model.get_best_R2_values()
    best_predictor_names = maxrglm_model.get_best_model_predictors()

    # Same settings, but with a validation frame.
    maxrglm_model_v = maxrglm(seed=12345, max_predictor_number=3)
    maxrglm_model_v.train(training_frame=train, validation_frame=test,
                          x=my_x, y=my_y)
    best_r2_value_v = maxrglm_model_v.get_best_R2_values()
    # Must come from the validation model; reading it from maxrglm_model
    # would make the predictor comparison below trivially true.
    best_predictor_names_v = maxrglm_model_v.get_best_model_predictors()

    # R2 values are different between the two models
    for index, best_r2 in enumerate(best_r2_value):
        best_predictor = best_predictor_names[index]
        best_predictor_v = best_predictor_names_v[index]
        best_r2_v = best_r2_value_v[index]
        # Only compare R2 when both models picked the same predictor subset.
        if best_predictor == best_predictor_v:
            assert best_r2 != best_r2_v, "R2 values should not equal"
def test_maxrglm_cross_validation_result_frame_model_id():
    """Compare models referenced by the result frame and by the model JSON.

    A maxrglm model is trained with ``nfolds=3`` and
    ``fold_assignment="auto"``.  For every row of ``result()``, the model
    named in the ``model_id`` column and the model named at the same index of
    ``_model_json["output"]["best_model_ids"]`` are both fetched and asked to
    predict on the training data; the two prediction frames are compared
    with ``compare_frames_local``.
    """
    data_path = pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv")
    d = h2o.import_file(path=data_path)
    my_y = "C21"
    my_x = ["C{0}".format(col) for col in range(1, 21)]

    # The first ten predictors are converted to categorical columns.
    for name in my_x[:10]:
        d[name] = d[name].asfactor()

    n_folds = 3
    maxrglm_model = maxrglm(seed=12345,
                            max_predictor_number=3,
                            nfolds=n_folds,
                            fold_assignment="auto")
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)

    result_frame = maxrglm_model.result()
    row_count = result_frame.nrows
    best_model_ids = maxrglm_model._model_json["output"]["best_model_ids"]

    for row in range(row_count):
        # Model looked up through the result frame's model_id column.
        from_frame = h2o.get_model(result_frame["model_id"][row, 0])
        preds_from_frame = from_frame.predict(d)
        # Model looked up through the raw model JSON.
        from_json = h2o.get_model(best_model_ids[row]['name'])
        preds_from_json = from_json.predict(d)
        pyunit_utils.compare_frames_local(
            preds_from_frame, preds_from_json, prob=1)
def test_maxrglm_gaussian():
    """Check that the three sources of each best model's R2 agree.

    A maxrglm model is trained on the prostate data with
    ``max_predictor_number=7``.  For every row of ``result()`` the test:

    * fetches the model named in the ``model_id`` column and verifies its
      prediction frame has as many rows as the input frame;
    * checks that the R2 from ``get_best_R2_values()``, the R2 stored in the
      result frame's ``best_r2_value`` column and the R2 returned by the
      model's ``r2()`` are equal within 1e-6.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)
    resultFrame = maxrglm_model.result()
    numRows = resultFrame.nrows
    best_r2_value = maxrglm_model.get_best_R2_values()
    for ind in range(numRows):
        # r2 from attributes
        best_r2 = best_r2_value[ind]
        one_model = h2o.get_model(resultFrame["model_id"][ind, 0])
        pred = one_model.predict(d)
        print("last element of predictor frame: {0}".format(
            pred[pred.nrows - 1, pred.ncols - 1]))
        # The input frame's row count is the expectation; the prediction
        # frame's row count is what was actually produced.
        assert pred.nrows == d.nrows, \
            "expected dataset row: {0}, actual dataset row: {1}".format(
                d.nrows, pred.nrows)
        # r2 from result frame
        frame_r2 = resultFrame["best_r2_value"][ind, 0]
        # r2 from model
        model_r2 = one_model.r2()
        # make sure all r2 are equal
        assert abs(best_r2 - frame_r2) < 1e-6, \
            "expected best r2: {0}, actual best r2: {1}".format(
                best_r2, frame_r2)
        assert abs(frame_r2 - model_r2) < 1e-6, \
            "expected best r2: {0}, actual best r2: {1}".format(
                model_r2, frame_r2)
def test_maxrglm_gaussian_coefs():
    """Check that maxrglm coefficient accessors match the underlying models.

    After training on the prostate data with ``max_predictor_number=7``, the
    entries of ``coef()`` / ``coef_norm()`` at each index, and the results of
    ``coef(subset_size)`` / ``coef_norm(subset_size)`` with
    ``subset_size = index + 1``, are compared (tolerance 1e-6) against the
    coefficients of the model named at the same index of
    ``_model_json["output"]["best_model_ids"]``.
    """
    tolerance = 1e-6
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)

    all_coefs = maxrglm_model.coef()
    all_coefs_norm = maxrglm_model.coef_norm()

    for position in range(len(all_coefs)):
        listed_coef = all_coefs[position]
        listed_coef_norm = all_coefs_norm[position]

        # Reference values: fetch the model by its ID and read its
        # coefficients directly.
        best_ids = maxrglm_model._model_json["output"]["best_model_ids"]
        reference_model = h2o.get_model(best_ids[position]['name'])
        reference_coef = reference_model.coef()
        reference_coef_norm = reference_model.coef_norm()

        # Coefficients requested for a specific predictor subset size.
        subset_size = position + 1
        sized_coef = maxrglm_model.coef(subset_size)
        sized_coef_norm = maxrglm_model.coef_norm(subset_size)

        # Every accessor must agree with the reference model.
        comparisons = (
            (listed_coef, reference_coef),
            (listed_coef_norm, reference_coef_norm),
            (sized_coef, reference_coef),
            (sized_coef_norm, reference_coef_norm),
        )
        for observed, reference in comparisons:
            pyunit_utils.assertCoefDictEqual(observed, reference, tolerance)
def test_maxrglm_gaussian():
    """Check maxrglm's best 1- and 3-predictor R2 against exhaustive search.

    A maxrglm model is trained on the prostate data with
    ``max_predictor_number=3``.  The test then fits a plain GLM for every
    single predictor and for every 3-predictor subset of ``my_x`` and asserts
    that the highest R2 found this way matches (within 1e-6) the first and
    third entries of ``get_best_R2_values()`` respectively.
    """
    # Function-local import so the block does not depend on the file header.
    from itertools import combinations

    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=3)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value = maxrglm_model.get_best_R2_values()
    best_predictor_names = maxrglm_model.get_best_model_predictors()

    # assert that model returned with one predictor found by maxrglm is the best
    one_pred_r2 = []
    for pred in my_x:
        m = glm(seed=12345)
        m.train(training_frame=d, x=[pred], y=my_y)
        one_pred_r2.append(m.r2())
    best_r2 = max(one_pred_r2)
    assert abs(best_r2-best_r2_value[0]) < 1e-6, "expected best r2: {0}, actual best r2:{1}. They are different." \
                                                 "".format(best_r2_value[0], best_r2)
    print("Best one predictor model uses predictor: {0}".format(
        best_predictor_names[0]))

    # Enumerate all C(7, 3) = 35 three-predictor subsets.  A hand-written
    # list is easy to get wrong (duplicates / omissions), which would make
    # the "best" reference value below unreliable.
    three_pred_r2 = []
    for subset in combinations(my_x, 3):
        m = glm(seed=12345)
        m.train(training_frame=d, x=list(subset), y=my_y)
        three_pred_r2.append(m.r2())
    best_r2_three_pred = max(three_pred_r2)
    assert abs(best_r2_three_pred-best_r2_value[2]) < 1e-6, "expected best r2: {0}, actual best r2:{1}. They are different." \
                                                            "".format(best_r2_value[2], best_r2_three_pred)
    print("Best three predictors model uses predictors: {0}".format(
        best_predictor_names[2]))
def test_maxrglm_cross_validation():
    """Compare best R2 values of two maxrglm models using different fold columns.

    Two fold-assignment columns (one from ``modulo_kfold_column``, one from
    ``kfold_column`` with a fixed seed) are appended to the data.  A maxrglm
    model is trained with each as ``fold_column`` and the two lists returned
    by ``get_best_R2_values()`` are passed to
    ``pyunit_utils.equal_two_arrays`` with ``eps=1e-6``.
    """
    csv_path = pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv")
    d = h2o.import_file(path=csv_path)
    my_y = "C21"
    my_x = ["C{0}".format(col) for col in range(1, 21)]

    # The first ten predictors are converted to categorical columns.
    for name in my_x[:10]:
        d[name] = d[name].asfactor()

    n_folds = 3
    modulo_folds = d.modulo_kfold_column(n_folds=n_folds)
    modulo_folds.set_names(["fold_numbers_modulo"])
    seeded_folds = d.kfold_column(n_folds=n_folds, seed=12345)
    seeded_folds.set_names(["fold_numbers_kfold"])

    # append both fold-assignment columns to the dataset
    d = d.cbind(modulo_folds)
    d = d.cbind(seeded_folds)

    # cv model using the modulo fold column
    maxrglm_model_fa = maxrglm(seed=12345,
                               max_predictor_number=3,
                               fold_column="fold_numbers_modulo")
    maxrglm_model_fa.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_fa = maxrglm_model_fa.get_best_R2_values()

    # cv model using the seeded kfold column
    maxrglm_model_fk = maxrglm(seed=12345,
                               max_predictor_number=3,
                               fold_column="fold_numbers_kfold")
    maxrglm_model_fk.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_fk = maxrglm_model_fk.get_best_R2_values()

    # both models should provide same best R2 values
    # NOTE(review): the helper's return value is not inspected here, so this
    # presumably relies on equal_two_arrays raising on mismatch -- confirm.
    pyunit_utils.equal_two_arrays(best_r2_value_fa, best_r2_value_fk,
                                  eps=1e-6)
def test_maxrglm_gaussian_model_id():
    """Compare models referenced by the result frame and by the model JSON.

    A maxrglm model is trained on the prostate data with
    ``max_predictor_number=7``.  For every row of ``result()``, the model
    named in the ``model_id`` column and the model named at the same index of
    ``_model_json["output"]["best_model_ids"]`` both predict on the training
    data, and the two prediction frames are compared with
    ``compare_frames_local``.
    """
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    d = h2o.import_file(path=prostate_path)
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)

    result_frame = maxrglm_model.result()
    row_count = result_frame.nrows
    best_model_ids = maxrglm_model._model_json["output"]["best_model_ids"]

    for row in range(row_count):
        # Model looked up through the result frame's model_id column.
        from_frame = h2o.get_model(result_frame["model_id"][row, 0])
        preds_from_frame = from_frame.predict(d)
        # Model looked up through the raw model JSON.
        from_json = h2o.get_model(best_model_ids[row]['name'])
        preds_from_json = from_json.predict(d)
        pyunit_utils.compare_frames_local(
            preds_from_frame, preds_from_json, prob=1)