def test_maxrglm_validation():
    """Compare maxrglm best R2 values with and without a validation frame.

    Two maxrglm models are trained with the same seed and
    ``max_predictor_number=3`` on the same training split; only the second
    model is given a validation frame.  For every position at which both
    models report the same best predictors, the reported best R2 values are
    required to differ.
    """
    d = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11",
        "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20"
    ]
    factor_x = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factor_x:
        d[x] = d[x].asfactor()
    frames = d.split_frame(ratios=[0.8], seed=12345)
    train = frames[0]
    test = frames[1]

    # Model trained without a validation frame.
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=3)
    maxrglm_model.train(training_frame=train, x=my_x, y=my_y)
    best_r2_value = maxrglm_model.get_best_R2_values()
    best_predictor_names = maxrglm_model.get_best_model_predictors()

    # Model trained with a validation frame.
    maxrglm_model_v = maxrglm(seed=12345, max_predictor_number=3)
    maxrglm_model_v.train(training_frame=train,
                          validation_frame=test,
                          x=my_x,
                          y=my_y)
    best_r2_value_v = maxrglm_model_v.get_best_R2_values()
    # Read the predictors from the validation model itself; reading them from
    # the first model would make the predictor comparison below always true.
    best_predictor_names_v = maxrglm_model_v.get_best_model_predictors()

    # R2 values are different between the two models
    for best_predictor, best_predictor_v, best_r2, best_r2_v in zip(
            best_predictor_names, best_predictor_names_v, best_r2_value,
            best_r2_value_v):
        if best_predictor == best_predictor_v:
            assert best_r2 != best_r2_v, "R2 values should not equal"
Example #2
0
def test_maxrglm_cross_validation_result_frame_model_id():
    """Check that result-frame model ids and best_model_ids agree under CV.

    A maxrglm model is trained with 3-fold cross-validation.  For each row of
    the result frame, the model fetched via the ``model_id`` column and the
    model fetched via the ``best_model_ids`` entry of the model output are
    both used to score the dataset, and the two prediction frames are
    compared.
    """
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    predictors = ["C" + str(col) for col in range(1, 21)]

    # The first ten predictors (C1..C10) are treated as categorical.
    for name in predictors[:10]:
        data[name] = data[name].asfactor()

    fold_count = 3
    model = maxrglm(seed=12345,
                    max_predictor_number=3,
                    nfolds=fold_count,
                    fold_assignment="auto")
    model.train(training_frame=data, x=predictors, y=response)

    result_frame = model.result()
    row_count = result_frame.nrows
    best_model_ids = model._model_json["output"]["best_model_ids"]

    for row in range(row_count):
        # Model looked up through the result frame.
        from_frame = h2o.get_model(result_frame["model_id"][row, 0])
        frame_predictions = from_frame.predict(data)
        # Model looked up through the model output's best_model_ids.
        from_ids = h2o.get_model(best_model_ids[row]['name'])
        id_predictions = from_ids.predict(data)
        pyunit_utils.compare_frames_local(frame_predictions,
                                          id_predictions,
                                          prob=1)
Example #3
0
def test_maxrglm_gaussian():
    """Check that the R2 values reported by maxrglm agree across sources.

    A maxrglm model is trained on the prostate dataset with
    ``max_predictor_number=7``.  For every row of the result frame, the R2
    from ``get_best_R2_values()``, the ``best_r2_value`` column of the result
    frame, and ``r2()`` of the model named in the ``model_id`` column must
    match within 1e-6.  Each model's prediction frame must also have as many
    rows as the dataset.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=7)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)
    resultFrame = maxrglm_model.result()
    numRows = resultFrame.nrows
    best_r2_value = maxrglm_model.get_best_R2_values()
    for ind in range(numRows):
        # r2 from attributes
        best_r2 = best_r2_value[ind]
        one_model = h2o.get_model(resultFrame["model_id"][ind, 0])
        pred = one_model.predict(d)
        print("last element of predictor frame: {0}".format(
            pred[pred.nrows - 1, pred.ncols - 1]))
        # The dataset row count is the expectation; the prediction frame's
        # row count is the value under test.
        assert pred.nrows == d.nrows, "expected dataset row: {0}, actual dataset row: {1}".format(
            d.nrows, pred.nrows)
        # r2 from result frame
        frame_r2 = resultFrame["best_r2_value"][ind, 0]
        # r2 from model
        model_r2 = one_model.r2()
        # make sure all r2 are equal
        assert abs(
            best_r2 - frame_r2
        ) < 1e-6, "expected best r2: {0}, actual best r2: {1}".format(
            best_r2, frame_r2)
        assert abs(
            frame_r2 - model_r2
        ) < 1e-6, "expected best r2: {0}, actual best r2: {1}".format(
            model_r2, frame_r2)
def test_maxrglm_gaussian_coefs():
    """Check maxrglm coefficient accessors against the underlying models.

    For each entry returned by ``coef()`` / ``coef_norm()``, the coefficients
    are compared (tolerance 1e-6) with those of the model named in the
    corresponding ``best_model_ids`` entry, and with the coefficients
    returned by ``coef(subset_size)`` / ``coef_norm(subset_size)``.
    """
    data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model = maxrglm(seed=12345, max_predictor_number=7)
    model.train(training_frame=data, x=predictors, y=response)
    all_coefs = model.coef()
    all_coefs_norm = model.coef_norm()
    tolerance = 1e-6

    for subset_size in range(1, len(all_coefs) + 1):
        position = subset_size - 1
        listed_coef = all_coefs[position]
        listed_coef_norm = all_coefs_norm[position]

        # Reference coefficients taken straight from the stored model.
        best_model_id = model._model_json["output"]["best_model_ids"][position]
        reference_model = h2o.get_model(best_model_id['name'])
        reference_coef = reference_model.coef()
        reference_coef_norm = reference_model.coef_norm()

        # Coefficients requested for one specific predictor subset size.
        sized_coef = model.coef(subset_size)
        sized_coef_norm = model.coef_norm(subset_size)

        # All routes must yield the same coefficient dictionaries.
        pyunit_utils.assertCoefDictEqual(listed_coef, reference_coef,
                                         tolerance)
        pyunit_utils.assertCoefDictEqual(listed_coef_norm,
                                         reference_coef_norm, tolerance)
        pyunit_utils.assertCoefDictEqual(sized_coef, reference_coef,
                                         tolerance)
        pyunit_utils.assertCoefDictEqual(sized_coef_norm,
                                         reference_coef_norm, tolerance)
Example #5
0
def test_maxrglm_gaussian():
    """Verify maxrglm's best 1- and 3-predictor models by brute force.

    A maxrglm model is trained on the prostate dataset with
    ``max_predictor_number=3``.  A GLM is then trained on every single
    predictor and on every 3-predictor combination; the highest R2 found in
    each group must match (within 1e-6) the first and third entries of
    ``get_best_R2_values()`` respectively.
    """
    # Function-scope import so the test does not rely on the file's
    # top-level imports.
    from itertools import combinations

    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    maxrglm_model = maxrglm(seed=12345, max_predictor_number=3)
    maxrglm_model.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value = maxrglm_model.get_best_R2_values()
    best_predictor_names = maxrglm_model.get_best_model_predictors()

    def glm_r2(predictors):
        """Train a GLM on the given predictor list and return its R2."""
        m = glm(seed=12345)
        m.train(training_frame=d, x=predictors, y=my_y)
        return m.r2()

    # assert that model returned with one predictor found by maxrglm is the best
    best_r2 = max(glm_r2([pred]) for pred in my_x)
    assert abs(best_r2-best_r2_value[0]) < 1e-6, \
        "expected best r2: {0}, actual best r2:{1}.  They are different.".format(
            best_r2_value[0], best_r2)
    print("Best one predictor model uses predictor: {0}".format(
        best_predictor_names[0]))

    # Enumerate every 3-predictor subset programmatically: a hand-written
    # list is easy to get wrong (duplicates, missing combinations), which
    # would make the brute-force comparison non-exhaustive.
    best_r2_three_pred = max(
        glm_r2(list(subset)) for subset in combinations(my_x, 3))
    assert abs(best_r2_three_pred-best_r2_value[2]) < 1e-6, \
        "expected best r2: {0}, actual best r2:{1}.  They are different.".format(
            best_r2_value[2], best_r2_three_pred)
    print("Best three predictors model uses predictors: {0}".format(
        best_predictor_names[2]))
Example #6
0
def test_maxrglm_cross_validation():
    """Compare maxrglm best R2 values under two different fold columns.

    Two fold-assignment columns are added to the dataset (one from
    ``modulo_kfold_column``, one from ``kfold_column``).  A maxrglm model is
    trained with each fold column, and the two lists of best R2 values are
    handed to ``pyunit_utils.equal_two_arrays`` with ``eps=1e-6``.
    """
    data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    predictors = ["C" + str(col) for col in range(1, 21)]

    # The first ten predictors (C1..C10) are treated as categorical.
    for name in predictors[:10]:
        data[name] = data[name].asfactor()

    fold_count = 3
    modulo_folds = data.modulo_kfold_column(n_folds=fold_count)
    modulo_folds.set_names(["fold_numbers_modulo"])
    random_folds = data.kfold_column(n_folds=fold_count, seed=12345)
    random_folds.set_names(["fold_numbers_kfold"])

    # Attach both fold-assignment columns to the dataset.
    data = data.cbind(modulo_folds).cbind(random_folds)

    # Cross-validated model driven by the modulo fold column.
    modulo_model = maxrglm(seed=12345,
                           max_predictor_number=3,
                           fold_column="fold_numbers_modulo")
    modulo_model.train(training_frame=data, x=predictors, y=response)
    modulo_best_r2 = modulo_model.get_best_R2_values()

    # Cross-validated model driven by the seeded k-fold column.
    kfold_model = maxrglm(seed=12345,
                          max_predictor_number=3,
                          fold_column="fold_numbers_kfold")
    kfold_model.train(training_frame=data, x=predictors, y=response)
    kfold_best_r2 = kfold_model.get_best_R2_values()

    # Both models are expected to report the same best R2 values.
    pyunit_utils.equal_two_arrays(modulo_best_r2, kfold_best_r2, eps=1e-6)
def test_maxrglm_gaussian_model_id():
    """Check that result-frame model ids and best_model_ids agree.

    A maxrglm model is trained on the prostate dataset with
    ``max_predictor_number=7``.  For each row of the result frame, the model
    fetched via the ``model_id`` column and the model fetched via the
    ``best_model_ids`` entry of the model output both score the dataset, and
    the two prediction frames are compared.
    """
    data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    response = "GLEASON"
    predictors = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model = maxrglm(seed=12345, max_predictor_number=7)
    model.train(training_frame=data, x=predictors, y=response)

    result_frame = model.result()
    row_count = result_frame.nrows
    best_model_ids = model._model_json["output"]["best_model_ids"]

    for row in range(row_count):
        # Model looked up through the result frame.
        from_frame = h2o.get_model(result_frame["model_id"][row, 0])
        frame_predictions = from_frame.predict(data)
        # Model looked up through the model output's best_model_ids.
        from_ids = h2o.get_model(best_model_ids[row]['name'])
        id_predictions = from_ids.predict(data)
        pyunit_utils.compare_frames_local(frame_predictions,
                                          id_predictions,
                                          prob=1)