def test_modelselection_gaussian_model_id():
    """Check that the model ids stored in the allsubsets result frame and in
    the model JSON output refer to the same models (identical predictions),
    and that allsubsets and maxr predictions agree within tolerance."""
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345,
                                      max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    result_frame_allsubsets = allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    modelIDs_allsubsets = allsubsets_model._model_json["output"][
        "best_model_ids"]
    maxr_model = modelSelection(seed=12345,
                                max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    result_frame_maxr = maxr_model.result()
    for ind in list(range(numRows)):
        # model retrieved via the id stored in the result frame
        model_from_frame_allsubsets = h2o.get_model(
            result_frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = model_from_frame_allsubsets.predict(d)
        # model retrieved via the id stored in the model JSON output;
        # use a distinct name instead of shadowing the frame-based model
        # (matches the naming in test_modelselection_serialization)
        model_from_id_allsubsets = h2o.get_model(
            modelIDs_allsubsets[ind]['name'])
        pred_id_allsubsets = model_from_id_allsubsets.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_id_allsubsets,
                                          prob=1)
        model_from_frame_maxr = h2o.get_model(
            result_frame_maxr["model_id"][ind, 0])
        pred_frame_maxr = model_from_frame_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_frame_maxr,
                                          prob=1,
                                          tol=1e-6)
def test_modelselection_backward_serialization():
    """Backward-mode reproducibility and serialization round trip.

    Trains two identically configured backward models and checks their result
    frames match; then saves one model's predictions and the model itself,
    wipes the cluster, reloads the model, and verifies the reloaded model
    still produces the same predictions.
    """
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    y = "GLEASON"
    x = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"]
    # make sure duplicate runs produce same results
    model_backward = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                      lambda_=0, theta=0.01)
    model_backward.train(training_frame=d, x=x, y=y)
    model_backward2 = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5,
                                    lambda_=0, theta=0.01)
    model_backward2.train(training_frame=d, x=x, y=y)
    result = model_backward.result()    # get result frame
    # BUG FIX: take the result frame from the SECOND model; the original code
    # called model_backward.result() twice and compared the first model's
    # result to itself, so model_backward2 was never actually checked.
    result2 = model_backward2.result()    # get result frame
    pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0) # results from both runs should be the same

    num_models = result.nrows           # number of models built
    one_model = h2o.get_model(result["model_id"][num_models-1, 0])
    predict_frame = one_model.predict(d)
    tmpdir = tempfile.mkdtemp()
    file_dir = os.path.join(tmpdir, "predict.csv")
    h2o.download_csv(predict_frame, file_dir) # save one scoring frame
    model_path_backward = model_backward.download_model(tmpdir) # store the model

    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_backward_model = h2o.load_model(model_path_backward)
    result_frame_backward = loaded_backward_model.result()

    # predictions of the reloaded model must match the ones saved pre-reload
    model_from_frame_backward = h2o.get_model(result_frame_backward["model_id"][num_models-1, 0])
    pred_frame_backward = model_from_frame_backward.predict(d)
    pred_frame_model = h2o.import_file(file_dir)
    pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model, prob=1.0)
# Beispiel #3  (non-Python artifact from the scraped example listing, commented out)
def test_gaussian_result_frame_model_id():
    """Check maxr/maxrsweep/allsubsets agreement on the prostate data, and
    check that for each best subset the R2 reported by the attribute
    accessor, the result frame, and the underlying model all coincide."""
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    maxr_model = modelSelection(seed=12345,
                                max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    maxrsweep_model = modelSelection(seed=12345,
                                     max_predictor_number=7,
                                     mode="maxrsweep")
    maxrsweep_model.train(training_frame=d, x=my_x, y=my_y)

    # make sure results returned by maxr and maxrsweep are the same
    # (only columns 2:4 of the result frames are compared)
    pyunit_utils.compare_frames_local(maxr_model.result()[2:4],
                                      maxrsweep_model.result()[2:4],
                                      prob=1.0,
                                      tol=1e-6)

    allsubsets_model = modelSelection(seed=12345,
                                      max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    result_frame_allsubsets = allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    best_r2_allsubsets = allsubsets_model.get_best_R2_values()
    result_frame_maxr = maxr_model.result()
    best_r2_maxr = maxr_model.get_best_R2_values()
    # one result-frame row per best subset (presumably one per subset size)
    for ind in list(range(numRows)):
        # r2 from attributes
        best_r2_value_allsubsets = best_r2_allsubsets[ind]
        one_model_allsubsets = h2o.get_model(
            result_frame_allsubsets["model_id"][ind, 0])
        pred_allsubsets = one_model_allsubsets.predict(d)
        print("last element of predictor frame: {0}".format(
            pred_allsubsets[pred_allsubsets.nrows - 1,
                            pred_allsubsets.ncols - 1]))
        # scoring must produce one prediction per input row
        assert pred_allsubsets.nrows == d.nrows, "expected dataset row: {0}, actual dataset row: " \
                                                 "{1}".format(pred_allsubsets.nrows, d.nrows)
        best_r2_value_maxr = best_r2_maxr[ind]
        one_model_maxr = h2o.get_model(result_frame_maxr["model_id"][ind, 0])
        pred_maxr = one_model_maxr.predict(d)
        pyunit_utils.compare_frames_local(
            pred_maxr, pred_allsubsets, prob=1,
            tol=1e-6)  # compare allsubsets and maxr results
        # r2 from result frame
        frame_r2_allsubsets = result_frame_allsubsets["best_r2_value"][ind, 0]
        # r2 from model
        model_r2_allsubsets = one_model_allsubsets.r2()
        # make sure all r2 are equal
        assert abs(best_r2_value_allsubsets-frame_r2_allsubsets) < 1e-6, "expected best r2: {0}, actual best r2: " \
                                                                   "{1}".format(best_r2_value_allsubsets, frame_r2_allsubsets)
        assert abs(frame_r2_allsubsets-model_r2_allsubsets) < 1e-6, "expected best r2: {0}, actual best r2: " \
                                                                    "{1}".format(model_r2_allsubsets, frame_r2_allsubsets)
        assert abs(best_r2_value_maxr-model_r2_allsubsets) < 1e-6, "expected best r2: {0}, maxr best r2: {1}" \
                                                             "".format(best_r2_value_maxr, model_r2_allsubsets)
def test_modelseletion_modelselection_cross_validation():
    """Cross-validation consistency check: with a fixed seed, the 'random'
    and 'auto' fold assignments must yield identical best R2 values, and
    the maxr and allsubsets modes must agree with each other."""
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    predictors = ["C" + str(i) for i in range(1, 21)]
    # the first ten predictor columns are treated as categorical
    for enum_col in predictors[0:10]:
        frame[enum_col] = frame[enum_col].asfactor()
    folds = 3

    def _best_r2(mode, assignment):
        # build and train one CV-enabled modelSelection model, return its
        # best R2 values per subset size
        m = modelSelection(seed=12345,
                           max_predictor_number=3,
                           nfolds=folds,
                           fold_assignment=assignment,
                           mode=mode)
        m.train(training_frame=frame, x=predictors, y=response)
        return m.get_best_R2_values()

    r2_maxr_random = _best_r2("maxr", "random")
    r2_maxr_auto = _best_r2("maxr", "auto")
    # both fold assignments should provide the same best R2 values
    pyunit_utils.equal_two_arrays(r2_maxr_random, r2_maxr_auto, eps=1e-6)

    r2_allsubsets_random = _best_r2("allsubsets", "random")
    # maxr and allsubsets r2 should be equal
    pyunit_utils.equal_two_arrays(r2_allsubsets_random,
                                  r2_maxr_random,
                                  eps=1e-6)

    r2_allsubsets_auto = _best_r2("allsubsets", "auto")
    # maxr and allsubsets r2 should be equal
    pyunit_utils.equal_two_arrays(r2_allsubsets_auto,
                                  r2_maxr_auto,
                                  eps=1e-6)
def test_modelselection_validation():
    """Train allsubsets/maxr with and without a validation frame; whenever a
    best predictor subset is the same in both runs, the validation-based R2
    must differ from the training R2, while allsubsets and maxr validation
    R2 values must agree with each other."""
    d = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11",
        "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20"
    ]
    factor_x = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factor_x:
        d[x] = d[x].asfactor()
    frames = d.split_frame(ratios=[0.8], seed=12345)
    train = frames[0]
    test = frames[1]
    allsubsets_model = modelSelection(seed=12345,
                                      max_predictor_number=3,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=train, x=my_x, y=my_y)
    best_r2_allsubsets = allsubsets_model.get_best_R2_values()
    best_predictor_allsubsets = allsubsets_model.get_best_model_predictors()
    allsubsets_model_v = modelSelection(seed=12345,
                                        max_predictor_number=3,
                                        mode="allsubsets")
    allsubsets_model_v.train(training_frame=train,
                             validation_frame=test,
                             x=my_x,
                             y=my_y)
    best_r2_allsubsets_v = allsubsets_model_v.get_best_R2_values()
    # BUG FIX: read the predictors from the validation model
    # (allsubsets_model_v); the original queried allsubsets_model again, so
    # the subset-equality check below always compared a model with itself.
    best_predictor_allsubsets_v = allsubsets_model_v.get_best_model_predictors()

    maxr_model_v = modelSelection(seed=12345,
                                  max_predictor_number=3,
                                  mode="maxr")
    maxr_model_v.train(training_frame=train,
                       validation_frame=test,
                       x=my_x,
                       y=my_y)
    best_r2_maxr_v = maxr_model_v.get_best_R2_values()
    best_predictor_maxr_v = maxr_model_v.get_best_model_predictors()

    # R2 values are different between the two models
    numSet = len(best_r2_allsubsets)
    for index in range(numSet):
        one_best_predictor_allsubsets = best_predictor_allsubsets[index]
        one_best_predictor_v_allsubsets = best_predictor_allsubsets_v[index]
        one_best_r2_allsubsets = best_r2_allsubsets[index]
        one_best_r2_v_allsubsets = best_r2_allsubsets_v[index]
        best_r2_v_maxr = best_r2_maxr_v[index]
        # only comparable when both runs picked the same predictor subset
        if one_best_predictor_allsubsets == one_best_predictor_v_allsubsets:
            assert not (one_best_r2_allsubsets == one_best_r2_v_allsubsets
                        ), "R2 values should not equal"
            assert abs(one_best_r2_v_allsubsets-best_r2_v_maxr) < 1e-6, "allsubset best R2: {0}, maxr best R2: {1}.  They " \
                                                                    "are different.".format(one_best_r2_v_allsubsets,
                                                                                            best_r2_v_maxr)
def test_modelselection_gaussian_coefs():
    """Check that the coefficients (raw and standardized) returned by the
    modelSelection accessors agree with the coefficients of the underlying
    GLM models, and that maxr and allsubsets coefficients agree."""
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345,
                                      max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    coefs_allsubsets = allsubsets_model.coef()
    coefs_norm_allsubsets = allsubsets_model.coef_norm()
    maxrsweep_model = modelSelection(seed=12345,
                                     max_predictor_number=7,
                                     mode="maxrsweep")
    maxrsweep_model.train(training_frame=d, x=my_x, y=my_y)
    maxr_model = modelSelection(seed=12345,
                                max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    # make sure results returned by maxr and maxrsweep are the same
    pyunit_utils.compare_frames_local(maxr_model.result()[2:4],
                                      maxrsweep_model.result()[2:4],
                                      prob=1.0,
                                      tol=1e-6)
    coefs_maxr = maxr_model.coef()
    coefs_norm_maxr = maxr_model.coef_norm()

    # one entry per best subset; entry ind corresponds to subset size ind+1
    for ind in list(range(len(coefs_allsubsets))):
        one_coef_allsubsets = coefs_allsubsets[ind]
        one_coef_norm_allsubsets = coefs_norm_allsubsets[ind]
        one_coef_maxr = coefs_maxr[ind]
        one_coef_norm_maxr = coefs_norm_maxr[ind]
        # coefficients obtained from accessing model_id, generate model and access the model coeffs
        one_model = h2o.get_model(allsubsets_model._model_json["output"]
                                  ["best_model_ids"][ind]['name'])
        model_coef = one_model.coef()
        model_coef_norm = one_model.coef_norm()
        # get coefficients of individual predictor subset size
        subset_size = ind + 1
        one_model_coef = allsubsets_model.coef(subset_size)
        one_model_coef_norm = allsubsets_model.coef_norm(subset_size)

        # check coefficient dicts are equal across all access paths
        pyunit_utils.assertCoefDictEqual(one_coef_allsubsets, model_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_coef_norm_allsubsets,
                                         model_coef_norm, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef, model_coef, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef_norm, model_coef_norm,
                                         1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef, one_coef_maxr, 1e-6)
        pyunit_utils.assertCoefDictEqual(one_model_coef_norm,
                                         one_coef_norm_maxr, 1e-6)
# Beispiel #7  (non-Python artifact from the scraped example listing, commented out)
def test_modelselection_backward_gaussian():
    """Backward mode on a weighted gaussian dataset: verify the order in
    which predictors are eliminated and the rounded p-value each predictor
    had at the step it was dropped, against previously recorded values."""
    # expected elimination order (first eliminated first) and the rounded
    # p-value each predictor had when it was removed
    predictor_elimination_order = ["C72", "C70", "C69", "C48", "C38", "C96", "C10", "C29", "C22", "C100", "C82", "C56", 
                                   "C92", "C99", "C57"]
    eliminated_p_values = [0.9822, 0.9054, 0.7433, 0.4095, 0.1679, 0.1551, 0.0438, 0.0119, 0.0107, 0.0094, 0.0099, 
                           0.0066, 0.0003, 0.0002, 0.0002]
    d = h2o.import_file(path=pyunit_utils.locate("bigdata/laptop/model_selection/maxrglm100Cols50KRowsWeighted.csv"))
    my_y = "response"
    my_x = d.names
    my_x.remove(my_y)
    my_x.remove("weight")
    # stop backward elimination once this many predictors remain
    min_predictor_num = 100-len(predictor_elimination_order)
    model_backward = modelSelection(seed=12345, min_predictor_number=min_predictor_num, mode="backward", family='gaussian',
                                weights_column='weight')
    model_backward.train(training_frame=d, x=my_x, y=my_y)
    # check predictor deletion order same as in predictor_elimination_order;
    # walk from the largest model down to the smallest, one removal per step
    predictor_orders = model_backward._model_json['output']['best_model_predictors']
    num_models = len(predictor_orders)
    counter = 0
    for ind in list(range(num_models-1, 0, -1)):
        pred_large = model_backward._model_json["output"]["best_model_predictors"][ind]
        pred_small = model_backward._model_json["output"]["best_model_predictors"][ind-1]
        # the single predictor present in one set but not the other
        predictor_removed = set(pred_large).symmetric_difference(pred_small).pop()
        assert predictor_removed==predictor_elimination_order[counter], "expected eliminated predictor {0}, " \
                                                                        "actual eliminated predictor {1}".format(predictor_elimination_order[counter], predictor_removed)
        
        predictor_removed_index = model_backward._model_json["output"]["coefficient_names"][ind].index(predictor_removed)
        removed_pvalue = round(model_backward._model_json["output"]["coef_p_values"][ind][predictor_removed_index], 4)
        # assert p-values of coefficients removed by h2o equal the expected ones
        assert abs(removed_pvalue-eliminated_p_values[counter]) < 1e-6, \
            "Expected p-value of eliminated coefficient: {0}. Actual: {1}. They are very different." \
            "".format(eliminated_p_values[counter], removed_pvalue)
        counter += 1
        coefs = model_backward.coef(len(pred_large)) # check coefficients result correct length
        assert len(coefs) == len(pred_large), "Expected coef length: {0}, Actual: {1}".format(len(coefs), len(pred_large))
# Beispiel #8  (non-Python artifact from the scraped example listing, commented out)
def test_modelselection_serialization():
    """Serialization round trip for allsubsets and maxr models: download
    both models, wipe the cluster, reload them, and verify the model ids
    recorded in the result frame and in the model JSON still retrieve
    models that score identically."""
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345,
                                      max_predictor_number=7,
                                      mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    tmpdir = tempfile.mkdtemp()
    model_path_allsubsets = allsubsets_model.download_model(tmpdir)
    maxr_model = modelSelection(seed=12345,
                                max_predictor_number=7,
                                mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    model_path_maxr = maxr_model.download_model(tmpdir)

    h2o.remove_all()
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_allsubsets_model = h2o.load_model(model_path_allsubsets)
    result_frame_allsubsets = loaded_allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    modelIDs_allsubsets = loaded_allsubsets_model._model_json["output"][
        "best_model_ids"]
    loaded_maxr_model = h2o.load_model(model_path_maxr)
    # BUG FIX: take the maxr ids from the reloaded maxr model; the original
    # copied them from loaded_allsubsets_model, so the maxr branch below
    # re-tested allsubsets models instead of maxr models.
    modelIDs_maxr = loaded_maxr_model._model_json["output"][
        "best_model_ids"]
    for ind in list(range(numRows)):
        model_from_frame_allsubsets = h2o.get_model(
            result_frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = model_from_frame_allsubsets.predict(d)
        model_from_id_allsubsets = h2o.get_model(
            modelIDs_allsubsets[ind]['name'])
        pred_id_allsubsets = model_from_id_allsubsets.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_id_allsubsets,
                                          prob=1)
        # maxr model retrieved by id must score the same as allsubsets
        model_from_id_maxr = h2o.get_model(modelIDs_maxr[ind]['name'])
        pred_id_maxr = model_from_id_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets,
                                          pred_id_maxr,
                                          prob=1)
def test_modelselection_backward_gaussian():
    """Backward mode on a weighted 200-column binomial dataset: collect the
    elimination order and p-values and check they overlap the expected ones.

    NOTE(review): the function name says gaussian but family='binomial' is
    used below -- the name looks like a copy-paste leftover.
    """
    # expected elimination order (first eliminated first) and rounded
    # p-values; only partial overlap is asserted at the end
    predictor_elimination_order = ["C15", "C33", "C164", "C144", "C27"]
    eliminated_p_values = [0.6702, 0.6663, 0.0157, 0.0026, 0.0002]
    d = h2o.import_file(path=pyunit_utils.locate(
        "bigdata/laptop/model_selection/backwardBinomial200C50KRowsWeighted.csv"
    ))
    my_y = "response"
    my_x = d.names
    my_x.remove(my_y)
    my_x.remove("weight")
    # stop backward elimination once this many predictors remain
    min_predictor_num = 200 - len(predictor_elimination_order)
    model_backward = modelSelection(seed=12345,
                                    min_predictor_number=min_predictor_num,
                                    mode="backward",
                                    family='binomial',
                                    link='logit',
                                    weights_column='weight')
    model_backward.train(training_frame=d, x=my_x, y=my_y)
    # check predictor deletion order same as in predictor_elimination_order;
    # walk from the largest model down to the smallest, one removal per step
    predictor_orders = model_backward._model_json['output'][
        'best_model_predictors']
    num_models = len(predictor_orders)
    counter = 0
    pred_ele = []     # predictors in the order they were eliminated
    pred_pvalue = []  # rounded p-value of each eliminated predictor
    for ind in list(range(num_models - 1, 0, -1)):
        pred_large = model_backward._model_json["output"][
            "best_model_predictors"][ind]
        pred_small = model_backward._model_json["output"][
            "best_model_predictors"][ind - 1]
        # the single predictor present in one set but not the other
        predictor_removed = set(pred_large).symmetric_difference(
            pred_small).pop()
        pred_ele.append(predictor_removed)
        predictor_removed_index = model_backward._model_json["output"][
            "coefficient_names"][ind].index(predictor_removed)
        pred_pvalue.append(
            round(
                model_backward._model_json["output"]["coef_p_values"][ind]
                [predictor_removed_index], 4))
        counter += 1
        coefs = model_backward.coef(
            len(pred_large))  # check coefficients result correct length
        # +1: coefs apparently includes one extra entry beyond the
        # predictors (presumably the intercept) -- TODO confirm
        assert len(coefs) == len(
            pred_large) + 1, "Expected coef length: {0}, Actual: {1}".format(
                len(coefs),
                len(pred_large) + 1)
    # only require that the actual elimination shares at least two
    # predictors with the expected order
    common_elimination = list(set(predictor_elimination_order) & set(pred_ele))
    assert len(common_elimination) >= 2
    print("Expected predictor elimination order: {0}".format(
        predictor_elimination_order))
    print("Expected predictor p-values: {0}".format(eliminated_p_values))
    print("Predictor elimination order: {0}".format(pred_ele))
    print("Predictor p-values: {0}".format(pred_pvalue))
def test_modelselection_backward_gaussian():
    """Backward mode on a 200-column binomial dataset with a manually built
    weight column (rows with response == 1 get weight 100): verify the
    elimination order and the rounded p-values of eliminated coefficients.

    NOTE(review): the function name says gaussian but family='binomial' is
    used below -- the name looks like a copy-paste leftover.
    """
    # expected elimination order (first eliminated first) and the rounded
    # p-value each predictor had when it was removed
    predictor_elimination_order = ['C33', 'C24', 'C164', 'C66', 'C15']
    eliminated_p_values = [0.9711, 0.0694, 0.0388, 0.0127, 0.0009]
    tst_data = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    predictors = tst_data.columns[0:-1]
    response_col = 'response'
    weight = 'wt'
    # upweight the positive class 100x via an explicit weight column
    tst_data['wt'] = 1
    tst_data[tst_data['response'] == 1, 'wt'] = 100
    tst_data['response'] = tst_data['response'].asfactor()

    # stop backward elimination once this many predictors remain
    min_predictor_num = 200 - len(predictor_elimination_order)
    model_backward = modelSelection(family='binomial',
                                    weights_column=weight,
                                    mode='backward',
                                    min_predictor_number=min_predictor_num)
    model_backward.train(training_frame=tst_data, x=predictors, y=response_col)
    # check predictor deletion order same as in predictor_elimination_order;
    # walk from the largest model down to the smallest, one removal per step
    predictor_orders = model_backward._model_json['output'][
        'best_model_predictors']
    num_models = len(predictor_orders)
    counter = 0
    pred_ele = []     # predictors in the order they were eliminated
    pred_pvalue = []  # rounded p-value of each eliminated predictor
    for ind in list(range(num_models - 1, 0, -1)):
        pred_large = model_backward._model_json["output"][
            "best_model_predictors"][ind]
        pred_small = model_backward._model_json["output"][
            "best_model_predictors"][ind - 1]
        # the single predictor present in one set but not the other
        predictor_removed = set(pred_large).symmetric_difference(
            pred_small).pop()
        pred_ele.append(predictor_removed)
        predictor_removed_index = model_backward._model_json["output"][
            "coefficient_names"][ind].index(predictor_removed)
        pred_pvalue.append(
            round(
                model_backward._model_json["output"]["coef_p_values"][ind]
                [predictor_removed_index], 4))
        counter += 1
        coefs = model_backward.coef(
            len(pred_large))  # check coefficients result correct length
        assert len(coefs) == len(
            pred_large), "Expected coef length: {0}, Actual: {1}".format(
                len(coefs), len(pred_large))
    # every eliminated predictor must appear in the expected order list
    common_elimination = list(set(predictor_elimination_order) & set(pred_ele))
    assert len(common_elimination) == len(pred_ele)
    pyunit_utils.equal_two_arrays(pred_pvalue,
                                  eliminated_p_values,
                                  tolerance=1e-6)
def test_modelselection_cv_result_frame_model_id():
    """With cross-validation enabled, the model ids listed in the result
    frame and the ids recorded in the model JSON must retrieve the same
    models (identical predictions) for both allsubsets and maxr modes, and
    the two modes must score identically."""
    frame = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    response = "C21"
    predictors = ["C" + str(i) for i in range(1, 21)]
    # the first ten predictor columns are treated as categorical
    for enum_col in predictors[0:10]:
        frame[enum_col] = frame[enum_col].asfactor()
    folds = 3

    allsubsets = modelSelection(seed=12345,
                                max_predictor_number=3,
                                nfolds=folds,
                                fold_assignment="auto",
                                mode="allsubsets")
    allsubsets.train(training_frame=frame, x=predictors, y=response)
    frame_allsubsets = allsubsets.result()
    maxr = modelSelection(seed=12345,
                          max_predictor_number=3,
                          nfolds=folds,
                          fold_assignment="auto",
                          mode="maxr")
    maxr.train(training_frame=frame, x=predictors, y=response)
    frame_maxr = maxr.result()

    ids_allsubsets = allsubsets._model_json["output"]["best_model_ids"]
    ids_maxr = maxr._model_json["output"]["best_model_ids"]
    for row in range(frame_allsubsets.nrows):
        # allsubsets: model from result frame vs model from JSON ids
        by_frame_allsubsets = h2o.get_model(
            frame_allsubsets["model_id"][row, 0])
        pred_by_frame_allsubsets = by_frame_allsubsets.predict(frame)
        by_id_allsubsets = h2o.get_model(ids_allsubsets[row]['name'])
        pred_by_id_allsubsets = by_id_allsubsets.predict(frame)
        pyunit_utils.compare_frames_local(pred_by_frame_allsubsets,
                                          pred_by_id_allsubsets, prob=1)
        # maxr: same check, then compare maxr against allsubsets
        by_frame_maxr = h2o.get_model(frame_maxr["model_id"][row, 0])
        pred_by_frame_maxr = by_frame_maxr.predict(frame)
        by_id_maxr = h2o.get_model(ids_maxr[row]['name'])
        pred_by_id_maxr = by_id_maxr.predict(frame)
        pyunit_utils.compare_frames_local(pred_by_frame_maxr,
                                          pred_by_id_maxr, prob=1)
        pyunit_utils.compare_frames_local(pred_by_frame_allsubsets,
                                          pred_by_id_maxr, prob=1)
def test_modelselection_cross_validation():
    """CV with explicit fold columns: a modulo-based fold column and a
    seeded kfold column must yield the same best R2 values, for both
    allsubsets and maxr modes, and the two modes must agree."""
    d = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    my_y = "C21"
    my_x = [
        "C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10", "C11",
        "C12", "C13", "C14", "C15", "C16", "C17", "C18", "C19", "C20"
    ]
    factorX = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for x in factorX:
        d[x] = d[x].asfactor()
    n_folds = 3
    # build two fold-assignment columns with different strategies
    fold_numbers = d.modulo_kfold_column(n_folds=n_folds)
    fold_numbers.set_names(["fold_numbers_modulo"])
    fold_numbers2 = d.kfold_column(n_folds=n_folds, seed=12345)
    fold_numbers2.set_names(["fold_numbers_kfold"])

    # append both fold-assignment columns to the dataset
    d = d.cbind(fold_numbers)
    d = d.cbind(fold_numbers2)

    # cv model with modulo fold assignment
    allsubsets_model_fa = modelSelection(seed=12345,
                                         max_predictor_number=3,
                                         fold_column="fold_numbers_modulo",
                                         mode="allsubsets")
    allsubsets_model_fa.train(training_frame=d, x=my_x, y=my_y)
    best_r2_allsubsets_fa = allsubsets_model_fa.get_best_R2_values()

    allsubsets_model_fk = modelSelection(seed=12345,
                                         max_predictor_number=3,
                                         fold_column="fold_numbers_kfold",
                                         mode="allsubsets")
    allsubsets_model_fk.train(training_frame=d, x=my_x, y=my_y)
    best_r2_allsubsets_fk = allsubsets_model_fk.get_best_R2_values()

    # both models should provide same best R2 values
    pyunit_utils.equal_two_arrays(best_r2_allsubsets_fa,
                                  best_r2_allsubsets_fk,
                                  eps=1e-6)

    # same pair of fold columns, maxr mode
    maxr_model_fa = modelSelection(seed=12345,
                                   max_predictor_number=3,
                                   fold_column="fold_numbers_modulo",
                                   mode="maxr")
    maxr_model_fa.train(training_frame=d, x=my_x, y=my_y)
    best_r2_maxr_fa = maxr_model_fa.get_best_R2_values()

    maxr_model_fk = modelSelection(seed=12345,
                                   max_predictor_number=3,
                                   fold_column="fold_numbers_kfold",
                                   mode="maxr")
    maxr_model_fk.train(training_frame=d, x=my_x, y=my_y)
    best_r2_maxr_fk = maxr_model_fk.get_best_R2_values()

    # maxr and allsubsets should provide same best R2 values
    pyunit_utils.equal_two_arrays(best_r2_allsubsets_fa,
                                  best_r2_maxr_fa,
                                  eps=1e-6)
    pyunit_utils.equal_two_arrays(best_r2_allsubsets_fk,
                                  best_r2_maxr_fk,
                                  eps=1e-6)
# Beispiel #13  (non-Python artifact from the scraped example listing, commented out)
def test_modelselection_gaussian():
    """Compare the best-R2 values reported by modelSelection (allsubsets and
    maxr modes) against GLM models trained manually on candidate subsets.

    Checks the best one-predictor model (index 0 of the best-R2 arrays) and
    the best three-predictor model (index 2, since entry ind holds the best
    subset of size ind+1).
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    model_maxr = modelSelection(seed=12345,
                                max_predictor_number=3,
                                mode="maxr")
    model_maxr.train(training_frame=d, x=my_x, y=my_y)
    model_allsubsets = modelSelection(seed=12345,
                                      max_predictor_number=3,
                                      mode="allsubsets")
    model_allsubsets.train(training_frame=d, x=my_x, y=my_y)
    best_r2_value_allsubsets = model_allsubsets.get_best_R2_values()
    best_predictor_names_allsubsets = model_allsubsets.get_best_model_predictors(
    )
    best_r2_value_maxr = model_maxr.get_best_R2_values()

    # assert that model returned with one predictor found by modelselection is the best by comparing it to manual training result
    one_pred_r2 = []
    for pred in my_x:
        x = [pred]
        m = glm(seed=12345)
        m.train(training_frame=d, x=x, y=my_y)
        one_pred_r2.append(m.r2())
    best_r2 = max(one_pred_r2)
    assert abs(best_r2-best_r2_value_allsubsets[0]) < 1e-6, "expected best r2: {0}, allsubset: actual best r2:{1}. " \
                                                            " They are different.".format(best_r2, best_r2_value_allsubsets[0])
    assert abs(best_r2-best_r2_value_maxr[0]) < 1e-6, "expected best r2: {0}, maxr: actual best r2:{1}. " \
                                                      " They are different.".format(best_r2, best_r2_value_maxr[0])
    assert abs(best_r2_value_allsubsets[0]-best_r2_value_maxr[0]) < 1e-6, "allsubset best r2: {0}, maxr best r2:{1}. " \
                                                                          " They are different." \
                                                                          "".format(best_r2_value_allsubsets[0],
                                                                                    best_r2_value_maxr[0])

    print("Best one predictor model uses predictor: {0}".format(
        best_predictor_names_allsubsets[0]))

    # candidate three-predictor subsets to train manually.
    # NOTE(review): this list contains duplicates (e.g. ["AGE","DCAPS","PSA"],
    # ["AGE","PSA","VOL"]) and does not enumerate every 3-subset of my_x;
    # kept as-is to preserve the original comparison values.
    my_x3 = [["AGE", "RACE", "CAPSULE"], ["AGE", "RACE", "DCAPS"],
             ["AGE", "RACE", "PSA"], ["AGE", "RACE", "VOL"],
             ["AGE", "RACE", "DPROS"], ["AGE", "CAPSULE", "DCAPS"],
             ["AGE", "CAPSULE", "PSA"], ["AGE", "CAPSULE", "VOL"],
             ["AGE", "CAPSULE", "DPROS"], ["AGE", "DCAPS", "PSA"],
             ["AGE", "DCAPS", "PSA"], ["AGE", "DCAPS", "VOL"],
             ["AGE", "DCAPS", "DPROS"], ["AGE", "PSA", "VOL"],
             ["AGE", "PSA", "VOL"], ["AGE", "PSA", "DPROS"],
             ["AGE", "VOL", "DPROS"], ["RACE", "CAPSULE", "DCAPS"],
             ["RACE", "CAPSULE", "PSA"], ["RACE", "CAPSULE", "VOL"],
             ["RACE", "CAPSULE", "DPROS"], ["RACE", "DCAPS", "PSA"],
             ["RACE", "DCAPS", "VOL"], ["RACE", "DCAPS", "DPROS"],
             ["RACE", "PSA", "VOL"], ["RACE", "PSA", "DPROS"],
             ["RACE", "VOL", "DPROS"], ["CAPSULE", "DCAPS", "PSA"],
             ["CAPSULE", "DCAPS", "VOL"], ["CAPSULE", "DCAPS", "DPROS"],
             ["DCAPS", "PSA", "VOL"], ["DCAPS", "PSA", "DPROS"],
             ["DCAPS", "VOL", "DPROS"], ["PSA", "VOL", "DPROS"]]
    # FIX: these locals were misleadingly named two_pred_r2/best_r2_two_pred
    # in the original although every subset above has THREE predictors.
    three_pred_r2 = []
    for pred3 in my_x3:
        x = pred3
        m = glm(seed=12345)
        m.train(training_frame=d, x=x, y=my_y)
        three_pred_r2.append(m.r2())
    best_r2_three_pred = max(three_pred_r2)
    assert abs(best_r2_three_pred-best_r2_value_allsubsets[2]) < 1e-6, "expected best r2: {0}, allsubsets: actual best " \
                                                                     "r2:{1}.  They are different." \
                                                     "".format(best_r2_three_pred, best_r2_value_allsubsets[2])
    assert abs(best_r2_three_pred-best_r2_value_maxr[2]) < 1e-6, "expected best r2: {0}, maxr: actual best " \
                                                                     "r2:{1}.  They are different." \
                                                                     "".format(best_r2_three_pred, best_r2_value_maxr[2])
    assert abs(best_r2_value_allsubsets[2]-best_r2_value_maxr[2]) < 1e-6, "allsubset best r2: {0}, maxr: actual best " \
                                                               "r2:{1}.  They are different." \
                                                               "".format(best_r2_value_allsubsets[2], best_r2_value_maxr[2])
    print("Best three predictors model uses predictors: {0}".format(
        best_predictor_names_allsubsets[2]))