def glm_alpha_array_with_lambda_search_cv():
    """Verify that a CV multinomial GLM with an alpha array and lambda_search
    yields the same regularization-path coefficients whether or not a
    validation frame is supplied.
    """
    # FIX: corrected "multiomial" typo in the log message
    print("Testing glm cross-validation with alpha array, lambda_search for multinomial models.")
    # read in the dataset and construct training set (and validation set)
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5"]
    for cname in enum_columns:
        # FIX: the original assigned each column to itself (a no-op); the
        # intent, matching sibling tests, is to convert these to factors.
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    # FIX: list.remove() mutates in place and returns None, so myX used to be
    # None; build the predictor list explicitly instead.
    myX = h2o_data.names
    myX.remove(myY)
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]
    # build model with CV but no validation dataset
    cv_model = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    # raw and standardized coefficients must agree for every submodel
    for l in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l], cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l], cv_r_valid['coefficients_std'][l], tol=1e-6)
# Example #2
def buildModelCheckpointing(training_frame, x_indices, y_index, family,
                            solver):
    """Train a GLM capped at a few iterations, resume it from a checkpoint,
    and check the resumed coefficients against a model trained to completion.
    """
    frames = training_frame.split_frame(ratios=[0.9], seed=12345)
    train_part, valid_part = frames[0], frames[1]

    # short run: stop after 7 iterations
    base_model = H2OGeneralizedLinearEstimator(family=family,
                                               max_iterations=7,
                                               solver=solver)
    base_model.train(x=x_indices, y=y_index,
                     training_frame=train_part, validation_frame=valid_part)

    # resume training from the short run via checkpointing
    resumed_model = H2OGeneralizedLinearEstimator(family=family,
                                                  checkpoint=base_model.model_id,
                                                  solver=solver)
    resumed_model.train(x=x_indices, y=y_index,
                        training_frame=train_part, validation_frame=valid_part)

    # reference model: allow to run to completion
    reference_model = H2OGeneralizedLinearEstimator(family=family, solver=solver)
    reference_model.train(x=x_indices, y=y_index,
                          training_frame=train_part, validation_frame=valid_part)

    pyunit_utils.assertEqualCoeffDicts(resumed_model.coef(),
                                       reference_model.coef(),
                                       tol=5e-2)
def test_glm_backward_compare():
    """Backward-mode model selection that keeps every predictor must
    reproduce the coefficients of a plain binomial GLM on the same data.
    """
    tst_data = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    predictors = tst_data.columns[0:-1]
    response_col = 'response'
    weight = 'wt'
    min_predictor_num = 200
    # up-weight the positive responses 100x
    tst_data['wt'] = 1
    tst_data[tst_data['response'] == 1, 'wt'] = 100
    tst_data['response'] = tst_data['response'].asfactor()

    # backward selection stopped before any of the 200 predictors are dropped
    backward_model = H2OModelSelectionEstimator(
        family='binomial',
        weights_column=weight,
        mode='backward',
        min_predictor_number=min_predictor_num)
    backward_model.train(predictors, response_col, training_frame=tst_data)

    # reference: ordinary unregularized GLM
    glm_model = H2OGeneralizedLinearEstimator(family='binomial',
                                              lambda_=0,
                                              compute_p_values=True,
                                              weights_column=weight)
    glm_model.train(predictors, response_col, training_frame=tst_data)

    pyunit_utils.assertEqualCoeffDicts(glm_model.coef(),
                                       backward_model.coef()[0],
                                       tol=1e-6)
# Example #4
def glm_alpha_array_lambda_null():
    """Train a binomial GLM with an alpha array (default lambdas) and verify
    that every submodel on the regularization path can be reproduced by a
    standalone GLM with the same (alpha, lambda): coefficients, explained
    deviance, and (for the best submodel) training metrics must all match.
    """
    # first test: compare coefficients and deviance
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', alpha=[0.1, 0.5, 0.9], solver='COORDINATE_DESCENT')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    # rebuild the best submodel from its path coefficients; its explained
    # deviance must match the value recorded in the path
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6
    for l in range(0, len(r['lambdas'])):
        # retrain a standalone GLM at this submodel's (alpha, lambda)
        m = glm(family='binomial', alpha=[r['alphas'][l]], Lambda=[r['lambdas'][l]], solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef())
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm())
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr)
        if l == best_submodel_index:  # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"])
        else:  # for other submodel, should have worse residual_deviance() than best submodel
            # FIX: corrected "lowerest" typo in the assertion message
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowest " \
                                                                    "residual_deviance()!"
def grab_lambda_min():
    """Run a lambda-search GLM on the Boston housing data (cold_start=True)
    and verify each submodel along the regularization path can be rebuilt
    exactly from its recorded (alpha, lambda) pair.
    """
    boston = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))

    # predictors are every column but the last; the response is "medv",
    # the median value of owner-occupied homes in $1000's
    predictors = boston.columns[:-1]
    response = "medv"

    # chas = Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    boston['chas'] = boston['chas'].asfactor()

    # split into train and validation sets
    train, valid = boston.split_frame(ratios=[.8], seed=1234)
    boston_glm = H2OGeneralizedLinearEstimator(lambda_search=True, seed=1234, cold_start=True)
    boston_glm.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
    reg_path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(boston_glm)

    for idx, lamb in enumerate(reg_path['lambdas']):
        # rebuild the submodel at this exact (alpha, lambda)
        submodel = H2OGeneralizedLinearEstimator(alpha=[reg_path['alphas'][idx]],
                                                 Lambda=lamb,
                                                 solver='COORDINATE_DESCENT')
        submodel.train(x=predictors, y=response, training_frame=train, validation_frame=valid)
        print("comparing coefficients for submodel {0}".format(idx))
        pyunit_utils.assertEqualCoeffDicts(reg_path['coefficients'][idx], submodel.coef(), tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(reg_path['coefficients_std'][idx], submodel.coef_norm(), tol=1e-6)
def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver,
                            cold_start):
    """Resume a lambda-search GLM from a checkpoint and verify its
    coefficients match an identically-configured model trained to completion.
    """
    train_part, valid_part = training_frame.split_frame(ratios=[0.9], seed=12345)

    def _fit(**extra):
        # build and train one GLM sharing the common configuration
        est = H2OGeneralizedLinearEstimator(family=family,
                                            solver=solver,
                                            lambda_search=True,
                                            cold_start=cold_start,
                                            **extra)
        est.train(x=x_indices, y=y_index,
                  training_frame=train_part, validation_frame=valid_part)
        return est

    short_model = _fit(max_iterations=3)                    # stopped early
    resumed_model = _fit(checkpoint=short_model.model_id)   # resumed from checkpoint
    full_model = _fit()                                     # run to completion

    pyunit_utils.assertEqualCoeffDicts(resumed_model.coef(),
                                       full_model.coef(),
                                       tol=1e-6)
# Example #7
def glm_alpha_lambda_arrays_cv():
    """Verify that CV gaussian GLMs with alpha/lambda arrays make the same
    submodel choices with and without a validation frame, since selection
    should be driven by cross-validation metrics in both cases.
    """
    # FIX: the message said "binomial" but the data and family are gaussian
    print("Testing glm cross-validation with alpha array, lambda array for gaussian models.")
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # FIX: the original assigned each column to itself (a no-op); the
        # intent, matching sibling tests, is to convert these to factors.
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    # FIX: list.remove() mutates in place and returns None, so myX used to be
    # None; build the predictor list explicitly instead.
    myX = h2o_data.names
    myX.remove(myY)
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # choices made in model_all and model_xval should be the same since they should be using xval metrics
    model_all = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9], nfolds=3, cold_start=True)
    model_all.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model_all_rpath = glm.getGLMRegularizationPath(model_all)
    model_xval = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9], nfolds=3, cold_start=True)
    model_xval.train(x=myX, y=myY, training_frame=training_data)
    model_xval_rpath = glm.getGLMRegularizationPath(model_xval)

    for l in range(len(model_all_rpath['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients'][l], model_xval_rpath['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients_std'][l], model_xval_rpath['coefficients_std'][l], tol=1e-6)
# Example #8
def glm_alpha_arrays_null_lambda_cv():
    """Verify that a CV binomial GLM with an alpha array (default lambdas)
    yields the same regularization-path coefficients whether or not a
    validation frame is supplied.
    """
    print("Testing glm cross-validation with alpha array, default lambda values for binomial models.")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # FIX: the original assigned each column to itself (a no-op); the
        # intent, matching sibling tests, is to convert these to factors.
        h2o_data[cname] = h2o_data[cname].asfactor()
    myY = "C21"
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    # FIX: list.remove() mutates in place and returns None, so myX used to be
    # None; build the predictor list explicitly instead.
    myX = h2o_data.names
    myX.remove(myY)
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build model with CV but no validation dataset
    cv_model = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)
    # build model with CV and with validation dataset
    cv_model_valid = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for l in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l], cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l], cv_r_valid['coefficients_std'][l], tol=1e-6)
def glm_alpha_lambda_arrays():
    """Lambda-search binomial GLM with an alpha array and a validation frame:
    every submodel on the regularization path must be reproducible by a
    standalone GLM, and the best submodel must have the highest explained
    validation deviance.
    """
    # compare coefficients and deviance when only training dataset is available
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        train[ind] = train[ind].asfactor()
    train["C21"] = train["C21"].asfactor()
    frames = train.split_frame(ratios=[0.8], seed=12345)
    d = frames[0]
    d_test = frames[1]
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]

    # compare results when validation dataset is present
    mLVal = glm(family='binomial',
                alpha=[0.1, 0.5],
                lambda_search=True,
                solver='COORDINATE_DESCENT',
                nlambdas=3)  # train with validations set
    mLVal.train(training_frame=d,
                x=list(range(20)),
                y=20,
                validation_frame=d_test)
    rVal = glm.getGLMRegularizationPath(mLVal)
    best_submodel_indexVal = mLVal._model_json["output"]["best_submodel_index"]
    # rebuild the best submodel from its path coefficients; its explained
    # validation deviance must match the value recorded in the path
    m2Val = glm.makeGLMModel(
        model=mLVal, coefs=rVal['coefficients'][best_submodel_indexVal])
    dev1Val = rVal['explained_deviance_valid'][best_submodel_indexVal]
    p2Val = m2Val.model_performance(d_test)
    dev2Val = 1 - p2Val.residual_deviance() / p2Val.null_deviance()
    assert abs(dev1Val - dev2Val) < 1e-6
    for l in range(0, len(rVal['lambdas'])):
        # retrain a standalone GLM at this submodel's (alpha, lambda)
        m = glm(family='binomial',
                alpha=[rVal['alphas'][l]],
                Lambda=rVal['lambdas'][l],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d,
                x=list(range(20)),
                y=20,
                validation_frame=d_test)
        mr = glm.getGLMRegularizationPath(m)
        p = m.model_performance(d_test)
        cs = rVal['coefficients'][l]
        cs_norm = rVal['coefficients_std'][l]
        print("Comparing submodel index {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-1)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), tol=1e-1)
        pyunit_utils.assertEqualRegPaths(regKeys, rVal, l, mr, tol=1e-3)
        dVal = 1 - p.residual_deviance() / p.null_deviance()
        if l == best_submodel_indexVal:  # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["validation_metrics"],
                mLVal._model_json["output"]["validation_metrics"],
                tol=1e-2)
        else:  # for other submodel, should have worse residual_deviance() than best submodel
            # FIX: the message was formatted with .format(l) but contained no
            # placeholder; add {0} so the failing submodel index is reported
            assert dVal <= dev2Val, \
                "Best submodel does not have highest explained deviance_valid for submodel: {0}!".format(l)
def test_gridsearch():
    """Run two Cartesian GAM grid searches whose hyper-parameter subspaces are
    identical except for how single gam columns are written (nested
    ``[["c_0"]]`` vs flat ``"c_0"``); matching grid entries must yield
    identical coefficients.
    """
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    h2o_data['response'] = h2o_data['response'].asfactor()
    h2o_data['C3'] = h2o_data['C3'].asfactor()
    h2o_data['C7'] = h2o_data['C7'].asfactor()
    h2o_data['C8'] = h2o_data['C8'].asfactor()
    h2o_data['C10'] = h2o_data['C10'].asfactor()
    names = h2o_data.names
    myY = "response"
    # FIX: list.remove() mutates in place and returns None, so myX used to be
    # None; remove the response in place and use the remaining names.
    names.remove(myY)
    myX = names
    search_criteria = {'strategy': 'Cartesian'}
    hyper_parameters = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    # same subspaces, but single gam columns written flat instead of nested
    hyper_parameters2 = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(
        family="binomial", keep_gam_cols=True),
                              hyper_params=hyper_parameters,
                              search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(
        family="binomial", keep_gam_cols=True),
                               hyper_params=hyper_parameters2,
                               search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)
    # compare two models by checking their coefficients.  They should be the same
    for index in range(0, len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(),
                                           model2.coef(),
                                           tol=1e-6)
# Example #11
def test_gam_model_predict():
    """Multinomial GAM trained with cross-validation must yield the same
    coefficients whether or not a validation frame is supplied.
    """
    covtype_df = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/covtype/covtype.full.csv"))
    train, valid = covtype_df.split_frame([0.9], seed=1234)

    # last column is Cover_Type, our desired response variable;
    # everything else is a predictor
    covtype_X = covtype_df.col_names[:-1]
    covtype_y = covtype_df.col_names[-1]

    # configuration shared by both runs
    shared_args = dict(family='multinomial',
                       solver='IRLSM',
                       gam_columns=["Slope"],
                       scale=[0.0001],
                       num_knots=[5],
                       standardize=True,
                       nfolds=2,
                       fold_assignment='modulo',
                       alpha=[0.9, 0.5, 0.1],
                       lambda_search=True,
                       nlambdas=5,
                       max_iterations=3)

    # build model with cross validation and no validation dataset
    gam_multi = H2OGeneralizedAdditiveEstimator(**shared_args)
    gam_multi.train(covtype_X, covtype_y, training_frame=train)

    # build model with cross validation and with validation dataset
    gam_multi_valid = H2OGeneralizedAdditiveEstimator(**shared_args)
    gam_multi_valid.train(covtype_X,
                          covtype_y,
                          training_frame=train,
                          validation_frame=valid)

    # model should yield the same coefficients in both cases
    gam_multi_coef = gam_multi.coef()
    gam_multi_valid_coef = gam_multi_valid.coef()
    pyunit_utils.assertEqualCoeffDicts(gam_multi_coef['coefficients'],
                                       gam_multi_valid_coef['coefficients'])
# Example #12
def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver,
                            cold_start, nlambdas):
    """Resume a lambda-search GLM (with a fixed nlambdas) from a checkpoint
    and compare each coefficient class against a run-to-completion model.
    """
    parts = training_frame.split_frame(ratios=[0.9], seed=12345)

    def _train(estimator):
        # train one GLM on the 90/10 split and return it
        estimator.train(x=x_indices, y=y_index,
                        training_frame=parts[0], validation_frame=parts[1])
        return estimator

    common = dict(family=family, solver=solver, lambda_search=True,
                  cold_start=cold_start, nlambdas=nlambdas)
    # stopped early after 3 iterations
    short_model = _train(H2OGeneralizedLinearEstimator(max_iterations=3, **common))
    # resumed from the short run's checkpoint
    resumed = _train(H2OGeneralizedLinearEstimator(checkpoint=short_model.model_id, **common))
    # allow to run to completion
    completed = _train(H2OGeneralizedLinearEstimator(**common))

    checkpointCoef = resumed.coef()
    longCoef = completed.coef()
    # multinomial-style coef dict: compare every class's coefficient map
    for key in longCoef:
        pyunit_utils.assertEqualCoeffDicts(checkpointCoef[key],
                                           longCoef[key],
                                           tol=1e-6)
    def match_models(self):
        """For every manually-built GAM model, find the grid-search model with
        identical hyper-parameters (gam_columns, scale, num_knots, bs, lambda)
        and assert both produced the same coefficients.

        Raises:
            AssertionError: if coefficients differ, or if the number of
                matched grid models is not the expected count.
        """
        for model in self.manual_gam_models:
            # hyper-parameters that identify this manual model in the grid
            scale = model.actual_params['scale']
            gam_columns = model.actual_params['gam_columns']
            num_knots = model.actual_params['num_knots']
            lambda_ = model.actual_params['lambda']
            bsVal = model.actual_params['bs']
            # linear scan for the grid model whose params all match; stop at
            # the first match (each manual model is counted at most once)
            for grid_search_model in self.h2o_model.models:
                if grid_search_model.actual_params['gam_columns'] == gam_columns \
                    and grid_search_model.actual_params['scale'] == scale \
                    and grid_search_model.actual_params['num_knots'] == num_knots \
                    and grid_search_model.actual_params['bs'] == bsVal \
                    and grid_search_model.actual_params['lambda'] == lambda_:
                    self.num_grid_models += 1
                    print("grid model number "+str(self.num_grid_models))
                    print("gridSearch model coefficients")
                    print(grid_search_model.coef())
                    print("manual model coefficients")
                    print(model.coef())
                    pyunit_utils.assertEqualCoeffDicts(grid_search_model.coef(), model.coef(), tol=1e-6)
                    break

        assert self.num_grid_models == self.num_expected_models, "Grid search model parameters incorrect or incorrect number of models generated"
def set_glm_startvals():
    """Exercise GLM's startval option: start values matching GLM's own default
    reproduce the default fit, different start values change the result, and
    a wrong-length start vector raises an error.
    """
    # baseline binomial GLM on the prostate data
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    mLcoeff = mL.coef()
    rcoeff = glm.getGLMRegularizationPath(mL)["coefficients"][0]

    # GLM's default start: zeros plus the logit of the response mean
    responseMean = d[1].mean()
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startval1 = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    # a different start: the already-converged path coefficients
    startval2 = [rcoeff[k] for k in ("AGE", "RACE", "DPROS", "DCAPS",
                                     "PSA", "VOL", "GLEASON", "Intercept")]
    startvalBad = [0, 0]  # deliberately the wrong length

    # same starting condition as GLM -> coefficients should be identical
    ml1 = glm(family="binomial", startval=startval1)
    ml1.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml1.coef(), tol=1e-6)

    # different starting condition -> coefficients should differ
    ml2 = glm(family="binomial", startval=startval2)
    ml2.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    ml2Coeff = ml2.coef()
    try:
        pyunit_utils.assertEqualCoeffDicts(mLcoeff, ml2Coeff, tol=1e-6)
        assert False, "Should have thrown an error as coefficients are different!"
    except Exception as ex:
        print(ex)

    # malformed start values -> training must fail
    try:
        mlbad = glm(family="binomial", startval=startvalBad)
        mlbad.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        assert False, "Should have thrown an error with bad GLM initial values!"
    except Exception as ex:
        print(ex)
        print("Test completed!  Success!")
def test_gam_model_predict():
    """Binomial GAM trained with cross-validation must produce the same
    coefficients with and without a validation frame.
    """
    train = h2o.import_file(pyunit_utils.locate("bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))
    valid = h2o.import_file(pyunit_utils.locate("bigdata/laptop/model_selection/backwardBinomial200C50KRows.csv"))

    x = ["C1", "C2", "C3"]
    y = "response"
    for frame in (train, valid):
        frame[y] = frame[y].asfactor()

    # configuration shared by both runs
    gam_args = dict(family='binomial', solver='IRLSM', gam_columns=["C4"],
                    scale=[0.0001], num_knots=[5], standardize=True, nfolds=2,
                    fold_assignment='modulo', alpha=[0.9, 0.5, 0.1],
                    lambda_search=True, nlambdas=5, max_iterations=3, bs=[2],
                    seed=12345)

    # build model with cross validation and with validation dataset
    gam_model_valid = H2OGeneralizedAdditiveEstimator(**gam_args)
    gam_model_valid.train(x, y, training_frame=train, validation_frame=valid)

    # build model with cross validation and no validation dataset
    gam_model = H2OGeneralizedAdditiveEstimator(**gam_args)
    gam_model.train(x, y, training_frame=train)

    # both runs must yield identical coefficients
    pyunit_utils.assertEqualCoeffDicts(gam_model.coef(), gam_model_valid.coef())
def glm_alpha_lambda_arrays():
    """Train a warm-started (cold_start=False) binomial GLM over alpha/lambda
    arrays and verify every submodel on the regularization path can be rebuilt
    by a standalone GLM seeded with the appropriate start values; the best
    submodel must also have the lowest residual deviance.
    """
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial',
             Lambda=[0.9, 0.5, 0.1],
             alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT',
             cold_start=False)
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    # rebuild the best submodel from its path coefficients; its explained
    # deviance must match the value recorded in the path
    m2 = glm.makeGLMModel(model=mL,
                          coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6
    # warm start: GLM's default initial state is zeros plus the logit of the
    # response mean for the intercept
    responseMean = d[1].mean()
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startValInit = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startVal = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    orderedCoeffNames = [
        "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"
    ]
    for l in range(0, len(r['lambdas'])):
        # retrain a standalone GLM at this (alpha, lambda), warm-started the
        # same way the path model was
        m = glm(family='binomial',
                alpha=[r['alphas'][l]],
                Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT',
                startval=startVal)
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)

        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-3)
        # FIX: pass tol by keyword, consistent with every sibling call
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), tol=1e-3)
        if (l + 1) < len(
                r['lambdas']) and r['alphas'][l] != r['alphas'][l + 1]:
            # alpha changes at the next submodel: restart from the default
            startVal = startValInit
        else:
            startVal = pyunit_utils.extractNextCoeff(
                cs_norm, orderedCoeffNames,
                startVal)  # prepare startval for next round

        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr, tol=1e-4)
        if (l == best_submodel_index
            ):  # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-4)
        else:  # for other submodel, should have worse residual_deviance() than best submodel
            # FIX: corrected "lowerest" typo in the assertion message
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowest " \
                                                                    "residual_deviance()!"
# Example #17
def test_multinomial_alpha():
    """Check that generate_scoring_history does not change multinomial GLM
    coefficients: for each configuration (lambda search on/off, CV on/off)
    train one model with scoring-history generation on and one with it off,
    and require identical coefficients.

    FIX: the original built model1 and model2 with identical parameters
    (both generate_scoring_history=True), comparing a model with itself even
    though every comment said "generate_scoring_history on and off"; model2
    now uses generate_scoring_history=False. The unused col_list_compare
    local was removed.
    """
    print("Preparing dataset....")
    h2o_data = h2o.import_file(
        pyunit_utils.locate(
            "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"
        ))
    # categorical predictors and the response
    for col in ("C1", "C2", "C3", "C4", "C5", "C11"):
        h2o_data[col] = h2o_data[col].asfactor()
    splits_frames = h2o_data.split_frame(ratios=[.8], seed=1234)
    training_data = splits_frames[0]
    test_data = splits_frames[1]
    X = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    Y = "C11"

    def build_and_compare(**params):
        # train two otherwise-identical models, toggling only
        # generate_scoring_history, and assert equal per-class coefficients
        model1 = glm(family="multinomial",
                     alpha=[0, 0.2, 0.5, 0.8, 1],
                     generate_scoring_history=True,
                     **params)
        model1.train(x=X,
                     y=Y,
                     training_frame=training_data,
                     validation_frame=test_data)
        model2 = glm(family="multinomial",
                     alpha=[0, 0.2, 0.5, 0.8, 1],
                     generate_scoring_history=False,
                     **params)
        model2.train(x=X,
                     y=Y,
                     training_frame=training_data,
                     validation_frame=test_data)
        coef1 = model1.coef()
        coef2 = model2.coef()
        for key in coef1.keys():
            pyunit_utils.assertEqualCoeffDicts(coef1[key], coef2[key], tol=1e-6)

    print("Building model with score_each_iteration turned on.")
    # test with lambda search on, generate_scoring_history on and off
    build_and_compare(lambda_search=True)

    # test with lambda search off, generate_scoring_history on and off
    build_and_compare(lambda_search=False, Lambda=[0, 0.1, 0.01, 0.001])

    # test with lambda search on, generate_scoring_history on and off, cv on
    build_and_compare(lambda_search=True, nfolds=2, seed=12345)

    # test with lambda search off, generate_scoring_history on and off, cv on
    build_and_compare(lambda_search=False,
                      nfolds=2,
                      seed=12345,
                      Lambda=[0, 0.1, 0.01, 0.001])