def test_makeGLMModel():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
        assert False, "Should have throw exception of bad coefficient length"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
def test_glm_multinomial_makeGLMModel():
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)
    rank = check_nonzero_coefs(r['coefficients'][0])
    assert rank == mL._model_json["output"]["rank"], "expected rank: {0}, actual rank: {1}." \
                                                     "".format(rank, mL._model_json["output"]["rank"])
    m2 = glm.makeGLMModel(
        model=mL, coefs=r['coefficients']
        [0])  # model generated from setting coefficients to model
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs[
        "wendy_dreams"] = 0.123  # add extra coefficients to model coefficient

    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
        assert False, "Should have thrown an exception!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
           ("model coefficient length 371 is different from coefficient provided by user") in temp, \
            "Wrong exception was received."
        print("glm Multinomial makeGLMModel test completed!")
def testOrdinalLogit():
    Dtrain = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"
        ))
    Dtrain["C21"] = Dtrain["C21"].asfactor()

    print("Fit model on dataset")
    model = glm(family="ordinal",
                alpha=[0.5],
                lambda_=[0.001],
                max_iterations=1000,
                beta_epsilon=1e-8,
                objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)
    m2 = glm.makeGLMModel(
        model=model, coefs=r['coefficients']
        [0])  # model generated from setting coefficients to model
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)
    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415

    try:
        glm.makeGLMModel(model=model, coefs=coefs)
        assert False, "Should have thrown an exception!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
def test_makeGLMModel():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial',
            Lambda=[0.001],
            alpha=[0.5],
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8

    try:
        glm.makeGLMModel(model=m, coefs=coefs)
        assert False, "Test failed: should have throw exception of bad coefficient length!"
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user ") in temp,\
            "Wrong exception was received."
        print("makeGLMModel test passed!")
def reg_path_glm():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', lambda_search=True, solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1 - p.residual_deviance() / p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6
    for l in range(0, len(r['lambdas'])):
        m = glm(family='binomial',
                lambda_search=False,
                Lambda=r['lambdas'][l],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        diff = 0
        diff2 = 0
        for n in cs.keys():
            diff = max(diff, abs((cs[n] - m.coef()[n])))
            diff2 = max(diff2, abs((cs_norm[n] - m.coef_norm()[n])))
        print(diff)
        print(diff2)
        assert diff < 1e-2
        assert diff2 < 1e-2
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4
def reg_path_glm():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial',lambda_search=True,solver='COORDINATE_DESCENT')
    m.train(training_frame=d,x=[2,3,4,5,6,7,8],y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m,coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1-p.residual_deviance()/p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6
    for l in range(0,len(r['lambdas'])):
        m = glm(family='binomial',lambda_search=False,Lambda=r['lambdas'][l],solver='COORDINATE_DESCENT')
        m.train(training_frame=d,x=[2,3,4,5,6,7,8],y=1)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        diff = 0
        diff2 = 0
        for n in cs.keys():
            diff = max(diff,abs((cs[n] - m.coef()[n])))
            diff2 = max(diff2,abs((cs_norm[n] - m.coef_norm()[n])))
        print(diff)
        print(diff2)
        assert diff < 1e-2
        assert diff2 < 1e-2
        p = m.model_performance(d)
        devm = 1-p.residual_deviance()/p.null_deviance()
        devn = r['explained_deviance_train'][l]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4
Beispiel #7
0
def glm_alpha_array_lambda_null():
    # first test: compare coefficients and deviance
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial',alpha=[0.1,0.5,0.9],solver='COORDINATE_DESCENT')
    mL.train(training_frame=d,x=[2,3,4,5,6,7,8],y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL,coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1-p2.residual_deviance()/p2.null_deviance()
    print(dev1," =?= ",dev2)
    assert abs(dev1 - dev2) < 1e-6
    for l in range(0,len(r['lambdas'])):
        m = glm(family='binomial',alpha=[r['alphas'][l]],Lambda=[r['lambdas'][l]],solver='COORDINATE_DESCENT')
        m.train(training_frame=d,x=[2,3,4,5,6,7,8],y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef())
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm())
        p = m.model_performance(d)
        devm = 1-p.residual_deviance()/p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr)
        if (l == best_submodel_index): # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(m._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"])
        else: # for other submodel, should have worse residual_deviance() than best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowerest " \
                                                                    "residual_deviance()!"
def test_negBinomial_makeGLMModel():
    print("Read in prostate data.")
    h2o_data = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/prostate/prostate_complete.csv.zip"))
    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    thetas = [0.000000001, 0.01, 0.1, 0.5, 1]
    for thetaO in thetas:
        h2o_model_log = H2OGeneralizedLinearEstimator(
            family="negativebinomial",
            link="log",
            alpha=0.5,
            Lambda=0.0001,
            theta=thetaO)
        h2o_model_log.train(x=myX, y=myY, training_frame=h2o_data)
        predictModel = h2o_model_log.predict(h2o_data)
        r = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(
            h2o_model_log)
        makeModel = H2OGeneralizedLinearEstimator.makeGLMModel(
            model=h2o_model_log, coefs=r['coefficients']
            [0])  # model generated from setting coefficients to model
        predictMake = makeModel.predict(h2o_data)
        pyunit_utils.compare_frames_local(predictModel, predictMake, prob=1)
def glm_alpha_lambda_arrays():
    # compare coefficients and deviance when only training dataset is available
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        train[ind] = train[ind].asfactor()
    train["C21"] = train["C21"].asfactor()
    frames = train.split_frame(ratios=[0.8], seed=12345)
    d = frames[0]
    d_test = frames[1]
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]

    # compare results when validation dataset is present
    mLVal = glm(family='binomial',
                alpha=[0.1, 0.5],
                lambda_search=True,
                solver='COORDINATE_DESCENT',
                nlambdas=3)  # train with validations set
    mLVal.train(training_frame=d,
                x=list(range(20)),
                y=20,
                validation_frame=d_test)
    rVal = glm.getGLMRegularizationPath(mLVal)
    best_submodel_indexVal = mLVal._model_json["output"]["best_submodel_index"]
    m2Val = glm.makeGLMModel(
        model=mLVal, coefs=rVal['coefficients'][best_submodel_indexVal])
    dev1Val = rVal['explained_deviance_valid'][best_submodel_indexVal]
    p2Val = m2Val.model_performance(d_test)
    dev2Val = 1 - p2Val.residual_deviance() / p2Val.null_deviance()
    assert abs(dev1Val - dev2Val) < 1e-6
    for l in range(0, len(rVal['lambdas'])):
        m = glm(family='binomial',
                alpha=[rVal['alphas'][l]],
                Lambda=rVal['lambdas'][l],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d,
                x=list(range(20)),
                y=20,
                validation_frame=d_test)
        mr = glm.getGLMRegularizationPath(m)
        p = m.model_performance(d_test)
        cs = rVal['coefficients'][l]
        cs_norm = rVal['coefficients_std'][l]
        print("Comparing submodel index {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-1)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), tol=1e-1)
        pyunit_utils.assertEqualRegPaths(regKeys, rVal, l, mr, tol=1e-3)
        dVal = 1 - p.residual_deviance() / p.null_deviance()
        if l == best_submodel_indexVal:  # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["validation_metrics"],
                mLVal._model_json["output"]["validation_metrics"],
                tol=1e-2)
        else:  # for other submodel, should have worse residual_deviance() than best submodel
            assert dVal <= dev2Val, "Best submodel does not have highest explained deviance_valid for submodel: !".format(
                l)
Beispiel #10
0
def generate_dataset(family, nrow, ncol, realFrac, intFrac, enumFrac,
                     missingFrac, factorRange, numericRange, targetFactor):
    if family == "binomial":
        responseFactor = 2
    elif family == 'multinomial' or family == 'ordinal':
        responseFactor = targetFactor
    else:
        responseFactor = 1

    trainData = random_dataset(nrow,
                               ncol,
                               realFrac=realFrac,
                               intFrac=intFrac,
                               enumFrac=enumFrac,
                               factorR=factorRange,
                               integerR=numericRange,
                               responseFactor=responseFactor,
                               misFrac=missingFrac)
    if family == 'poisson':
        trainData['response'] = trainData['response'] + numericRange
    myX = trainData.names
    myY = 'response'
    myX.remove(myY)

    m = glm(
        family=family,
        max_iterations=1,
        interactions=["C1", "C2"],
        tweedie_link_power=2,
        tweedie_variance_power=0.4,
    )
    m.train(training_frame=trainData, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    coeffDict = r['coefficients'][0]
    coeffLen = len(coeffDict)
    randCoeffVals = np.random.uniform(low=-3, high=3, size=coeffLen).tolist()
    keyset = coeffDict.keys()
    count = 0
    for key in keyset:
        coeffDict[key] = randCoeffVals[count]
        count = count + 1

    m2 = glm.makeGLMModel(
        model=m,
        coefs=coeffDict)  # model generated from setting coefficients to model
    f2 = m2.predict(trainData)

    finalDataset = trainData[myX]
    finalDataset = finalDataset.cbind(f2[0])
    finalDataset.set_name(col=finalDataset.ncols - 1, name='response')

    return finalDataset
def glm_alpha_lambda_arrays():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial',
             Lambda=[0.9, 0.5, 0.1],
             alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL,
                          coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    assert abs(dev1 - dev2) < 1e-6
    for l in range(0, len(r['lambdas'])):
        m = glm(family='binomial',
                alpha=[r['alphas'][l]],
                Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        diff = 0
        diff2 = 0
        for n in cs.keys():
            diff = max(diff, abs((cs[n] - m.coef()[n])))
            diff2 = max(diff2, abs((cs_norm[n] - m.coef_norm()[n])))
        assert diff < 1e-2
        assert diff2 < 1e-2
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr, tol=1e-5)
        if (l == best_submodel_index
            ):  # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-5)
        else:  # for other submodel, should have worse residual_deviance() than best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowerest " \
                                                                    "residual_deviance()!"
def glm_alpha_lambda_arrays():
    # read in the dataset and construct training set (and validation set)
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial',
             Lambda=[0.9, 0.5, 0.1],
             alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT',
             cold_start=False)
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL,
                          coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6
    responseMean = d[1].mean()
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startValInit = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    startVal = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    orderedCoeffNames = [
        "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"
    ]
    for l in range(0, len(r['lambdas'])):
        m = glm(family='binomial',
                alpha=[r['alphas'][l]],
                Lambda=[r['lambdas'][l]],
                solver='COORDINATE_DESCENT',
                startval=startVal)
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)

        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-3)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), 1e-3)
        if (l + 1) < len(
                r['lambdas']) and r['alphas'][l] != r['alphas'][l + 1]:
            startVal = startValInit
        else:
            startVal = pyunit_utils.extractNextCoeff(
                cs_norm, orderedCoeffNames,
                startVal)  # prepare startval for next round

        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, l, mr, tol=1e-4)
        if (l == best_submodel_index
            ):  # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-4)
        else:  # for other submodel, should have worse residual_deviance() than best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowerest " \
                                                                    "residual_deviance()!"