def test_makeGLMModel():
    """Exercise glm.makeGLMModel on a gaussian GLM trained on prostate.csv.

    Two things are checked:

    * a model built by makeGLMModel from the first coefficient set of the
      regularization path gives predictions that pass
      pyunit_utils.compare_frames_local against the trained model's;
    * passing a coefficient dict with one extra key makes makeGLMModel raise
      an error whose text mentions the server-side IllegalArgumentException
      and the coefficient-length mismatch.
    """
    # read in the dataset used for both training and prediction
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    # add a bogus coefficient so the dict no longer matches the model
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Kept outside the try body: an AssertionError raised inside it would
        # be caught by the handler above and reported as a "wrong exception".
        assert False, "Should have throw exception of bad coefficient length"
def test_glm_multinomial_makeGLMModel():
    """Exercise glm.makeGLMModel on a multinomial GLM (covtype 20k sample).

    Checks that:

    * the value returned by check_nonzero_coefs for the first coefficient set
      of the regularization path equals the rank stored in the model output;
    * a model rebuilt with makeGLMModel from those coefficients gives
      predictions that pass pyunit_utils.compare_frames_local against the
      trained model's;
    * a coefficient dict with one extra key is rejected with an error that
      mentions the server-side IllegalArgumentException and the length
      mismatch.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()  # response column must be categorical
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)

    rank = check_nonzero_coefs(r['coefficients'][0])
    model_rank = mL._model_json["output"]["rank"]
    assert rank == model_rank, \
        "expected rank: {0}, actual rank: {1}.".format(rank, model_rank)

    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][0])
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs["wendy_dreams"] = 0.123  # add extra coefficients to model coefficient
    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 371 is different from coefficient provided by user") in temp, \
            "Wrong exception was received."
        print("glm Multinomial makeGLMModel test completed!")
    else:
        # Kept outside the try body: an AssertionError raised inside it would
        # be caught by the handler above and reported as a "wrong exception".
        assert False, "Should have thrown an exception!"
def testOrdinalLogit():
    """Exercise glm.makeGLMModel on an ordinal GLM.

    Trains an ordinal model on the 20-predictor ordinal training set, rebuilds
    a model from the first coefficient set of its regularization path, and
    compares the two models' predictions with
    pyunit_utils.compare_frames_local.  Then checks that a coefficient dict
    with one extra key is rejected with an error that mentions the server-side
    IllegalArgumentException and the coefficient-length mismatch.
    """
    Dtrain = h2o.import_file(
        pyunit_utils.locate(
            "bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"))
    Dtrain["C21"] = Dtrain["C21"].asfactor()  # response must be categorical

    print("Fit model on dataset")
    model = glm(family="ordinal", alpha=[0.5], lambda_=[0.001],
                max_iterations=1000, beta_epsilon=1e-8,
                objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)

    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=model, coefs=r['coefficients'][0])
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)

    # add a bogus coefficient so the dict no longer matches the model
    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415
    try:
        glm.makeGLMModel(model=model, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Kept outside the try body: an AssertionError raised inside it would
        # be caught by the handler above and reported as a "wrong exception".
        assert False, "Should have thrown an exception!"
def test_makeGLMModel():
    """Exercise glm.makeGLMModel on a binomial GLM trained on prostate.csv.

    A model rebuilt from the first coefficient set of the regularization path
    is compared with the trained model on prediction column 1 via
    pyunit_utils.compare_frames_local.  Afterwards a coefficient dict with one
    extra key must be rejected with an error that mentions the server-side
    IllegalArgumentException and the coefficient-length mismatch.
    """
    # read in the dataset used for both training and prediction
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', Lambda=[0.001], alpha=[0.5],
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)

    # add a bogus coefficient so the dict no longer matches the model
    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("makeGLMModel test passed!")
    else:
        # Kept outside the try body: an AssertionError raised inside it would
        # be caught by the handler above and reported as a "wrong exception".
        assert False, "Test failed: should have throw exception of bad coefficient length!"
def reg_path_glm():
    """Check the regularization path of a lambda-search binomial GLM.

    First, a model rebuilt with makeGLMModel from path entry 10 must reproduce
    the explained training deviance recorded for that entry.  Then, for every
    lambda on the path, a GLM trained with that single lambda (no lambda
    search) must have coefficients, standardized coefficients and explained
    deviance close to the values stored in the path.
    """
    # read in the dataset used for both training and scoring
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', lambda_search=True,
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)

    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1 - p.residual_deviance() / p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6, \
        "explained deviance mismatch for path entry 10: {0} vs {1}".format(dev1, dev2)

    for index, lambda_value in enumerate(r['lambdas']):
        m = glm(family='binomial', lambda_search=False, Lambda=lambda_value,
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        cs = r['coefficients'][index]
        cs_norm = r['coefficients_std'][index]

        # Fetch the model coefficients once per model rather than once per
        # coefficient name inside the loop below.
        model_coefs = m.coef()
        model_coefs_norm = m.coef_norm()
        diff = 0
        diff2 = 0
        for name in cs.keys():
            diff = max(diff, abs(cs[name] - model_coefs[name]))
            diff2 = max(diff2, abs(cs_norm[name] - model_coefs_norm[name]))
        print(diff)
        print(diff2)
        assert diff < 1e-2, \
            "coefficient mismatch at path index {0}: {1}".format(index, diff)
        assert diff2 < 1e-2, \
            "standardized coefficient mismatch at path index {0}: {1}".format(index, diff2)

        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][index]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4, \
            "explained deviance mismatch at path index {0}: {1} vs {2}".format(index, devm, devn)
def reg_path_glm():
    """Check the regularization path of a lambda-search binomial GLM.

    A model rebuilt with makeGLMModel from path entry 10 must reproduce the
    explained training deviance recorded for that entry.  Every lambda on the
    path is then refitted on its own (lambda search off) and the resulting
    coefficients, standardized coefficients and explained deviance are
    compared with the values stored in the path.
    """
    # read in the dataset used for both training and scoring
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', lambda_search=True, solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)

    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1 - p.residual_deviance() / p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6, \
        "explained deviance mismatch for path entry 10: {0} vs {1}".format(dev1, dev2)

    for index, lambda_value in enumerate(r['lambdas']):
        m = glm(family='binomial', lambda_search=False, Lambda=lambda_value,
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        cs = r['coefficients'][index]
        cs_norm = r['coefficients_std'][index]

        # Fetch the model coefficients once per model rather than once per
        # coefficient name inside the loop below.
        model_coefs = m.coef()
        model_coefs_norm = m.coef_norm()
        diff = 0
        diff2 = 0
        for name in cs.keys():
            diff = max(diff, abs(cs[name] - model_coefs[name]))
            diff2 = max(diff2, abs(cs_norm[name] - model_coefs_norm[name]))
        print(diff)
        print(diff2)
        assert diff < 1e-2, \
            "coefficient mismatch at path index {0}: {1}".format(index, diff)
        assert diff2 < 1e-2, \
            "standardized coefficient mismatch at path index {0}: {1}".format(index, diff2)

        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][index]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4, \
            "explained deviance mismatch at path index {0}: {1} vs {2}".format(index, devm, devn)
def glm_alpha_array_lambda_null():
    """Binomial GLM given an alpha array and no lambda: verify its submodels.

    The model rebuilt from the best submodel's coefficients must reproduce the
    recorded explained training deviance.  Each (alpha, lambda) pair on the
    regularization path is then refitted on its own and compared with the path
    entry; the best submodel must additionally match the training metrics of
    the original model, and every other submodel must not have a lower
    residual deviance than the best one.
    """
    predictor_cols = [2, 3, 4, 5, 6, 7, 8]
    response_col = 1
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]

    def explained_deviance(perf):
        # fraction of the null deviance explained by the model
        return 1 - perf.residual_deviance() / perf.null_deviance()

    # first test: compare coefficients and deviance
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    full_model = glm(family='binomial', alpha=[0.1, 0.5, 0.9], solver='COORDINATE_DESCENT')
    full_model.train(training_frame=frame, x=list(predictor_cols), y=response_col)
    path = glm.getGLMRegularizationPath(full_model)
    best_idx = full_model._model_json["output"]["best_submodel_index"]

    rebuilt = glm.makeGLMModel(model=full_model, coefs=path['coefficients'][best_idx])
    path_dev = path['explained_deviance_train'][best_idx]
    best_perf = rebuilt.model_performance(frame)
    rebuilt_dev = explained_deviance(best_perf)
    print(path_dev, " =?= ", rebuilt_dev)
    assert abs(path_dev - rebuilt_dev) < 1e-6

    for idx, lambda_value in enumerate(path['lambdas']):
        single = glm(family='binomial',
                     alpha=[path['alphas'][idx]],
                     Lambda=[lambda_value],
                     solver='COORDINATE_DESCENT')
        single.train(training_frame=frame, x=list(predictor_cols), y=response_col)
        single_path = glm.getGLMRegularizationPath(single)

        pyunit_utils.assertEqualCoeffDicts(path['coefficients'][idx], single.coef())
        pyunit_utils.assertEqualCoeffDicts(path['coefficients_std'][idx], single.coef_norm())

        perf = single.model_performance(frame)
        model_dev = explained_deviance(perf)
        expected_dev = path['explained_deviance_train'][idx]
        assert abs(model_dev - expected_dev) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, path, idx, single_path)

        if idx != best_idx:
            # for other submodel, should have worse residual_deviance() than best submodel
            assert perf.residual_deviance() >= best_perf.residual_deviance(), \
                "Best submodel does not have lowerest " \
                "residual_deviance()!"
        else:
            # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                single._model_json["output"]["training_metrics"],
                full_model._model_json["output"]["training_metrics"])
def test_negBinomial_makeGLMModel():
    """Check makeGLMModel for negative-binomial GLMs over several theta values.

    For each theta a log-link negative-binomial model is trained on the
    prostate data; a second model is built by makeGLMModel from the first
    coefficient set of its regularization path, and the two models'
    predictions are compared with pyunit_utils.compare_frames_local.
    """
    estimator = H2OGeneralizedLinearEstimator

    print("Read in prostate data.")
    data_path = pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip")
    h2o_data = h2o.import_file(path=data_path)

    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    response = "GLEASON"
    predictors = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    for theta_value in (0.000000001, 0.01, 0.1, 0.5, 1):
        trained = estimator(family="negativebinomial",
                            link="log",
                            alpha=0.5,
                            Lambda=0.0001,
                            theta=theta_value)
        trained.train(x=predictors, y=response, training_frame=h2o_data)
        trained_pred = trained.predict(h2o_data)

        reg_path = estimator.getGLMRegularizationPath(trained)
        # model generated from setting coefficients to model
        rebuilt = estimator.makeGLMModel(model=trained,
                                         coefs=reg_path['coefficients'][0])
        rebuilt_pred = rebuilt.predict(h2o_data)
        pyunit_utils.compare_frames_local(trained_pred, rebuilt_pred, prob=1)
def glm_alpha_lambda_arrays():
    """Check alpha-array + lambda-search submodels when a validation frame is used.

    The data is split 80/20 into training and validation frames.  A binomial
    GLM is trained with two alpha values and a 3-lambda search; the model
    rebuilt from the best submodel's coefficients must reproduce the recorded
    explained validation deviance.  Each (alpha, lambda) pair on the path is
    then refitted on its own and compared with the path entry.  The best
    submodel must match the original model's validation metrics, and no other
    submodel may have a higher explained validation deviance.
    """
    # import the data and turn the first ten predictors and the response into factors
    train = h2o.import_file(path=pyunit_utils.locate(
        "smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for ind in range(10):
        train[ind] = train[ind].asfactor()
    train["C21"] = train["C21"].asfactor()
    frames = train.split_frame(ratios=[0.8], seed=12345)
    d = frames[0]
    d_test = frames[1]
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]

    # compare results when validation dataset is present
    mLVal = glm(family='binomial', alpha=[0.1, 0.5], lambda_search=True,
                solver='COORDINATE_DESCENT', nlambdas=3)
    # train with validation set
    mLVal.train(training_frame=d, x=list(range(20)), y=20,
                validation_frame=d_test)
    rVal = glm.getGLMRegularizationPath(mLVal)
    best_submodel_indexVal = mLVal._model_json["output"]["best_submodel_index"]
    m2Val = glm.makeGLMModel(
        model=mLVal, coefs=rVal['coefficients'][best_submodel_indexVal])
    dev1Val = rVal['explained_deviance_valid'][best_submodel_indexVal]
    p2Val = m2Val.model_performance(d_test)
    dev2Val = 1 - p2Val.residual_deviance() / p2Val.null_deviance()
    assert abs(dev1Val - dev2Val) < 1e-6

    for index in range(0, len(rVal['lambdas'])):
        m = glm(family='binomial', alpha=[rVal['alphas'][index]],
                Lambda=rVal['lambdas'][index], solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=list(range(20)), y=20,
                validation_frame=d_test)
        mr = glm.getGLMRegularizationPath(m)
        p = m.model_performance(d_test)
        cs = rVal['coefficients'][index]
        cs_norm = rVal['coefficients_std'][index]
        print("Comparing submodel index {0}".format(index))
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-1)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), tol=1e-1)
        pyunit_utils.assertEqualRegPaths(regKeys, rVal, index, mr, tol=1e-3)
        dVal = 1 - p.residual_deviance() / p.null_deviance()
        if index == best_submodel_indexVal:
            # check validation metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["validation_metrics"],
                mLVal._model_json["output"]["validation_metrics"],
                tol=1e-2)
        else:
            # for other submodel, should not explain more validation deviance
            # than the best submodel; the {0} placeholder was missing, so the
            # submodel index never appeared in the message
            assert dVal <= dev2Val, \
                "Best submodel does not have highest explained deviance_valid for submodel: {0}!".format(index)
def generate_dataset(family, nrow, ncol, realFrac, intFrac, enumFrac,
                     missingFrac, factorRange, numericRange, targetFactor):
    """Build a synthetic frame whose response comes from a GLM with random coefficients.

    A random dataset is generated with random_dataset, a one-iteration GLM of
    the requested family is fitted to it, its coefficients are overwritten
    with uniform random values in [-3, 3), and the first prediction column of
    the resulting model becomes the new 'response' column.

    :param family: GLM family name; 'binomial', 'multinomial', 'ordinal' and
        'poisson' get special handling, everything else is treated alike.
    :param nrow: passed to random_dataset as the first positional argument.
    :param ncol: passed to random_dataset as the second positional argument.
    :param realFrac: forwarded to random_dataset as realFrac.
    :param intFrac: forwarded to random_dataset as intFrac.
    :param enumFrac: forwarded to random_dataset as enumFrac.
    :param missingFrac: forwarded to random_dataset as misFrac.
    :param factorRange: forwarded to random_dataset as factorR.
    :param numericRange: forwarded to random_dataset as integerR; also added
        to the response when family is 'poisson'.
    :param targetFactor: response factor used for 'multinomial' and 'ordinal'.
    :return: frame holding the predictor columns plus the generated
        'response' column as its last column.
    """
    if family == "binomial":
        n_response_levels = 2
    elif family in ('multinomial', 'ordinal'):
        n_response_levels = targetFactor
    else:
        n_response_levels = 1

    trainData = random_dataset(nrow,
                               ncol,
                               realFrac=realFrac,
                               intFrac=intFrac,
                               enumFrac=enumFrac,
                               factorR=factorRange,
                               integerR=numericRange,
                               responseFactor=n_response_levels,
                               misFrac=missingFrac)
    if family == 'poisson':
        # shift the response by numericRange
        trainData['response'] = trainData['response'] + numericRange

    response_name = 'response'
    predictor_names = trainData.names
    predictor_names.remove(response_name)

    seed_model = glm(
        family=family,
        max_iterations=1,
        interactions=["C1", "C2"],
        tweedie_link_power=2,
        tweedie_variance_power=0.4,
    )
    seed_model.train(training_frame=trainData, x=predictor_names, y=response_name)

    reg_path = glm.getGLMRegularizationPath(seed_model)
    coeffDict = reg_path['coefficients'][0]
    random_values = np.random.uniform(low=-3, high=3, size=len(coeffDict)).tolist()
    # overwrite every fitted coefficient with its random replacement
    for coeff_name, coeff_value in zip(coeffDict.keys(), random_values):
        coeffDict[coeff_name] = coeff_value

    # model generated from setting coefficients to model
    random_model = glm.makeGLMModel(model=seed_model, coefs=coeffDict)
    predictions = random_model.predict(trainData)

    finalDataset = trainData[predictor_names].cbind(predictions[0])
    finalDataset.set_name(col=finalDataset.ncols - 1, name='response')
    return finalDataset
def glm_alpha_lambda_arrays():
    """Check the submodels of a binomial GLM given alpha and lambda arrays.

    The model rebuilt from the best submodel's coefficients must reproduce the
    recorded explained training deviance.  Each (alpha, lambda) pair on the
    regularization path is then refitted on its own and its coefficients,
    standardized coefficients, explained deviance and path entry are compared
    with the original path.  The best submodel must match the training metrics
    of the original model, and every other submodel must not have a lower
    residual deviance than the best one.
    """
    # read in the dataset used for both training and scoring
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', Lambda=[0.9, 0.5, 0.1], alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT')
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL,
                          coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    assert abs(dev1 - dev2) < 1e-6

    for index in range(0, len(r['lambdas'])):
        m = glm(family='binomial', alpha=[r['alphas'][index]],
                Lambda=[r['lambdas'][index]], solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][index]
        cs_norm = r['coefficients_std'][index]

        # Fetch the model coefficients once per model rather than once per
        # coefficient name inside the loop below.
        model_coefs = m.coef()
        model_coefs_norm = m.coef_norm()
        diff = 0
        diff2 = 0
        for name in cs.keys():
            diff = max(diff, abs(cs[name] - model_coefs[name]))
            diff2 = max(diff2, abs(cs_norm[name] - model_coefs_norm[name]))
        assert diff < 1e-2
        assert diff2 < 1e-2

        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][index]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, index, mr, tol=1e-5)
        if index == best_submodel_index:
            # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-5)
        else:
            # for other submodel, should have worse residual_deviance() than best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowerest " \
                                                                    "residual_deviance()!"
def glm_alpha_lambda_arrays():
    """Check alpha/lambda-array submodels of a GLM trained with cold_start=False.

    The model rebuilt from the best submodel's coefficients must reproduce the
    recorded explained training deviance.  Each (alpha, lambda) pair on the
    regularization path is then refitted on its own, seeded through
    ``startval``: the seed is the standardized path coefficients of the
    previous submodel, and is reset to the initial values (zeros plus the
    log-odds of the response mean as intercept) when the next submodel uses a
    different alpha.  The refit is compared with the path entry; the best
    submodel must match the original model's training metrics and every other
    submodel must not have a lower residual deviance than the best one.
    """
    # read in the dataset used for both training and scoring
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    mL = glm(family='binomial', Lambda=[0.9, 0.5, 0.1], alpha=[0.1, 0.5, 0.9],
             solver='COORDINATE_DESCENT', cold_start=False)
    mL.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]
    m2 = glm.makeGLMModel(model=mL,
                          coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(d)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1, " =?= ", dev2)
    assert abs(dev1 - dev2) < 1e-6

    # initial start values: zero coefficients, intercept = logit(mean response)
    responseMean = d[1].mean()
    initIntercept = math.log(responseMean / (1.0 - responseMean))
    startValInit = [0, 0, 0, 0, 0, 0, 0, initIntercept]
    # Work on a copy so startValInit itself is never handed to a helper.
    startVal = list(startValInit)
    # order in which coefficients are read out of cs_norm for startval
    # NOTE(review): presumably this must match the layout startval expects - confirm
    orderedCoeffNames = [
        "AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"
    ]
    num_submodels = len(r['lambdas'])
    for index in range(0, num_submodels):
        m = glm(family='binomial', alpha=[r['alphas'][index]],
                Lambda=[r['lambdas'][index]], solver='COORDINATE_DESCENT',
                startval=startVal)
        m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
        mr = glm.getGLMRegularizationPath(m)
        cs = r['coefficients'][index]
        cs_norm = r['coefficients_std'][index]
        pyunit_utils.assertEqualCoeffDicts(cs, m.coef(), tol=1e-3)
        pyunit_utils.assertEqualCoeffDicts(cs_norm, m.coef_norm(), 1e-3)

        if (index + 1) < num_submodels and r['alphas'][index] != r['alphas'][index + 1]:
            # Next submodel uses a new alpha: restart from the initial values.
            # A copy is taken because startVal is later passed to
            # extractNextCoeff; if that helper updates its argument in place
            # (not visible here), aliasing would overwrite startValInit and
            # later resets would no longer restore the initial values.
            startVal = list(startValInit)
        else:
            # prepare startval for next round
            startVal = pyunit_utils.extractNextCoeff(cs_norm,
                                                     orderedCoeffNames,
                                                     startVal)

        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][index]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, index, mr, tol=1e-4)
        if index == best_submodel_index:
            # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(
                m._model_json["output"]["training_metrics"],
                mL._model_json["output"]["training_metrics"],
                tol=1e-4)
        else:
            # for other submodel, should have worse residual_deviance() than best submodel
            assert p.residual_deviance() >= p2.residual_deviance(), "Best submodel does not have lowerest " \
                                                                    "residual_deviance()!"