def reg_path_glm():
    """Check the binomial GLM regularization path against individually trained models.

    A lambda-search model is trained on the prostate data and its regularization
    path is extracted.  The test then verifies that:

    * a model rebuilt with ``makeGLMModel`` from the coefficients of submodel 10
      reproduces the explained deviance stored in the path, and
    * for every lambda of the path, a model trained with that single lambda has
      coefficients (raw and standardized) and explained deviance close to the
      values stored in the path.
    """
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    predictors = [2, 3, 4, 5, 6, 7, 8]
    m = glm(family='binomial', lambda_search=True, solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=predictors, y=1)
    r = glm.getGLMRegularizationPath(m)

    # Rebuild a model from the coefficients of one submodel and compare deviances.
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1 - p.residual_deviance() / p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6, \
        "explained deviance of rebuilt model {0} differs from path value {1}".format(dev2, dev1)

    for l, lambda_value in enumerate(r['lambdas']):
        m = glm(family='binomial', lambda_search=False, Lambda=lambda_value,
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=predictors, y=1)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        # Fetch the coefficient dictionaries once instead of once per key.
        model_coefs = m.coef()
        model_coefs_norm = m.coef_norm()
        diff = 0
        diff2 = 0
        for n in cs:
            diff = max(diff, abs(cs[n] - model_coefs[n]))
            diff2 = max(diff2, abs(cs_norm[n] - model_coefs_norm[n]))
        print(diff)
        print(diff2)
        assert diff < 1e-2, "coefficients differ by {0} for submodel {1}".format(diff, l)
        assert diff2 < 1e-2, "standardized coefficients differ by {0} for submodel {1}".format(diff2, l)
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4, \
            "explained deviance {0} differs from path value {1} for submodel {2}".format(devm, devn, l)
def reg_path_glm():
    """Check the binomial GLM regularization path against individually trained models.

    Trains a lambda-search model on the prostate data, extracts its
    regularization path and asserts that (1) ``makeGLMModel`` with the
    coefficients of submodel 10 reproduces the stored explained deviance and
    (2) a model trained separately for each lambda of the path matches the
    stored coefficients, standardized coefficients and explained deviance.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    predictors = [2, 3, 4, 5, 6, 7, 8]
    m = glm(family='binomial',
            lambda_search=True,
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=predictors, y=1)
    r = glm.getGLMRegularizationPath(m)

    # Rebuild a model from the coefficients of one submodel and compare deviances.
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][10])
    dev1 = r['explained_deviance_train'][10]
    p = m2.model_performance(d)
    dev2 = 1 - p.residual_deviance() / p.null_deviance()
    assert abs(dev1 - dev2) < 1e-6, \
        "explained deviance of rebuilt model {0} differs from path value {1}".format(dev2, dev1)

    for l, lambda_value in enumerate(r['lambdas']):
        m = glm(family='binomial',
                lambda_search=False,
                Lambda=lambda_value,
                solver='COORDINATE_DESCENT')
        m.train(training_frame=d, x=predictors, y=1)
        cs = r['coefficients'][l]
        cs_norm = r['coefficients_std'][l]
        # The coefficient dictionaries do not change inside the key loop,
        # so look them up once per model rather than once per key.
        model_coefs = m.coef()
        model_coefs_norm = m.coef_norm()
        diff = 0
        diff2 = 0
        for n in cs:
            diff = max(diff, abs(cs[n] - model_coefs[n]))
            diff2 = max(diff2, abs(cs_norm[n] - model_coefs_norm[n]))
        print(diff)
        print(diff2)
        assert diff < 1e-2, "coefficients differ by {0} for submodel {1}".format(diff, l)
        assert diff2 < 1e-2, "standardized coefficients differ by {0} for submodel {1}".format(diff2, l)
        p = m.model_performance(d)
        devm = 1 - p.residual_deviance() / p.null_deviance()
        devn = r['explained_deviance_train'][l]
        print(devm)
        print(devn)
        assert abs(devm - devn) < 1e-4, \
            "explained deviance {0} differs from path value {1} for submodel {2}".format(devm, devn, l)
def shuffling_large():
    """Check that GLM coefficients are reproducible and independent of row order.

    Three binomial lambda-search models are trained on the Arcene data: two on
    the original file and one on a shuffled copy.  The coefficient tables of
    the second and of the shuffled model are compared against the first.
    """
    print("Reading in Arcene training data for binomial modeling.")
    train_data = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene.csv"))
    train_data_shuffled = h2o.upload_file(
        path=pyunit_utils.locate("smalldata/arcene/shuffle_test_version/arcene_shuffled.csv"))

    print("Create model on original Arcene dataset.")
    h2o_model = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create second model on original Arcene dataset.")
    h2o_model_2 = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_2.train(x=list(range(1000)), y=1000, training_frame=train_data)

    print("Create model on shuffled Arcene dataset.")
    h2o_model_s = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True, alpha=0.5)
    h2o_model_s.train(x=list(range(1000)), y=1000, training_frame=train_data_shuffled)

    print("Assert that number of predictors remaining and their respective coefficients are equal.")
    _assert_same_coefficient_tables(h2o_model, h2o_model_2)
    _assert_same_coefficient_tables(h2o_model, h2o_model_s)


def _assert_same_coefficient_tables(model_a, model_b):
    """Assert that two GLM models have matching coefficient tables (columns 1 and 2)."""
    rows_a = model_a._model_json['output']['coefficients_table'].cell_values
    rows_b = model_b._model_json['output']['coefficients_table'].cell_values
    # zip() stops at the shorter table, so a differing number of rows has to
    # be checked explicitly.
    assert len(rows_a) == len(rows_b), \
        "number of coefficients should be equal, got {0} and {1}".format(len(rows_a), len(rows_b))
    for x, y in zip(rows_a, rows_b):
        assert (type(x[1]) is type(y[1])) and (type(x[2]) is type(y[2])), \
            "coefficients should be the same type"
        if isinstance(x[1], float):
            assert abs(x[1] - y[1]) < 5e-10, "coefficients should be equal"
        if isinstance(x[2], float):
            assert abs(x[2] - y[2]) < 5e-10, "coefficients should be equal"
def test_max_iterations_dispersion():
    """Compare gamma dispersion estimates obtained with 1 vs. 1000000 dispersion iterations.

    Two gamma GLMs using the "ml" dispersion method are trained on the same
    frame; the one allowed more dispersion iterations must end up strictly
    closer to the reference dispersion value of 9.
    """
    frame = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv")
    response = 'resp'
    predictors = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']
    # Settings shared by both models; only max_iterations_dispersion differs.
    shared_params = dict(family='gamma', lambda_=0, compute_p_values=True,
                         dispersion_factor_method="ml")

    model_short = H2OGeneralizedLinearEstimator(max_iterations_dispersion=1, **shared_params)
    model_short.train(training_frame=frame, x=predictors, y=response)

    model_long = H2OGeneralizedLinearEstimator(max_iterations_dispersion=1000000, **shared_params)
    model_long.train(training_frame=frame, x=predictors, y=response)

    true_dispersion = 9
    error_short = abs(model_short._model_json["output"]["dispersion"] - true_dispersion)
    error_long = abs(model_long._model_json["output"]["dispersion"] - true_dispersion)
    # check model with more iterations should generate dispersion parameters
    # closer to the true dispersion value
    assert error_short > error_long, \
        " Model with more iterations should generate better dispersion parameter estimate but did not."
def interactions_airlines():
    """Check MOJO/POJO export of GLM models with and without interaction pairs.

    A binomial GLM trained with ``interaction_pairs`` must refuse both
    ``download_mojo`` and ``download_pojo`` with an ``H2OValueError``; the same
    model trained without interaction pairs must export successfully.
    """
    airlines = h2o.import_file(
        pyunit_utils.locate("smalldata/airlines/allyears2k_headers.zip"))
    interaction_pairs = [("CRSDepTime", "UniqueCarrier"),
                         ("CRSDepTime", "Origin"),
                         ("UniqueCarrier", "Origin")]
    y = 'IsDepDelayed'
    model = H2OGeneralizedLinearEstimator(
        family="Binomial",
        interaction_pairs=interaction_pairs,
    )
    model.train(y=y, training_frame=airlines)

    MOJONAME = pyunit_utils.getMojoName(model._id)
    # '__file__' is passed as a literal string, so realpath() resolves it
    # against the current working directory; the output therefore lands in
    # <cwd>/../results/<MOJONAME>.
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')), "..",
                     "results", MOJONAME))
    # os.mkdir() fails when the parent directory is missing or the target
    # already exists (e.g. on a re-run); create the full path only if needed.
    if not os.path.isdir(TMPDIR):
        os.makedirs(TMPDIR)

    try:
        model.download_mojo(path=TMPDIR)
        assert False, "Download MOJO should fail."
    except H2OValueError as e:
        assert "Export to MOJO not supported" in e.args[0]

    try:
        model.download_pojo(path=TMPDIR)
        assert False, "Download POJO should fail."
    except H2OValueError as e:
        assert 'Export to POJO not supported' in e.args[0]

    # should work without interaction pairs
    model = H2OGeneralizedLinearEstimator(family="Binomial")
    model.train(y=y, training_frame=airlines)
    model.download_mojo(path=TMPDIR)
    model.download_pojo(path=TMPDIR)
def test_dispersion_epsilon():
    """Check that a smaller ``dispersion_epsilon`` does not worsen the dispersion estimate.

    Gamma GLMs with the "ml" dispersion method are trained with
    ``dispersion_epsilon`` of 1e-1 and 1e-4; the estimate obtained with 1e-4
    must be at least as close to the reference dispersion value of 9.
    """
    training_data = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv")
    Y = 'resp'
    x = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']

    # Model with the default dispersion_epsilon; it is trained but its result
    # is not inspected by the assertion below.
    model = H2OGeneralizedLinearEstimator(family='gamma', lambda_=0, compute_p_values=True,
                                          dispersion_factor_method="ml")
    model.train(training_frame=training_data, x=x, y=Y)

    model_short = H2OGeneralizedLinearEstimator(family='gamma', lambda_=0, compute_p_values=True,
                                                dispersion_factor_method="ml",
                                                dispersion_epsilon=1e-1)
    model_short.train(training_frame=training_data, x=x, y=Y)

    model_long = H2OGeneralizedLinearEstimator(family='gamma', lambda_=0, compute_p_values=True,
                                               dispersion_factor_method="ml",
                                               dispersion_epsilon=1e-4)
    model_long.train(training_frame=training_data, x=x, y=Y)

    true_dispersion_factor = 9
    dispersion_long = model_long._model_json["output"]["dispersion"]
    dispersion_short = model_short._model_json["output"]["dispersion"]
    assert abs(true_dispersion_factor - dispersion_long) <= abs(dispersion_short - true_dispersion_factor), \
        "H2O dispersion parameter estimate with epsilon 1e-4 {0} is worse than that of dispersion_epsilon 0.1 {1}. " \
        "True dispersion parameter is {2}".format(dispersion_long, dispersion_short, true_dispersion_factor)
def testOrdinalLogit():
    """Check ``makeGLMModel`` for an ordinal GLM.

    A model rebuilt from the regularization-path coefficients must predict the
    same as the original model, and supplying an extra coefficient must raise
    a server error that mentions the coefficient-length mismatch.
    """
    Dtrain = h2o.import_file(
        pyunit_utils.locate("bigdata/laptop/glm_ordinal_logit/ordinal_ordinal_20_training_set.csv"))
    Dtrain["C21"] = Dtrain["C21"].asfactor()

    print("Fit model on dataset")
    model = glm(family="ordinal", alpha=[0.5], lambda_=[0.001], max_iterations=1000,
                beta_epsilon=1e-8, objective_epsilon=1e-8)
    model.train(x=list(range(0, 20)), y="C21", training_frame=Dtrain)
    predH2O = model.predict(Dtrain)
    r = glm.getGLMRegularizationPath(model)
    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=model, coefs=r['coefficients'][0])
    f2 = m2.predict(Dtrain)
    pyunit_utils.compare_frames_local(predH2O, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs['h2o_dream'] = 3.1415  # extra coefficient that the model does not have
    try:
        glm.makeGLMModel(model=model, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 189 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Kept outside the try body: an AssertionError raised there would be
        # caught by ``except Exception`` and misreported as a wrong exception.
        assert False, "Should have thrown an exception!"
def test_makeGLMModel():
    """Check ``makeGLMModel`` for a gaussian GLM on the prostate data.

    A model rebuilt from the regularization-path coefficients must predict the
    same as the original model, and supplying an extra coefficient must raise
    a server error that mentions the coefficient-length mismatch.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    myY = "GLEASON"
    myX = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    m = glm(family='gaussian', Lambda=[0.001], alpha=[0.5])
    m.train(training_frame=d, x=myX, y=myY)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8  # extra coefficient that the model does not have
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 9 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("coefficient test passed!")
    else:
        # Kept outside the try body: an AssertionError raised there would be
        # caught by ``except Exception`` and misreported as a wrong exception.
        assert False, "Should have throw exception of bad coefficient length"
def test_makeGLMModel():
    """Check ``makeGLMModel`` for a binomial GLM on the prostate data.

    A model rebuilt from the regularization-path coefficients must produce the
    same values in prediction column 1 as the original model, and supplying an
    extra coefficient must raise a server error that mentions the
    coefficient-length mismatch.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    m = glm(family='binomial', Lambda=[0.001], alpha=[0.5],
            solver='COORDINATE_DESCENT')
    m.train(training_frame=d, x=[2, 3, 4, 5, 6, 7, 8], y=1)
    r = glm.getGLMRegularizationPath(m)
    m2 = glm.makeGLMModel(model=m, coefs=r['coefficients'][0])
    f1 = m.predict(d)  # predict with original model
    f2 = m2.predict(d)  # predict with model out of makeGLMModel
    pyunit_utils.compare_frames_local(f1[1], f2[1], prob=1)

    coefs = r['coefficients'][0]
    coefs['wendy_dreams'] = 8  # extra coefficient that the model does not have
    try:
        glm.makeGLMModel(model=m, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 8 is different from coefficient provided by user ") in temp, \
            "Wrong exception was received."
        print("makeGLMModel test passed!")
    else:
        # Kept outside the try body: an AssertionError raised there would be
        # caught by ``except Exception`` and misreported as a wrong exception.
        assert False, "Test failed: should have throw exception of bad coefficient length!"
def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver, cold_start):
    """Compare a checkpoint-restarted lambda-search GLM with an uninterrupted one.

    The frame is split 90/10 (seed 12345) into training and validation parts.
    A first model is limited to 3 iterations, a second model continues from it
    via ``checkpoint``, and a third model is trained without an iteration
    limit.  The coefficients of the second and third model must agree within
    1e-6.

    :param training_frame: frame to split into training and validation parts.
    :param x_indices: predictor columns passed to ``train``.
    :param y_index: response column passed to ``train``.
    :param family: GLM family.
    :param solver: GLM solver.
    :param cold_start: value forwarded to the ``cold_start`` parameter of every model.
    """
    parts = training_frame.split_frame(ratios=[0.9], seed=12345)
    # Every model is trained with exactly the same frames and columns.
    fit_args = dict(training_frame=parts[0], x=x_indices, y=y_index,
                    validation_frame=parts[1])

    short_model = H2OGeneralizedLinearEstimator(family=family, max_iterations=3, solver=solver,
                                                lambda_search=True, cold_start=cold_start)
    short_model.train(**fit_args)

    resumed_model = H2OGeneralizedLinearEstimator(family=family, checkpoint=short_model.model_id,
                                                  solver=solver, lambda_search=True,
                                                  cold_start=cold_start)
    resumed_model.train(**fit_args)

    full_model = H2OGeneralizedLinearEstimator(family=family, solver=solver, lambda_search=True,
                                               cold_start=cold_start)
    full_model.train(**fit_args)

    pyunit_utils.assertEqualCoeffDicts(resumed_model.coef(), full_model.coef(), tol=1e-6)
def glm_alpha_lambda_arrays_cv():
    """Check that a validation frame does not change cross-validated GLM submodels.

    Two gaussian models with the same alpha/lambda arrays and ``nfolds=3`` are
    trained, one with and one without a validation frame.  Every submodel in
    their regularization paths must have equal coefficients (raw and
    standardized) within 1e-6.
    """
    print("Testing glm cross-validation with alpha array, lambda array for gaussian models.")
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # NOTE(review): this assigns each column to itself; given the list
        # name, ``.asfactor()`` may have been intended - confirm before changing.
        h2o_data[cname] = h2o_data[cname]
    myY = "C21"
    # ``list.remove`` returns None, so the previous
    # ``h2o_data.names.remove(myY)`` always yielded ``myX = None``; build the
    # predictor list explicitly instead.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # choices made in model_all and model_xval should be the same since they
    # should be using xval metrics
    model_all = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9],
                    nfolds=3, cold_start=True)
    model_all.train(x=myX, y=myY, training_frame=training_data, validation_frame=test_data)
    model_all_rpath = glm.getGLMRegularizationPath(model_all)

    model_xval = glm(family="gaussian", Lambda=[0.1, 0.5, 0.9], alpha=[0.1, 0.5, 0.9],
                     nfolds=3, cold_start=True)
    model_xval.train(x=myX, y=myY, training_frame=training_data)
    model_xval_rpath = glm.getGLMRegularizationPath(model_xval)

    for l in range(len(model_all_rpath['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients'][l],
                                           model_xval_rpath['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(model_all_rpath['coefficients_std'][l],
                                           model_xval_rpath['coefficients_std'][l], tol=1e-6)
def buildModelCheckpointing(training_frame, x_indices, y_index, family, solver):
    """Compare a checkpoint-restarted GLM with an uninterrupted one.

    The frame is split 90/10 (seed 12345) into training and validation parts.
    A first model is limited to 7 iterations, a second model continues from it
    via ``checkpoint``, and a third model is trained without an iteration
    limit.  The coefficients of the second and third model must agree within
    5e-2.

    :param training_frame: frame to split into training and validation parts.
    :param x_indices: predictor columns passed to ``train``.
    :param y_index: response column passed to ``train``.
    :param family: GLM family.
    :param solver: GLM solver.
    """
    parts = training_frame.split_frame(ratios=[0.9], seed=12345)
    # Every model is trained with exactly the same frames and columns.
    fit_args = dict(training_frame=parts[0], x=x_indices, y=y_index,
                    validation_frame=parts[1])

    short_model = H2OGeneralizedLinearEstimator(family=family, max_iterations=7, solver=solver)
    short_model.train(**fit_args)

    resumed_model = H2OGeneralizedLinearEstimator(family=family, checkpoint=short_model.model_id,
                                                  solver=solver)
    resumed_model.train(**fit_args)

    # no max_iterations here: allow to run to completion
    full_model = H2OGeneralizedLinearEstimator(family=family, solver=solver)
    full_model.train(**fit_args)

    pyunit_utils.assertEqualCoeffDicts(resumed_model.coef(), full_model.coef(), tol=5e-2)
def test_infogram_iris_x_attributes():
    """
    Show that an infogram model can be passed directly as the predictor
    specification: a multinomial GLM trained with ``x=infogram_model`` must
    have the same coefficients as one trained with the list returned by
    ``infogram_model._extract_x_from_model()``.
    """
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/admissibleml_test/irisROriginal.csv"))
    target = "Species"
    iris[target] = iris[target].asfactor()
    predictors = iris.names
    predictors.remove(target)

    # build infogram model with default settings
    infogram_model = H2OInfogram(seed=12345, distribution='multinomial')
    infogram_model.train(x=predictors, y=target, training_frame=iris)

    glm_from_list = H2OGeneralizedLinearEstimator(family='multinomial')
    glm_from_list.train(x=infogram_model._extract_x_from_model(), y=target, training_frame=iris)
    coef_from_list = glm_from_list.coef()

    glm_from_infogram = H2OGeneralizedLinearEstimator(family='multinomial')
    glm_from_infogram.train(x=infogram_model, y=target, training_frame=iris)
    coef_from_infogram = glm_from_infogram.coef()

    # coef() is indexed by key here; compare the entries key by key.
    for key, expected in coef_from_list.items():
        pyunit_utils.assertCoefDictEqual(expected, coef_from_infogram[key], tol=1e-6)
def interactions_GLM_Binomial():
    """Check the number of GLM coefficients produced with categorical interactions.

    Two Gaussian-family models with an interaction between two categorical
    columns are trained, one with ``Lambda=0`` and one with ``Lambda=0.001``,
    and the length of ``coef()`` is compared with the expected counts.
    """
    labels = [0.1, 0.2, 0.3, 0.15, 0.25, 0.35, 0.12, 0.22, 0.32, 0.2, 0.3, 0.15, 0.05]
    feat1 = ["a", "a", "a", "b", "b", "b", "c", "c", "c", "a", "a", "a", "b"]
    feat2 = ["Red", "Blue", "Green", "Red", "Blue", "Green", "Red", "Blue", "Green",
             "Blue", "Green", "Red", "Blue"]
    pd_df = pd.DataFrame(np.array([labels, feat1, feat2]).T,
                         columns=['label', 'categorical_feat', 'categorical_feat2'])
    h2o_df = h2o.H2OFrame(pd_df, na_strings=["UNKNOWN"])

    interaction_pairs = ["categorical_feat", "categorical_feat2"]
    predictors = ["categorical_feat", "categorical_feat2"]

    model0 = H2OGeneralizedLinearEstimator(family="Gaussian", Lambda=0,
                                           interactions=interaction_pairs)
    model0.train(x=predictors, y='label', training_frame=h2o_df)

    model1 = H2OGeneralizedLinearEstimator(family="Gaussian", Lambda=0.001,
                                           interactions=interaction_pairs)
    model1.train(x=predictors, y='label', training_frame=h2o_df)

    # interaction 4 levels, 2 enums 2 levels each plus intercept due to use_all_factor_level=F
    model0CoeffLen = 4 + 2 + 2 + 1
    # interaction 9 levels, 2 enums 3 levels each plus intercept
    model1CoeffLen = 9 + 3 + 3 + 1

    actual0 = len(model0.coef())
    assert actual0 == model0CoeffLen, \
        "Lambda=0, Expected coefficient length: {0}, Actual: {1}".format(model0CoeffLen, actual0)
    actual1 = len(model1.coef())
    assert actual1 == model1CoeffLen, \
        "Lambda=0.001, Expected coefficient length: {0}, Actual: {1}".format(model1CoeffLen, actual1)
def grab_lambda_min():
    """Check lambda-search submodels against models trained with a single alpha/lambda.

    A lambda-search GLM with ``cold_start=True`` is trained on the Boston
    housing data.  For every entry of its regularization path a separate model
    is trained with that alpha and lambda, and its coefficients (raw and
    standardized) must equal the path values within 1e-6.
    """
    boston = h2o.import_file(pyunit_utils.locate("smalldata/gbm_test/BostonHousing.csv"))

    # predictors are all columns but the last; the response is "medv", the
    # median value of owner-occupied homes in $1000's
    predictors = boston.columns[:-1]
    response = "medv"

    # chas = Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
    boston['chas'] = boston['chas'].asfactor()

    # split into train and validation sets
    train, valid = boston.split_frame(ratios=[.8], seed=1234)
    fit_args = dict(x=predictors, y=response, training_frame=train, validation_frame=valid)

    boston_glm = H2OGeneralizedLinearEstimator(lambda_search=True, seed=1234, cold_start=True)
    boston_glm.train(**fit_args)
    path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(boston_glm)

    for idx, lambda_value in enumerate(path['lambdas']):
        single = H2OGeneralizedLinearEstimator(alpha=[path['alphas'][idx]], Lambda=lambda_value,
                                               solver='COORDINATE_DESCENT')
        single.train(**fit_args)
        expected = path['coefficients'][idx]
        expected_std = path['coefficients_std'][idx]
        print("comparing coefficients for submodel {0}".format(idx))
        pyunit_utils.assertEqualCoeffDicts(expected, single.coef(), tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(expected_std, single.coef_norm(), tol=1e-6)
def test_get_future_model():
    """Start three binomial GLMs with ``start()`` and collect them with ``join()``.

    The response is turned into an indicator of one randomly chosen class.
    All three models are started before any of them is joined; each model is
    printed after its ``join()`` returns.
    """
    covtype = h2o.upload_file(pyunit_utils.locate("smalldata/covtype/covtype.altered.gz"))

    myY = 54
    # Cols 21 and 29 are constant, so must be explicitly ignored
    myX = list(set(range(54)) - {20, 28})

    # Set response to be indicator of a particular class
    res_class = random.sample(range(1, 5), 1)[0]
    covtype[myY] = covtype[myY] == res_class
    covtype[myY] = covtype[myY].asfactor()

    # L2: alpha=0, lambda=0
    covtype_h2o1 = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=0)
    covtype_h2o1.start(x=myX, y=myY, training_frame=covtype)

    # Elastic: alpha=0.5, lambda=1e-4
    covtype_h2o2 = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=1e-4)
    covtype_h2o2.start(x=myX, y=myY, training_frame=covtype)

    # L1: alpha=1, lambda=1e-4
    covtype_h2o3 = H2OGeneralizedLinearEstimator(family="binomial", alpha=1, Lambda=1e-4)
    covtype_h2o3.start(x=myX, y=myY, training_frame=covtype)

    # Join and print in the same order the models were started.
    for pending in (covtype_h2o1, covtype_h2o2, covtype_h2o3):
        pending.join()
        print(pending)
def test_glm_multinomial_makeGLMModel():
    """Check ``makeGLMModel`` and the reported rank for a multinomial GLM.

    The count returned by ``check_nonzero_coefs`` for the regularization-path
    coefficients must equal the model's ``rank``; a model rebuilt from those
    coefficients must predict the same as the original; and supplying an extra
    coefficient must raise a server error that mentions the coefficient-length
    mismatch.
    """
    d = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    mL = glm(family='multinomial', alpha=[0.1], Lambda=[0.9])
    d[54] = d[54].asfactor()
    mL.train(training_frame=d, x=list(range(0, 54)), y=54)
    r = glm.getGLMRegularizationPath(mL)

    rank = check_nonzero_coefs(r['coefficients'][0])
    model_rank = mL._model_json["output"]["rank"]
    assert rank == model_rank, "expected rank: {0}, actual rank: {1}.".format(rank, model_rank)

    # model generated from setting coefficients to model
    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][0])
    f1 = mL.predict(d)
    f2 = m2.predict(d)
    pyunit_utils.compare_frames_local(f1, f2, prob=1)

    coefs = r['coefficients'][0]
    coefs["wendy_dreams"] = 0.123  # add extra coefficients to model coefficient
    try:
        glm.makeGLMModel(model=mL, coefs=coefs)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("Server error java.lang.IllegalArgumentException:" in temp) and \
               ("model coefficient length 371 is different from coefficient provided by user") in temp, \
            "Wrong exception was received."
        print("glm Multinomial makeGLMModel test completed!")
    else:
        # Kept outside the try body: an AssertionError raised there would be
        # caught by ``except Exception`` and misreported as a wrong exception.
        assert False, "Should have thrown an exception!"
def test_relevel():
    """Check that ``relevel`` changes which DPROS level GLM omits from its coefficients.

    Before releveling, "DPROS.None" has a coefficient and "DPROS.Both" does
    not.  After ``relevel("None")`` the situation is reversed, and the
    resulting coefficients must match reference values taken from R within
    1e-4.
    """
    # First, compare against itself
    print("Importing prostate_cat.csv data...\n")
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"),
                        na_strings=["NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA"])

    mh2o1 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0,
                                          missing_values_handling="Skip")
    mh2o1.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns = mh2o1.coef().keys()
    print(ns)
    assert "DPROS.None" in ns, "None level IS NOT expected to be skipped by default"
    assert "DPROS.Both" not in ns, "Both level IS expected to be skipped by default"

    x = d["DPROS"].relevel("None")
    print(x)
    d["DPROS"] = x[0]

    mh2o2 = H2OGeneralizedLinearEstimator(family="binomial", Lambda=0,
                                          missing_values_handling="Skip")
    mh2o2.train(x=list(range(1, d.ncol)), y=0, training_frame=d)
    ns2 = mh2o2.coef().keys()
    print(ns2)
    # The messages below previously repeated the pre-relevel wording and
    # contradicted the conditions being asserted.
    assert "DPROS.None" not in ns2, "None level IS expected to be skipped after relevel"
    assert "DPROS.Both" in ns2, "Both level IS NOT expected to be skipped after relevel"

    # Second, compare against R input (taken from runit_relevel.R)
    # NOTE(review): ``dr`` is built and releveled but never used below; the
    # comparison uses ``mh2o2`` - confirm whether a model on ``dr`` was intended.
    dr = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_cat.csv"))
    dr["DPROS"] = d["DPROS"].relevel("None")

    # Results are from R but manually reordered and renamed to match h2o naming and order
    exp_coefs = {"Intercept": -7.63245, "DPROS.Both": 1.39185, "DPROS.Left": 0.73482,
                 "DPROS.Right": 1.51437, "RACE.White": 0.65160, "DCAPS.Yes": 0.49233,
                 "AGE": -0.01189, "PSA": 0.02990, "VOL": -0.01141, "GLEASON": 0.96466927}
    actual_coefs = mh2o2.coef()
    # A coefficient missing from the model is compared as 0.
    coeff_diff = {key: abs(expected - actual_coefs.get(key, 0))
                  for key, expected in exp_coefs.items()}
    assert max(coeff_diff.values()) < 1e-4, \
        "coefficients differ from the R reference values: {0}".format(coeff_diff)
def test_infogram_personal_loan():
    """
    Make sure predictors can be specified through an infogram model: a GLM
    trained with ``x=infogram_model`` must have the same coefficients as one
    trained with the list returned by ``infogram_model._extract_x_from_model()``.
    """
    loans = h2o.import_file(
        path=pyunit_utils.locate("smalldata/admissibleml_test/Bank_Personal_Loan_Modelling.csv"))
    target = "Personal Loan"
    loans[target] = loans[target].asfactor()
    predictors = ["Experience", "Income", "Family", "CCAvg", "Education", "Mortgage",
                  "Securities Account", "CD Account", "Online", "CreditCard"]

    infogram_model = H2OInfogram(seed=12345, protected_columns=["Age", "ZIP Code"])
    infogram_model.train(x=predictors, y=target, training_frame=loans)

    # GLM given the explicit predictor list extracted from the infogram model.
    glm_from_list = H2OGeneralizedLinearEstimator()
    glm_from_list.train(x=infogram_model._extract_x_from_model(), y=target,
                        training_frame=loans)

    # GLM given the infogram model itself as ``x``.
    glm_from_infogram = H2OGeneralizedLinearEstimator()
    glm_from_infogram.train(x=infogram_model, y=target, training_frame=loans)

    pyunit_utils.assertCoefDictEqual(glm_from_list.coef(), glm_from_infogram.coef(), tol=1e-6)
def covtype():
    """Train and show three binomial GLMs on covtype with different penalties.

    The response (column 54) is replaced by an indicator of one randomly
    chosen class, then models with (alpha, lambda) of (0, 0), (0.5, 1e-4) and
    (1, 1e-4) are trained in that order and shown.
    """
    frame = h2o.import_file(path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))

    myY = 54
    excluded = [20, 28]
    myX = [col for col in range(0, 54) if col not in excluded]

    # Set response to be indicator of a particular class
    res_class = random.randint(1, 4)
    frame[54] = (frame[54] == res_class)

    # (alpha, Lambda) pairs: L2, elastic net, L1
    penalties = [(0, 0), (0.5, 1e-4), (1, 1e-4)]
    for alpha_value, lambda_value in penalties:
        model = H2OGeneralizedLinearEstimator(family="binomial", alpha=alpha_value,
                                              Lambda=lambda_value)
        model.train(x=myX, y=myY, training_frame=frame)
        model.show()
def test_negBinomial_makeGLMModel():
    """Check ``makeGLMModel`` for negative-binomial GLMs over several theta values.

    For each theta a model with log link is trained on the prostate data, a
    second model is built from its regularization-path coefficients, and the
    predictions of both models are compared.
    """
    print("Read in prostate data.")
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    print("Testing for family: Negative Binomial")
    print("Set variables for h2o.")
    response = "GLEASON"
    predictors = ["ID", "AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]

    for theta_value in (1e-9, 0.01, 0.1, 0.5, 1):
        trained = H2OGeneralizedLinearEstimator(family="negativebinomial", link="log",
                                                alpha=0.5, Lambda=0.0001, theta=theta_value)
        trained.train(x=predictors, y=response, training_frame=prostate)
        pred_trained = trained.predict(prostate)

        path = H2OGeneralizedLinearEstimator.getGLMRegularizationPath(trained)
        # model generated from setting coefficients to model
        rebuilt = H2OGeneralizedLinearEstimator.makeGLMModel(model=trained,
                                                             coefs=path['coefficients'][0])
        pred_rebuilt = rebuilt.predict(prostate)
        pyunit_utils.compare_frames_local(pred_trained, pred_rebuilt, prob=1)
def glm_alpha_array_lambda_null():
    """Check a binomial GLM trained with an alpha array and no explicit lambda.

    The best submodel is rebuilt with ``makeGLMModel`` and its explained
    deviance compared with the regularization path.  Then, for every path
    entry, a model trained with that single alpha/lambda must match the path's
    coefficients, explained deviance and regularization-path keys; the best
    submodel must additionally match the training metrics, and every other
    submodel must not have a lower residual deviance than the best one.
    """
    # first test: compare coefficients and deviance
    prostate = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    predictors = [2, 3, 4, 5, 6, 7, 8]
    mL = glm(family='binomial', alpha=[0.1, 0.5, 0.9], solver='COORDINATE_DESCENT')
    mL.train(training_frame=prostate, x=predictors, y=1)
    r = glm.getGLMRegularizationPath(mL)
    regKeys = ["alphas", "lambdas", "explained_deviance_valid", "explained_deviance_train"]
    best_submodel_index = mL._model_json["output"]["best_submodel_index"]

    m2 = glm.makeGLMModel(model=mL, coefs=r['coefficients'][best_submodel_index])
    dev1 = r['explained_deviance_train'][best_submodel_index]
    p2 = m2.model_performance(prostate)
    dev2 = 1 - p2.residual_deviance() / p2.null_deviance()
    print(dev1," =?= ",dev2)
    assert abs(dev1 - dev2) < 1e-6

    for idx, lambda_value in enumerate(r['lambdas']):
        single = glm(family='binomial', alpha=[r['alphas'][idx]], Lambda=[lambda_value],
                     solver='COORDINATE_DESCENT')
        single.train(training_frame=prostate, x=predictors, y=1)
        single_path = glm.getGLMRegularizationPath(single)

        pyunit_utils.assertEqualCoeffDicts(r['coefficients'][idx], single.coef())
        pyunit_utils.assertEqualCoeffDicts(r['coefficients_std'][idx], single.coef_norm())

        perf = single.model_performance(prostate)
        devm = 1 - perf.residual_deviance() / perf.null_deviance()
        devn = r['explained_deviance_train'][idx]
        assert abs(devm - devn) < 1e-4
        pyunit_utils.assertEqualRegPaths(regKeys, r, idx, single_path)

        if idx != best_submodel_index:
            # for other submodel, should have worse residual_deviance() than best submodel
            assert perf.residual_deviance() >= p2.residual_deviance(), \
                "Best submodel does not have lowerest residual_deviance()!"
        else:
            # check training metrics, should equal for best submodel index
            pyunit_utils.assertEqualModelMetrics(single._model_json["output"]["training_metrics"],
                                                 mL._model_json["output"]["training_metrics"])
def offset_init_train_glm():
    """Check that ``offset_column`` works the same from the constructor and from ``train``.

    Three binomial GLMs are trained on the cars data with a constant offset
    column "x1": one receives the offset in ``train``, one in the constructor,
    and one in both (with a different name in the constructor).  Their
    predictions on the training frame are then compared.
    """
    cars = h2o.upload_file(
        pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    # Keep only rows where the response is present.
    cars = cars[cars["economy_20mpg"].isna() == 0]
    cars["economy_20mpg"] = cars["economy_20mpg"].asfactor()

    # Size the constant offset column from the filtered frame instead of a
    # hard-coded row count, so it always matches the frame it is bound to.
    offset = h2o.H2OFrame([[.5]] * cars.nrow)
    offset.set_names(["x1"])
    cars = cars.cbind(offset)

    # offset_column passed in the train method
    glm_train = H2OGeneralizedLinearEstimator(family="binomial")
    glm_train.train(x=list(range(2, 8)), y="economy_20mpg", training_frame=cars,
                    offset_column="x1")
    predictions_train = glm_train.predict(cars)

    # offset_column passed in estimator init
    glm_init = H2OGeneralizedLinearEstimator(offset_column="x1", family="binomial")
    glm_init.train(x=list(range(2, 8)), y="economy_20mpg", training_frame=cars)
    predictions_init = glm_init.predict(cars)

    # case the both offset column parameters are set and only the parameter in train will be used
    glm_init_train = H2OGeneralizedLinearEstimator(offset_column="x1-test", family="binomial")
    glm_init_train.train(x=list(range(2, 8)), y="economy_20mpg", training_frame=cars,
                         offset_column="x1")
    predictions_init_train = glm_init_train.predict(cars)

    # NOTE(review): ``==`` between two H2OFrames presumably yields a frame
    # rather than a bool, which would make these asserts pass regardless of
    # content - confirm, and consider pyunit_utils.compare_frames_local.
    assert predictions_train == predictions_init, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in constructor."
    assert predictions_train == predictions_init_train, "Expected predictions of a model with offset_column in train method has to be same as predictions of a model with offset_column in both constructor and init."
def benign_grid():
    """Exercise the grid-search API with a binomial GLM on the benign data.

    Runs a grid over ``alpha`` and ``lambda``, prints the grid sorted by F1,
    predicts with the best model and with the grid, retrieves the grid again
    via ``H2OGridSearch.get_grid`` and finally checks that the best model's
    actual family is binomial.
    """
    training_data = h2o.import_file(
        pyunit_utils.locate("smalldata/logreg/benign.csv"))

    Y = 3
    # ``range`` objects cannot be concatenated with ``+`` on Python 3.
    X = list(range(3)) + list(range(4, 11))

    # NOTE(review): 'a' is not a numeric alpha; it is kept as in the original
    # and is presumably there to exercise a failing grid entry - confirm.
    hyper_parameters = {'alpha': [0.01, 0.5, 'a'], 'lambda': [1e-5, 1e-6]}
    gs = H2OGridSearch(H2OGeneralizedLinearEstimator(family='binomial'),
                       hyper_parameters)
    gs.train(x=X, y=Y, training_frame=training_data)
    gs.show()
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(gs.sort_by('F1', False))
    best_model_id = gs.sort_by('F1', False)['Model Id'][0]
    best_model = h2o.get_model(best_model_id)
    best_model.predict(training_data)
    gs.predict(training_data)
    print(gs.get_hyperparams(best_model_id))
    print(gs.grid_id)

    new_g = H2OGridSearch.get_grid(
        H2OGeneralizedLinearEstimator(family='binomial'), hyper_parameters,
        gs.grid_id)
    new_g.show()
    print(new_g.grid_id)
    print(new_g.sort_by('F1', False))

    assert best_model.params['family']['actual'] == 'binomial'
def glm_alpha_array_with_lambda_search_cv():
    """Check that a validation frame does not change cross-validated multinomial submodels.

    Two multinomial models with an alpha array, ``lambda_search=True`` and
    ``nfolds=3`` are trained, one without and one with a validation frame.
    Every submodel in their regularization paths must have equal coefficients
    (raw and standardized) within 1e-6.
    """
    print("Testing glm cross-validation with alpha array, lambda_search for multinomial models.")
    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5"]
    for cname in enum_columns:
        # NOTE(review): this assigns each column to itself; given the list
        # name, ``.asfactor()`` may have been intended - confirm before changing.
        h2o_data[cname] = h2o_data[cname]
    myY = "C11"
    h2o_data["C11"] = h2o_data["C11"].asfactor()
    # ``list.remove`` returns None, so the previous
    # ``h2o_data.names.remove(myY)`` always yielded ``myX = None``; build the
    # predictor list explicitly instead.
    myX = [name for name in h2o_data.names if name != myY]
    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build model with CV but no validation dataset
    cv_model = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)

    # build model with CV and with validation dataset
    cv_model_valid = glm(family='multinomial', alpha=[0.1, 0.5, 0.9], lambda_search=True, nfolds=3)
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for l in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(l))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][l],
                                           cv_r_valid['coefficients'][l], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][l],
                                           cv_r_valid['coefficients_std'][l], tol=1e-6)
def check_same(data1, data2):
    """Assert that GLMs on ``data1`` match weighted GLMs on ``data2``.

    For each frame one model is trained with response column 1 and one with
    response column 0; the ``data2`` models use ``weights_column="weights"``
    and one more predictor index than the ``data1`` models.  The MSE of the
    first pair and the null and residual deviances of the second pair must
    agree within 1e-6.

    :param data1: frame used without a weights column.
    :param data2: frame used with the "weights" column.
    """
    # Response column 1: unweighted on data1, weighted on data2.
    glm1_regression = H2OGeneralizedLinearEstimator()
    glm1_regression.train(x=list(range(2, 20)), y=1, training_frame=data1)

    glm2_regression = H2OGeneralizedLinearEstimator(weights_column="weights")
    glm2_regression.train(x=list(range(2, 21)), y=1, training_frame=data2)

    # Response column 0: unweighted on data1, weighted on data2.
    # NOTE(review): only the weighted model sets family="binomial" explicitly;
    # confirm the unweighted one is meant to rely on the default family.
    glm1_binomial = H2OGeneralizedLinearEstimator()
    glm1_binomial.train(x=list(range(1, 20)), y=0, training_frame=data1)

    glm2_binomial = H2OGeneralizedLinearEstimator(weights_column="weights", family="binomial")
    glm2_binomial.train(x=list(range(1, 21)), y=0, training_frame=data2)

    mse1, mse2 = glm1_regression.mse(), glm2_regression.mse()
    assert abs(mse1 - mse2) < 1e-6, \
        "Expected mse's to be the same, but got {0}, and {1}".format(mse1, mse2)

    null1, null2 = glm1_binomial.null_deviance(), glm2_binomial.null_deviance()
    assert abs(null1 - null2) < 1e-6, \
        "Expected null deviances to be the same, but got {0}, and {1}".format(null1, null2)

    resid1, resid2 = glm1_binomial.residual_deviance(), glm2_binomial.residual_deviance()
    assert abs(resid1 - resid2) < 1e-6, \
        "Expected residual deviances to be the same, but got {0}, and {1}".format(resid1, resid2)
def test_gamma_dispersion_factor():
    """Compare the "ml" gamma dispersion estimate with R's and with the "pearson" estimate.

    Two gamma GLMs are trained, one per dispersion method.  The "ml" estimate
    must be at least as close to the reference value 9 as the R reference
    estimate (9.3) and as the "pearson" estimate.
    """
    frame = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv")
    response = 'resp'
    predictors = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']

    model_ml = H2OGeneralizedLinearEstimator(family='gamma', lambda_=0, compute_p_values=True,
                                             dispersion_factor_method="ml")
    model_ml.train(training_frame=frame, x=predictors, y=response)

    model_pearson = H2OGeneralizedLinearEstimator(family='gamma', lambda_=0,
                                                  compute_p_values=True,
                                                  dispersion_factor_method="pearson")
    model_pearson.train(training_frame=frame, x=predictors, y=response)

    true_dispersion_factor = 9
    R_dispersion_factor = 9.3
    estimate_ml = model_ml._model_json["output"]["dispersion"]
    estimate_pearson = model_pearson._model_json["output"]["dispersion"]
    print("True dispersion parameter {0}. Estiamted ml dispersion parameter {1}. Estimated pearson dispersion "
          "parameter {2}.".format(true_dispersion_factor, estimate_ml, estimate_pearson))

    error_ml = abs(true_dispersion_factor - estimate_ml)
    assert error_ml <= abs(R_dispersion_factor - true_dispersion_factor), \
        "H2O dispersion parameter estimate {0} is worse than that of R {1}. True dispersion parameter is " \
        "{2}".format(estimate_ml, R_dispersion_factor, true_dispersion_factor)
    assert error_ml <= abs(estimate_pearson - true_dispersion_factor), \
        "H2O dispersion parameter ml estimate {0} is worse than that of H2O dispersion parameter pearson estimate {1}." \
        " True dispersion parameter is {2}".format(estimate_ml, estimate_pearson, true_dispersion_factor)
def glm_alpha_arrays_null_lambda_cv():
    """Check that adding a validation frame does not change CV GLM submodels.

    Two binomial GLMs are trained with the same alpha array, 3 folds and
    ``fold_assignment="modulo"``: one with only a training frame and one
    with an additional validation frame.  For every lambda on the
    regularization path of the first model, the coefficients and the
    standardized coefficients of both models must agree to within 1e-6.
    """
    print("Testing glm cross-validation with alpha array, default lambda values for binomial models.")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    enum_columns = ["C1", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "C9", "C10"]
    for cname in enum_columns:
        # NOTE(review): this is a self-assignment with no conversion applied;
        # presumably `.asfactor()` was intended.  Left untouched because
        # changing it would change the models being trained -- confirm.
        h2o_data[cname] = h2o_data[cname]
    myY = "C21"
    h2o_data[myY] = h2o_data[myY].asfactor()
    # list.remove() mutates in place and returns None, so the previous
    # `h2o_data.names.remove(myY)` always bound None.  Build the predictor
    # list explicitly: every column except the response.
    myX = [name for name in h2o_data.names if name != myY]

    data_frames = h2o_data.split_frame(ratios=[0.8])
    training_data = data_frames[0]
    test_data = data_frames[1]

    # build model with CV but no validation dataset
    cv_model = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model.train(training_frame=training_data, x=myX, y=myY)
    cv_r = glm.getGLMRegularizationPath(cv_model)

    # build model with CV and with validation dataset
    cv_model_valid = glm(family='binomial', alpha=[0.1, 0.5, 0.9], nfolds=3, fold_assignment="modulo")
    cv_model_valid.train(training_frame=training_data, validation_frame=test_data, x=myX, y=myY)
    cv_r_valid = glm.getGLMRegularizationPath(cv_model_valid)

    for submodel in range(len(cv_r['lambdas'])):
        print("comparing coefficients for submodel {0}".format(submodel))
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients'][submodel],
                                           cv_r_valid['coefficients'][submodel], tol=1e-6)
        pyunit_utils.assertEqualCoeffDicts(cv_r['coefficients_std'][submodel],
                                           cv_r_valid['coefficients_std'][submodel], tol=1e-6)
def link_incompatible_error():
    """Check that training a GLM with a mismatched family/link pair fails.

    Each (family, link) combination below is trained on the prostate frame
    and is expected to raise ``EnvironmentError``.

    :raises AssertionError: if any combination trains without raising
    """
    print("Reading in original prostate data.")
    prostate = h2o.import_file(
        path=pyunit_utils.locate("smalldata/prostate/prostate.csv.zip"))

    print(
        "Throw error when trying to create model with incompatible logit link."
    )

    # (family, link, predictor column indices, response column index)
    incompatible_cases = (
        ("gaussian", "logit", list(range(1, 8)), 8),
        ("tweedie", "log", list(range(1, 8)), 8),
        ("binomial", "inverse", list(range(2, 9)), 1),
    )
    for family, link, predictors, response in incompatible_cases:
        try:
            model = H2OGeneralizedLinearEstimator(family=family, link=link)
            model.train(x=predictors, y=response, training_frame=prostate)
        except EnvironmentError:
            # Expected outcome: move on to the next combination.
            continue
        # Only reached when construction and training both succeeded.
        assert False, "expected an error"
def glm_alpha_array_lambda_null():
    """Compare a multinomial GLM built over alpha/lambda arrays with single fits.

    A multinomial GLM is trained once with arrays of alpha and Lambda values
    (``cold_start=True``).  Then, for every entry on its regularization path,
    a separate model is trained with that single alpha/lambda pair and a
    start vector derived from the class frequencies.  Coefficients,
    standardized coefficients, explained deviance and regularization-path
    entries must match; the best submodel must also match on training
    metrics, and every other submodel must not have a lower logloss.
    """
    # first test: compare coefficients and deviance
    metric_keys = ["MSE", "null_deviance", "logloss", "RMSE", "r2"]
    path_keys = [
        "alphas", "lambdas", "explained_deviance_valid",
        "explained_deviance_train"
    ]
    coef_keys = ['coefs_class_{0}'.format(k) for k in range(8)]
    std_coef_keys = ['std_coefs_class_{0}'.format(k) for k in range(8)]
    predictors = list(range(0, 54))

    frame = h2o.import_file(
        path=pyunit_utils.locate("smalldata/covtype/covtype.20k.data"))
    frame[54] = frame[54].asfactor()

    array_model = glm(family='multinomial',
                      alpha=[0.1, 0.5, 0.9],
                      Lambda=[0.1, 0.5, 0.9],
                      cold_start=True)
    array_model.train(training_frame=frame, x=predictors, y=54)
    array_path = glm.getGLMRegularizationPath(array_model)
    best_index = array_model._model_json["output"]["best_submodel_index"]

    # Relative frequency of each response class.
    by_class = frame.group_by("C55")
    by_class.count()
    class_prob = by_class.get_frame()[1] / frame.nrow

    # Start vector: zeros everywhere except log(class frequency) at one slot
    # per class (presumably the per-class intercept positions -- confirm).
    slot_index = list(range(52, 371, 53))  # 52, 105, 158, 211, 264, 317, 370
    start_values = [0] * 371
    for row in range(class_prob.nrow):
        start_values[slot_index[row]] = math.log(class_prob[row, 0])

    for idx in range(len(array_path['lambdas'])):
        single_model = glm(family='multinomial',
                           alpha=[array_path['alphas'][idx]],
                           Lambda=[array_path['lambdas'][idx]],
                           startval=start_values)
        single_model.train(training_frame=frame, x=predictors, y=54)
        single_path = glm.getGLMRegularizationPath(single_model)

        pyunit_utils.assertCoefEqual(array_path['coefficients'][idx],
                                     single_model.coef(), coef_keys)
        pyunit_utils.assertCoefEqual(array_path['coefficients_std'][idx],
                                     single_model.coef_norm(), std_coef_keys)

        explained_single = 1 - single_model.residual_deviance() / single_model.null_deviance()
        explained_path = array_path['explained_deviance_train'][idx]
        assert abs(explained_single - explained_path) < 1e-4
        pyunit_utils.assertEqualRegPaths(path_keys, array_path, idx, single_path)

        if idx == best_index:
            # training metrics should be equal for the best submodel index
            pyunit_utils.assertEqualModelMetrics(
                single_model._model_json["output"]["training_metrics"],
                array_model._model_json["output"]["training_metrics"],
                tol=1e-2,
                keySet=metric_keys)
        else:
            # any other submodel must not have a lower logloss than the best one
            assert single_model.logloss() >= array_model.logloss(), "Best submodel does not have lowerest " \
                                                                    "logloss()!"
def logistic_regression(xval=None, sample_size=None, nfolds=None, hparams=None, for_stacking=None):
    """
    create a logistic regression algorithm estimator

    Note:
        1. standardize: True(default)
        2. missing_values_handling: mean_imputation(default)

    :param xval: if for cross-validation
    :param sample_size: training set sample amount; used only to derive the
        defaults for ``nfolds`` and ``hparams``, so it may be omitted when
        both of those are supplied
    :param nfolds: k value for k-fold cross-validation; defaults to a value
        chosen from ``sample_size``
    :param hparams: hyper parameters for grid search; defaults to an
        alpha/lambda grid chosen from ``sample_size``
    :param for_stacking: if it is used for stacking
    :return: a constructed logistic regression estimator, a parameters' dict for grid search
    :raises TypeError: if ``sample_size`` is None while a default for
        ``nfolds`` or ``hparams`` has to be derived from it
    """
    if nfolds is None or hparams is None:
        # The size-dependent defaults are needed; previously a missing
        # sample_size surfaced here as an opaque None-vs-int comparison error.
        if sample_size is None:
            raise TypeError("sample_size is required when nfolds or hparams is not provided")

        if sample_size <= 10000:
            default_nfolds = 3 if sample_size <= 5000 else 5
            alpha_opts = [0, 0.25, 0.5, 0.75, 1]
            lambda_opts = [1, 0.5, 0.1, 0.01, 0]
        elif sample_size <= 100000:
            default_nfolds = 3
            alpha_opts = [0, 0.5, 1]
            lambda_opts = [1, 0.5, 0.1, 0.01, 0]
        else:
            default_nfolds = 2
            alpha_opts = [0, 0.5, 1]
            lambda_opts = [1, 0.5, 0.1, 0]

        if nfolds is None:
            nfolds = default_nfolds
        if hparams is None:
            hparams = {'alpha': alpha_opts, 'lambda': lambda_opts}

    # Settings shared by every variant of the estimator.
    estimator_params = dict(family="binomial",
                            remove_collinear_columns=True,
                            max_iterations=50)
    if xval:
        estimator_params["nfolds"] = nfolds
        if for_stacking:
            # Extra cross-validation settings applied only for stacking.
            estimator_params.update(fold_assignment="Modulo",
                                    seed=1,
                                    keep_cross_validation_predictions=True)

    lr_estimator = H2OGeneralizedLinearEstimator(**estimator_params)
    return lr_estimator, hparams