def buildModelMetricsCheck(train_data, family):
    """Train two identically configured GAM models and check that they agree.

    Both models use "C1" as the only GAM column, an empty predictor list and
    the same seed.  The check asserts that "C1" is not reported as a plain
    coefficient and that both models return exactly the same MSE (gaussian)
    or logloss (any other family).

    :param train_data: frame holding "C1" and "response".  For non-gaussian
        families its "response" column is replaced by its factor version.
    :param family: family name handed to the GAM estimator.
    """
    response = "response"
    predictors = []
    gaussian = family == 'gaussian'
    if not gaussian:
        train_data[response] = train_data[response].asfactor()

    parts = train_data.split_frame(ratios=[0.9], seed=12345)
    train_part = parts[0]
    valid_part = parts[1]

    # Build and train the two models one after the other with the same setup.
    trained = []
    for _ in range(2):
        gam = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=["C1"], seed=12345)
        gam.train(x=predictors, y=response, training_frame=train_part, validation_frame=valid_part)
        trained.append(gam)
    first_model = trained[0]
    second_model = trained[1]

    # the gam column itself must not appear among the coefficients
    coefficient_names = first_model.coef().keys()
    assert "C1" not in coefficient_names, "Not expecting C1 to be a coefficient but it is."

    # both models must produce the same metrics
    if gaussian:
        assert first_model.mse() == second_model.mse(), \
            "Expected model MSE: {0}, Actual: {1}".format(first_model.mse(), second_model.mse())
    else:
        assert first_model.logloss() == second_model.logloss(), \
            "Expected model logloss: {0}, Actual: {1}".format(first_model.logloss(), second_model.logloss())
def test_gam_knots_key():
    """Compare a gaussian GAM trained with explicit knot frames to one without.

    Three knot frames are uploaded and passed through ``knot_ids``; a second
    model is trained with the same settings but no ``knot_ids``.  Every
    coefficient of the first model must match the second within 1e-6.
    """
    print("Checking coefficients and variable importance for multinomial")
    knot_values = [
        [-49.98693927762423, -25.286098564527954, 0.44703511170863297, 25.50661829462607, 49.97312855846752],
        [-49.99386508664034, -25.275868426388616, 0.012500153211602433, 25.13371167580791, 49.98738587466542],
        [-49.99241697497996, -24.944012655490237, 0.1578389050436152, 25.296897954643736, 49.9876932143425],
    ]
    knot_frames = [h2o.H2OFrame(python_obj=values) for values in knot_values]

    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    for factor_column in ("C1", "C2"):
        h2o_data[factor_column] = h2o_data[factor_column].asfactor()
    response = "C21"
    predictors = ["C1", "C2"]
    knot_counts = [5, 5, 5]

    def train_and_get_coefs(**extra_params):
        # Same configuration for both models; only extra_params differs.
        gam = H2OGeneralizedAdditiveEstimator(family='gaussian', gam_columns=["C11", "C12", "C13"],
                                              scale=[1, 1, 1], num_knots=knot_counts, bs=[2, 2, 0],
                                              seed=12345, **extra_params)
        gam.train(x=predictors, y=response, training_frame=h2o_data)
        return gam.coef()

    coefs_with_keys = train_and_get_coefs(knot_ids=[frame.key for frame in knot_frames])
    coefs_default = train_and_get_coefs()

    for name in coefs_with_keys.keys():
        difference = abs(coefs_with_keys[name] - coefs_default[name])
        assert difference < 1e-6, "expected coefficients: {0}. actual coefficients: {1}".format(
            coefs_with_keys[name], coefs_default[name])
    print("gam knot keys test completed successfully")
def buildModelScaleParam(train_data, y, gamX, family):
    """Check that the ``scale`` parameter changes the training metric.

    Two GAM models are trained on the same data, one with scale 0.001 and one
    with scale 10 for every gam column.  Their logloss (binomial) or MSE
    (any other family) must differ.

    :param train_data: training frame containing "C1", "C2", ``y`` and ``gamX``.
    :param y: response column name.
    :param gamX: gam columns passed to the estimator.
    :param family: family name passed to the estimator.
    """
    knot_counts = [5, 6, 7]
    predictors = ["C1", "C2"]

    small_scale_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX,
                                                        scale=[0.001, 0.001, 0.001], bs=[0, 1, 2],
                                                        num_knots=knot_counts)
    small_scale_model.train(x=predictors, y=y, training_frame=train_data)

    large_scale_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX,
                                                        scale=[10, 10, 10], num_knots=knot_counts,
                                                        bs=[0, 1, 2])
    large_scale_model.train(x=predictors, y=y, training_frame=train_data)

    if family != 'binomial':
        first_mse = small_scale_model.mse()
        second_mse = large_scale_model.mse()
        assert first_mse != second_mse, \
            "mse from models with different scale parameters should be different but is not."
        return

    first_logloss = small_scale_model.logloss()
    second_logloss = large_scale_model.logloss()
    assert first_logloss != second_logloss, \
        "logloss from models with different scale parameters should be different but is not."
def test_gam_knots_key():
    """Check the error messages for unusable gam columns and unsorted knots.

    Two failure scenarios are exercised on the prostate data set:

    1. "GLEASON" is used as gam column with 12 knots; training must fail with
       a message about not having enough values to generate knots.
    2. A knot frame whose values are not in ascending order is supplied;
       training must fail with a message about unsorted knots.

    The "should have thrown" assertions live in the ``else`` clause of each
    ``try`` so that a missing failure is reported as such instead of being
    caught by ``except Exception`` and misreported as a wrong error message.
    """
    # bad gam_column with not enough values is chosen to be gam_column
    h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    h2o_data.head()
    myY = "CAPSULE"
    myX = ["ID", "AGE", "RACE", "GLEASON", "DCAPS", "PSA", "VOL", "DPROS"]
    h2o_data[myY] = h2o_data[myY].asfactor()
    try:
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["GLEASON"], bs=[2],
                                                    num_knots=[12])
        h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert "does have not enough values to generate well-defined knots" in temp, \
            "wrong error message received."
    else:
        assert False, "Should have throw exception due to bad gam_column choice"

    # knots not chosen in ascending order and corresponding error message
    knots1 = [-0.98143075, -1.99905699, 0.02599159, 1.00770987, 1.99942290]
    frameKnots1 = h2o.H2OFrame(python_obj=knots1)
    try:
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["GLEASON"],
                                                    knot_ids=[frameKnots1.key], bs=[2])
        h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert "knots not sorted in ascending order for gam_column" in temp, "wrong error message received."
    else:
        assert False, "Should have throw exception due to bad knot location choices"
def link_functions_tweedie_vpow():
    """Train gaussian GAM models with and without a weights column.

    A 15-row random frame is generated with a fixed numpy seed.  Three models
    are created (weights "weight_1", weights "weight_2", no weights) and then
    trained in that order.  The function makes no assertions; it only checks
    that training runs through.
    """
    np.random.seed(25)
    data = {
        "predictor": np.random.uniform(400, 800, 15),
        "target": np.random.uniform(0.7, 1.4, 15),
        "weight_1": [1] * 15,
        "weight_2": [3] * 15,
    }
    df = h2o.H2OFrame(pd.DataFrame(data))

    # The last variant carries no weights_column argument at all.
    weight_variants = [{"weights_column": 'weight_1'}, {"weights_column": 'weight_2'}, {}]
    estimators = [
        H2OGeneralizedAdditiveEstimator(family='gaussian', gam_columns=["predictor"], scale=[1], bs=[2],
                                        **variant)
        for variant in weight_variants
    ]
    for estimator in estimators:
        estimator.train(x=["predictor"], y="target", training_frame=df)
def test_gam_model_predict():
    """Check early stopping of binomial GAM models.

    For each stopping metric ("logloss", "AUC") a model is trained with early
    stopping enabled and the validation metric column of its
    "glm_scoring_history" table is checked with
    ``pyunit_utils.evaluate_early_stopping``.  Finally, enabling
    ``lambda_search`` together with early stopping must raise an error with a
    specific message.

    The "exception expected" assertion is placed in the ``else`` clause of the
    ``try`` so that a missing exception is not swallowed by
    ``except Exception`` and reported as a wrong message.
    """
    print("Checking early-stopping for binomial")
    print("Preparing for data....")
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    # C1..C10 are converted to factors, then the response.
    for col_index in range(1, 11):
        col_name = "C{0}".format(col_index)
        h2o_data[col_name] = h2o_data[col_name].asfactor()
    myY = "C21"
    h2o_data["C21"] = h2o_data["C21"].asfactor()
    splits = h2o_data.split_frame(ratios=[0.8], seed=12345)
    train = splits[0]
    test = splits[1]
    early_stop_metrics = ["logloss", "AUC"]
    early_stop_valid_metrics = ["validation_logloss", "validation_auc"]
    max_stopping_rounds = 3  # maximum stopping rounds allowed to be used for early stopping metric
    max_tolerance = 0.1  # maximum tolerance to be used for early stopping metric
    bigger_is_better = [False, True]

    print("Building a GAM model without early stop")
    h2o_model_no_early_stop = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["C11"],
                                                              scale=[0.0001], score_each_iteration=True)
    h2o_model_no_early_stop.train(x=list(range(0, 20)), y=myY, training_frame=train, validation_frame=test)

    for ind in range(len(early_stop_metrics)):
        print("Building early-stop model")
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["C11"], scale=[0.0001],
                                                    stopping_rounds=max_stopping_rounds,
                                                    score_each_iteration=True,
                                                    stopping_metric=early_stop_metrics[ind],
                                                    stopping_tolerance=max_tolerance)
        h2o_model.train(x=list(range(0, 20)), y="C21", training_frame=train, validation_frame=test)
        scoring_history = h2o_model._model_json["output"]["glm_scoring_history"]
        metric_list1 = pyunit_utils.extract_field_from_twoDimTable(scoring_history.col_header,
                                                                   scoring_history.cell_values,
                                                                   early_stop_valid_metrics[ind])
        print("Checking if early stopping has been done correctly for {0}.".format(early_stop_metrics[ind]))
        assert pyunit_utils.evaluate_early_stopping(metric_list1, max_stopping_rounds, max_tolerance,
                                                    bigger_is_better[ind]), \
            "Early-stopping was not done correctly."

    # NOTE(review): this check is placed after the loop and reuses the last
    # metric index, matching the reading where ``ind`` keeps its final value.
    print("Check if lambda_search=True, early-stop enabled, an error should be thrown.")
    try:
        h2o_model = H2OGeneralizedAdditiveEstimator(family='binomial', gam_columns=["C11"], scale=[0.0001],
                                                    stopping_rounds=max_stopping_rounds,
                                                    score_each_iteration=True,
                                                    stopping_metric=early_stop_metrics[ind],
                                                    stopping_tolerance=max_tolerance, lambda_search=True,
                                                    nlambdas=3)
        h2o_model.train(x=list(range(0, 20)), y=myY, training_frame=train, validation_frame=test)
    except Exception as ex:
        print(ex)
        temp = str(ex)
        assert ("early stop:  cannot run when lambda_search=True.  Lambda_search has its own early-stopping "
                "mechanism" in temp), "Wrong exception was received."
    else:
        assert False, "Exception should have been risen when lambda_search=True and early stop is enabled"
    print("early-stop test passed!")
def test_compare_R():
    """Compare H2O GAM results against hard-coded R reference numbers.

    A binomial model is trained and its mean misclassification rate on the
    held-out split must not exceed the R value; then a gaussian model is
    trained and its validation MSE must not exceed the R value.
    """
    myX = ['c_{0}'.format(i) for i in range(10)] + ['C{0}'.format(i) for i in range(1, 11)]
    myY = 'response'
    gamCols = [["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]]
    bsT = [1, 1, 1]
    scaleP = [0.001, 0.001, 0.001]
    numKnots = [10, 10, 12]

    def load_and_split(relative_path, factor_columns):
        # Import the file, convert the listed columns to factors, split 80/20.
        frame = h2o.import_file(pyunit_utils.locate(relative_path))
        for column in factor_columns:
            frame[column] = frame[column].asfactor()
        pieces = frame.split_frame(ratios=[0.8], seed=1234)
        return pieces[0], pieces[1]

    def train_gam(family, train_frame, valid_frame):
        gam = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamCols, bs=bsT, scale=scaleP,
                                              num_knots=numKnots, lambda_search=True)
        gam.train(x=myX, y=myY, training_frame=train_frame, validation_frame=valid_frame)
        return gam

    print("Comparing H2O and R GAM performance for binomial")
    trainB, testB = load_and_split("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv",
                                   ("C3", "C7", "C8", "C10", "response"))
    gamB = train_gam('binomial', trainB, testB)
    gamPred = gamB.predict(testB)
    matches = gamPred['predict'] == testB['response']
    gamBacc = 1 - matches.mean()[0, 0]
    rAcc = 0.01457801
    print("R accuracy: {0}, H2O accuracy: {1}.".format(rAcc, gamBacc))
    assert gamBacc <= rAcc, \
        "R mean error rate: {0}, H2O mean error rate: {1}. R performs better.".format(rAcc, gamBacc)

    print("Comparing H2O and R GAM performance for gaussian")
    trainG, testG = load_and_split("smalldata/gam_test/synthetic_20Cols_gaussian_20KRows.csv",
                                   ("C3", "C7", "C8", "C10"))
    gamG = train_gam('gaussian', trainG, testG)
    gamMSE = gamG.model_performance(valid=True).mse()
    rMSE = 0.0006933308
    print("R MSE: {0}, H2O MSE: {1}.".format(rMSE, gamMSE))
    assert gamMSE <= rMSE, "R MSE: {0}, H2O MSE: {1}. R performs better.".format(rMSE, gamMSE)
def test_gridsearch():
    """Check that two spellings of ``gam_columns`` give the same grid models.

    Two grid searches are run over the same hyper-parameter space.  In the
    second one, single-column gam entries of the second subspace are given as
    plain strings ("c_0") instead of one-element lists (["c_0"]).  Models at
    the same grid index must have equal coefficients within 1e-6.
    """
    h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    for column in ('response', 'C3', 'C7', 'C8', 'C10'):
        h2o_data[column] = h2o_data[column].asfactor()
    myY = "response"
    # list.remove() works in place and returns None, so the previous
    # ``names.remove(myY)`` always produced x=None.  Build the list explicitly.
    myX = [name for name in h2o_data.names if name != myY]

    search_criteria = {'strategy': 'Cartesian'}
    hyper_parameters = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [[["c_0"], ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            [["c_1"], ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }
    # Same space, but single gam columns in the second subspace are plain strings.
    hyper_parameters2 = {
        'lambda': [1, 2],
        'subspaces': [{
            'scale': [[0.001], [0.0002]],
            'num_knots': [[5], [10]],
            'bs': [[1], [0]],
            'gam_columns': [[["c_0"]], [["c_1"]]]
        }, {
            'scale': [[0.001, 0.001, 0.001], [0.0002, 0.0002, 0.0002]],
            'bs': [[1, 1, 1], [0, 1, 1]],
            'num_knots': [[5, 10, 12], [6, 11, 13]],
            'gam_columns': [["c_0", ["c_1", "c_2"], ["c_3", "c_4", "c_5"]],
                            ["c_1", ["c_2", "c_3"], ["c_4", "c_5", "c_6"]]]
        }]
    }

    h2o_model = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True),
                              hyper_params=hyper_parameters, search_criteria=search_criteria)
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)
    h2o_model2 = H2OGridSearch(H2OGeneralizedAdditiveEstimator(family="binomial", keep_gam_cols=True),
                               hyper_params=hyper_parameters2, search_criteria=search_criteria)
    h2o_model2.train(x=myX, y=myY, training_frame=h2o_data)

    # compare two models by checking their coefficients.  They should be the same
    for index in range(0, len(h2o_model)):
        model1 = h2o_model[index]
        model2 = h2o_model2[index]
        pyunit_utils.assertEqualCoeffDicts(model1.coef(), model2.coef(), tol=1e-6)
def buildModelMetricsCheck(train_data, test_data, model_test_data, y, gamX, family):
    """Train a GAM model and call the metric accessors for its family.

    The metric values are not checked; the calls only have to succeed.

    :param train_data: training frame with categorical "C1"/"C2", ``y`` and ``gamX``.
    :param test_data: not used by this function.
    :param model_test_data: not used by this function.
    :param y: response column name.
    :param gamX: gam columns passed to the estimator.
    :param family: family name; selects which metric accessors are called.
    """
    knot_counts = [5, 6, 7]
    predictors = ["C1", "C2"]
    # Computed as in the other helpers of this kind, but not used further here.
    category_total = len(train_data["C1"].categories()) + len(train_data["C2"].categories())
    numCoeffs = category_total + sum(knot_counts) + 1 - len(knot_counts)

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX, scale=[1, 1, 1],
                                                num_knots=knot_counts, standardize=True, Lambda=[0],
                                                alpha=[0], max_iterations=3)
    h2o_model.train(x=predictors, y=y, training_frame=train_data)

    accessors_by_family = {
        'binomial': ("auc", "aic", "logloss", "null_deviance", "residual_deviance"),
        'multinomial': ("null_deviance", "residual_deviance"),
    }
    fallback_accessors = ("mse", "null_deviance", "residual_deviance")
    for accessor_name in accessors_by_family.get(family, fallback_accessors):
        getattr(h2o_model, accessor_name)()
def train_models(self):
    """Run the gaussian GAM grid search and train every manual model.

    The grid search result is stored in ``self.h2o_model``; the models in
    ``self.manual_gam_models`` are trained afterwards on the same data.
    """
    base_estimator = H2OGeneralizedAdditiveEstimator(family="gaussian",
                                                     gam_columns=["C11", "C12", "C13"],
                                                     keep_gam_cols=True)
    self.h2o_model = H2OGridSearch(base_estimator, self.hyper_parameters)
    self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

    for manual_model in self.manual_gam_models:
        manual_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
def link_functions_tweedie_vpow():
    """Check that tweedie GAM predictions do not depend on the weights column.

    A 10-row random frame is built with a fixed numpy seed, a tweedie model
    is trained with weights column "W", and predictions on the full frame and
    on the frame without "W" are compared (tolerance 1e-6).
    """
    np.random.seed(1234)
    n_rows = 10
    # Draw order matters for reproducibility: X1, X2, X3, then W, then Y.
    data = {}
    for feature in ("X1", "X2", "X3"):
        data[feature] = np.random.randn(n_rows)
    data["W"] = np.random.choice([10, 20], size=n_rows)
    data["Y"] = np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=n_rows)

    train = h2o.H2OFrame(pd.DataFrame(data))
    test = train.drop("W")
    print(train)

    h2o_model = H2OGeneralizedAdditiveEstimator(family="tweedie", gam_columns=["X3"], weights_column="W",
                                                lambda_=0, tweedie_variance_power=1.5, bs=[2],
                                                tweedie_link_power=0)
    h2o_model.train(x=["X1", "X2"], y="Y", training_frame=train)

    with_weights = h2o_model.predict(train)
    without_weights = h2o_model.predict(test)  # scoring without weight column

    # should produce same frame
    pyunit_utils.compare_frames_local(with_weights, without_weights, prob=1, tol=1e-6)
def setup_data(self):
    """
    Load the gaussian data set and create one manual GAM model per grid point.

    "C1" and "C2" become factor columns and are used as predictors, "C21" is
    the response.  A model is appended to ``self.manual_gam_models`` for every
    combination of alpha and, per subspace, scale, gam_columns, num_knots and bs.
    """
    data_path = pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv")
    self.h2o_data = h2o.import_file(path=data_path)
    for factor_column in ("C1", "C2"):
        self.h2o_data[factor_column] = self.h2o_data[factor_column].asfactor()
    self.myX = ["C1", "C2"]
    self.myY = "C21"

    # Same nesting order as the explicit loops: alpha is the outermost level.
    combinations = (
        (alpha, scale, gam_columns, num_knots, bs)
        for alpha in self.hyper_parameters["alpha"]
        for subspace in self.hyper_parameters["subspaces"]
        for scale in subspace['scale']
        for gam_columns in subspace['gam_columns']
        for num_knots in subspace['num_knots']
        for bs in subspace['bs']
    )
    for alpha, scale, gam_columns, num_knots, bs in combinations:
        manual_model = H2OGeneralizedAdditiveEstimator(family="gaussian", gam_columns=gam_columns,
                                                       keep_gam_cols=True, scale=scale,
                                                       num_knots=num_knots, alpha=alpha, bs=bs)
        self.manual_gam_models.append(manual_model)
def setup_data(self):
    """
    Load the gaussian data set and create one manual GAM model per grid point.

    "C1" and "C2" become factor columns and are used as predictors, "C21" is
    the response.  A model with gam columns C11/C12/C13 is appended to
    ``self.manual_gam_models`` for every combination of lambda, alpha, scale
    and num_knots found in ``self.hyper_parameters``.
    """
    data_path = pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv")
    self.h2o_data = h2o.import_file(path=data_path)
    for factor_column in ("C1", "C2"):
        self.h2o_data[factor_column] = self.h2o_data[factor_column].asfactor()
    self.myX = ["C1", "C2"]
    self.myY = "C21"

    # Same nesting order as the explicit loops: lambda is the outermost level.
    combinations = (
        (lambda_param, alpha_param, scale_param, num_knots_param)
        for lambda_param in self.hyper_parameters['lambda']
        for alpha_param in self.hyper_parameters['alpha']
        for scale_param in self.hyper_parameters['scale']
        for num_knots_param in self.hyper_parameters['num_knots']
    )
    for lambda_param, alpha_param, scale_param, num_knots_param in combinations:
        manual_model = H2OGeneralizedAdditiveEstimator(family="gaussian",
                                                       gam_columns=["C11", "C12", "C13"],
                                                       keep_gam_cols=True, scale=scale_param,
                                                       bs=[2, 2, 0], num_knots=num_knots_param,
                                                       alpha=alpha_param, lambda_=lambda_param)
        self.manual_gam_models.append(manual_model)
def test_gam_transformed_frame_serialization():
    """Check that the centred gam frame is available again after a model reload.

    A multinomial GAM is trained with ``keep_gam_cols=True``.  Its
    "gam_transformed_center_key" frame is downloaded as CSV, the model is
    saved, everything is removed from the cluster, and the model is loaded
    back.  Columns 2:15 of the reloaded frame must match the CSV copy.
    """
    frame_key_field = "gam_transformed_center_key"

    data_path = pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
    h2o_data = h2o.import_file(path=data_path)
    myX = ["C1", "C2"]
    myY = "C11"
    for factor_column in ("C1", "C2", "C11"):
        h2o_data[factor_column] = h2o_data[factor_column].asfactor()

    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial", gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True, scale=[1, 1, 1], num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)

    # Keep a CSV copy of the frame and a saved model before wiping the cluster.
    gam_frame = h2o.get_frame(h2o_model._model_json["output"][frame_key_field])
    tmpdir = tempfile.mkdtemp()
    filename = os.path.join(tmpdir, "gamXFrame.csv")
    h2o.download_csv(gam_frame, filename)
    model_path = h2o.save_model(h2o_model, tmpdir)

    h2o.remove_all()

    loaded_model = h2o.load_model(model_path)
    gam_frame_loaded = h2o.get_frame(loaded_model._model_json["output"][frame_key_field])
    gam_frame_original = h2o.import_file(filename)
    pyunit_utils.compare_frames_local(gam_frame_loaded[2:15], gam_frame_original[2:15], prob=1, tol=1e-6)
    print("Test completed.")
def knots_error():
    """Check the error raised when a gam column has fewer values than knots.

    A random frame is built whose only feature "C1" is drawn from
    ``np.random.randint(0, 9)`` while the model asks for 10 knots.  Training
    must fail and the error text must contain the expected fragments.

    The "should have failed" assertion lives in the ``else`` clause so that a
    training run that unexpectedly succeeds is reported as such instead of
    being caught by ``except Exception``.
    """
    # build a small random data set: one integer feature and a 0/1 target
    np.random.seed(1234)
    data = h2o.H2OFrame(
        python_obj={
            'C1': list(np.random.randint(0, 9, size=1000)),
            'target': list(np.random.randint(0, 2, size=1000))
        })
    # single feature; the target is turned into a factor for classification
    feature_names = ['C1']
    data['target'] = data['target'].asfactor()
    # split into train and validation sets (only the training part is used)
    train, _ = data.split_frame([0.8], seed=1234)
    # build the GAM model
    h2o_model = H2OGeneralizedAdditiveEstimator(
        family='binomial',
        gam_columns=feature_names,
        scale=[1],
        num_knots=[10],
    )
    try:
        h2o_model.train(x=feature_names, y='target', training_frame=train)
    except Exception as ex:
        exception = str(ex)
        assert ("H2OModelBuilderIllegalArgumentException" in exception)
        assert ("has cardinality lower than the number of knots" in exception)
        assert ("chosen gam_column C1 does have not enough values to generate well-defined knots"
                in exception)
    else:
        assert False, "Number of knots validation should have failed"
    print("Error correctly raised when cardinality < num_knots")
def gam_train_metrics_recalculate(family):
    """Check that recomputed training MSE matches the stored training MSE.

    A weighted GAM is trained on a 1000-row random frame.  The MSE obtained by
    scoring a freshly uploaded copy of the training data must agree with the
    MSE from ``model_performance(train=True)`` within 1e-6.

    :param family: family name passed to the GAM estimator.
    """
    np.random.seed(1234)
    n_rows = 1000
    # Draw order matters for reproducibility: X1, X2, X3, then W, then Y.
    data = {}
    for feature in ("X1", "X2", "X3"):
        data[feature] = np.random.randn(n_rows)
    data["W"] = np.random.choice([10, 20], size=n_rows)
    data["Y"] = np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=n_rows) + 0.1

    train = h2o.H2OFrame(pd.DataFrame(data))
    test = train.drop("W")
    print(train)

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=["X3"], weights_column="W",
                                                lambda_=0, bs=[2], tweedie_variance_power=1.5,
                                                tweedie_link_power=0)
    h2o_model.train(x=["X1", "X2"], y="Y", training_frame=train)

    def mse_for(**performance_args):
        return h2o_model.model_performance(**performance_args)._metric_json["MSE"]

    # force H2O to recalculate metrics instead just taking them from metrics cache
    train_clone = h2o.H2OFrame(pd.DataFrame(data))

    print("GAM performance with test_data=train: {0}, with test_data=test: {1} and train=True: {2}".format(
        mse_for(test_data=train), mse_for(test_data=test), mse_for(train=True)))

    assert abs(mse_for(test_data=train_clone) - mse_for(train=True)) < 1e-6
def buildModelCheckPredict(train_data, test_data, model_test_data, myy, gamX, family, actual_family):
    """Train a GAM, predict in-cluster and via MOJO, and compare to a reference.

    :param train_data: training frame containing "C1", "C2", ``myy`` and ``gamX``.
    :param test_data: frame to score.
    :param model_test_data: reference predictions; its last column is dropped
        when it has more columns than the prediction frame, and its column
        names are replaced by those of the prediction frame.
    :param myy: response column name.
    :param gamX: gam columns passed to the estimator.
    :param family: family name passed to the estimator (may be 'AUTO').
    :param actual_family: family expected to be chosen when ``family`` is 'AUTO'.
    :return: the in-cluster prediction frame (without the 'predict' column for
        non-gaussian models).
    """
    h2o_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX, scale=[1, 1, 1],
                                                num_knots=[5, 5, 5], standardize=True, lambda_=[0],
                                                alpha=[0], max_iterations=3, compute_p_values=False,
                                                solver="irlsm")
    h2o_model.train(x=["C1", "C2"], y=myy, training_frame=train_data)

    pred = h2o_model.predict(test_data)
    pred_mojo = as_mojo_model(h2o_model).predict(test_data)

    if pred.ncols < model_test_data.ncols:
        last_column_index = model_test_data.ncols - 1
        model_test_data = model_test_data.drop(last_column_index)
    model_test_data.set_names(pred.names)

    treated_as_gaussian = family == 'gaussian' or (family == 'AUTO' and actual_family == 'gaussian')
    if not treated_as_gaussian:
        # Only the remaining columns are compared for non-gaussian families.
        pred = pred.drop('predict')
        pred_mojo = pred_mojo.drop('predict')
        model_test_data = model_test_data.drop('predict')

    pyunit_utils.compare_frames_local(pred, model_test_data, prob=1)
    pyunit_utils.compare_frames_local(pred_mojo, model_test_data, prob=1)
    return pred
def buildModelCoeffVarimpCheck(train_data, y, gamX, family):
    """Check the number of coefficients and variable importances of a GAM.

    The expected count per class is: categories of "C1" + categories of "C2"
    + sum of knots + 1 - number of gam columns.  For multinomial models the
    count is multiplied by the number of response classes.  The variable
    importance table must have one entry less than the per-class count.

    :param train_data: training frame with categorical "C1" and "C2".
    :param y: response column name.
    :param gamX: gam columns passed to the estimator.
    :param family: family name passed to the estimator.
    """
    knot_counts = [5, 6, 7]
    predictors = ["C1", "C2"]
    category_total = len(train_data["C1"].categories()) + len(train_data["C2"].categories())
    per_class_count = category_total + sum(knot_counts) + 1 - len(knot_counts)

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=gamX, scale=[1, 1, 1],
                                                num_knots=knot_counts)
    h2o_model.train(x=predictors, y=y, training_frame=train_data)

    multinomial = family == 'multinomial'

    coefficients = h2o_model.coef()
    class_count = 1
    if multinomial:
        class_count = len(train_data[y].categories())
        coefficients = coefficients['coefficients']
    expected_total = per_class_count * class_count
    assert len(coefficients) == expected_total, \
        "expected number of coefficients: {0}, actual number of coefficients: {1}".format(
            expected_total, len(coefficients))

    standardized = h2o_model.coef_norm()
    if multinomial:
        standardized = standardized['standardized_coefficients']
    assert len(standardized) == expected_total, \
        "expected number of coefficients: {0}, actual number of coefficients:{1}".format(
            expected_total, len(standardized))

    importances = h2o_model.varimp()
    # exclude the intercept term here
    assert len(importances) == (per_class_count - 1), \
        "expected number of coefficients: {0}, actual number of coefficients:{1}".format(
            per_class_count - 1, len(importances))
def train_models(self):
    """Run the gaussian GAM grid search and train every manual model.

    The grid search uses ``self.hyper_parameters`` and
    ``self.search_criteria`` and is stored in ``self.h2o_model``.  The models
    in ``self.manual_gam_models`` are trained afterwards on the same data.
    """
    base_estimator = H2OGeneralizedAdditiveEstimator(family="gaussian", keep_gam_cols=True)
    self.h2o_model = H2OGridSearch(base_estimator, hyper_params=self.hyper_parameters,
                                   search_criteria=self.search_criteria)
    self.h2o_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)

    for manual_model in self.manual_gam_models:
        manual_model.train(x=self.myX, y=self.myY, training_frame=self.h2o_data)
    print("done")
def setup_data(self):
    """
    Load the binomial data set and create one manual GAM model per grid point.

    The response and the columns C3, C7, C8 and C10 become factor columns.
    ``self.myY`` is "response" and ``self.myX`` is every other column name.
    For every combination of lambda and, per subspace, scale, gam_columns,
    num_knots and bs, ``self.manual_model_count`` is incremented and a model
    is appended to ``self.manual_gam_models``.
    """
    self.h2o_data = \
        h2o.import_file(path=pyunit_utils.locate("smalldata/gam_test/synthetic_20Cols_binomial_20KRows.csv"))
    for factor_column in ('response', 'C3', 'C7', 'C8', 'C10'):
        self.h2o_data[factor_column] = self.h2o_data[factor_column].asfactor()
    self.myY = "response"
    # list.remove() works in place and returns None, so the previous
    # ``names.remove(self.myY)`` always stored None.  Build the list explicitly.
    self.myX = [name for name in self.h2o_data.names if name != self.myY]

    for lambda_ in self.hyper_parameters["lambda"]:
        for subspace in self.hyper_parameters["subspaces"]:
            for scale in subspace['scale']:
                for gam_columns in subspace['gam_columns']:
                    for num_knots in subspace['num_knots']:
                        for bsVal in subspace['bs']:
                            self.manual_model_count += 1
                            self.manual_gam_models.append(
                                H2OGeneralizedAdditiveEstimator(family="binomial", gam_columns=gam_columns,
                                                                scale=scale, num_knots=num_knots,
                                                                bs=bsVal, lambda_=lambda_, seed=1234))
def setup_data(self):
    """
    Load the multinomial data set and create one manual GAM model per grid point.

    "C1" and "C2" become factor columns and are used as predictors; "C11" is
    the (factor) response.  A model is appended to ``self.manual_gam_models``
    for every combination of lambda and, per subspace, scale, gam_columns and
    num_knots.
    """
    data_path = pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
    self.h2o_data = h2o.import_file(path=data_path)
    for factor_column in ("C1", "C2"):
        self.h2o_data[factor_column] = self.h2o_data[factor_column].asfactor()
    self.myX = ["C1", "C2"]
    self.myY = "C11"
    self.h2o_data["C11"] = self.h2o_data["C11"].asfactor()

    # Same nesting order as the explicit loops: lambda is the outermost level.
    combinations = (
        (lambda_, scale, gam_columns, num_knots)
        for lambda_ in self.hyper_parameters["lambda"]
        for subspace in self.hyper_parameters["subspaces"]
        for scale in subspace['scale']
        for gam_columns in subspace['gam_columns']
        for num_knots in subspace['num_knots']
    )
    for lambda_, scale, gam_columns, num_knots in combinations:
        manual_model = H2OGeneralizedAdditiveEstimator(family="multinomial", gam_columns=gam_columns,
                                                       keep_gam_cols=True, scale=scale,
                                                       num_knots=num_knots, lambda_=lambda_)
        self.manual_gam_models.append(manual_model)
def test_gam_gamColumns():
    """Compare the centred gam columns of a model with stored reference files.

    A multinomial GAM is trained with ``keep_gam_cols=True``.  The frame
    behind "gam_transformed_center_key", minus C1, C2 and C11, must match the
    column-wise concatenation of the three reference CSV files.
    """
    data_path = pyunit_utils.locate(
        "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
    h2o_data = h2o.import_file(path=data_path)
    myX = ["C1", "C2"]
    myY = "C11"
    for factor_column in ("C1", "C2", "C11"):
        h2o_data[factor_column] = h2o_data[factor_column].asfactor()

    h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial", gam_columns=["C6", "C7", "C8"],
                                                keep_gam_cols=True, scale=[1, 1, 1], num_knots=[5, 5, 5])
    h2o_model.train(x=myX, y=myY, training_frame=h2o_data)

    gamFrame = h2o.get_frame(h2o_model._model_json["output"]["gam_transformed_center_key"])
    for non_gam_column in ("C1", "C2", "C11"):
        gamFrame = gamFrame.drop(non_gam_column)

    reference_files = [
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C6Gam_center.csv",
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C7Gam_center.csv",
        "smalldata/gam_test/multinomial_10_classes_10_cols_10000_Rows_train_C8Gam_center.csv",
    ]
    # Start from the first reference frame and append the others column-wise.
    gamFrameAns = h2o.import_file(pyunit_utils.locate(reference_files[0]))
    for reference_file in reference_files[1:]:
        gamFrameAns = gamFrameAns.cbind(h2o.import_file(pyunit_utils.locate(reference_file)))

    pyunit_utils.compare_frames_local(gamFrameAns, gamFrame)
    print("gam gamcolumn test completed successfully")
def setup_data(self):
    """
    Load the gaussian data set and create one manual GAM model per grid point.

    The first ten columns of the frame become factor columns.  ``self.myY``
    is "C21" and ``self.myX`` is every other column name.  For every
    combination of lambda and, per subspace, scale, gam_columns, num_knots
    and bs, ``self.manual_model_count`` is incremented and a model is
    appended to ``self.manual_gam_models``.
    """
    self.h2o_data = h2o.import_file(
        path=pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    names = self.h2o_data.names
    # The first ten columns are treated as categorical.
    for name in names[:10]:
        self.h2o_data[name] = self.h2o_data[name].asfactor()

    self.myY = "C21"
    # list.remove() works in place and returns None, so the previous
    # ``names.remove(self.myY)`` always stored None.  Build the list explicitly.
    self.myX = [name for name in names if name != self.myY]

    for lambda_ in self.hyper_parameters["lambda"]:
        for subspace in self.hyper_parameters["subspaces"]:
            for scale in subspace['scale']:
                for gam_columns in subspace['gam_columns']:
                    for num_knots in subspace['num_knots']:
                        for bsVal in subspace['bs']:
                            self.manual_model_count += 1
                            self.manual_gam_models.append(
                                H2OGeneralizedAdditiveEstimator(family="gaussian", gam_columns=gam_columns,
                                                                scale=scale, num_knots=num_knots,
                                                                bs=bsVal, lambda_=lambda_))
def test_gam_knots_key():
    """Compare a multinomial GAM trained with explicit knot frames to one without.

    Three knot frames are uploaded and passed through ``knot_ids``; a second
    model is trained with the same settings but no ``knot_ids``.  Every entry
    under "coefficients" of the first model must match the second within 1e-6.
    """
    print("Checking coefficients and variable importance for multinomial")
    knot_values = [
        [-1.99905699, -0.98143075, 0.02599159, 1.00770987, 1.99942290],
        [-1.999821861, -1.005257990, -0.006716042, 1.002197392, 1.999073589],
        [-1.999675688, -0.979893796, 0.007573327, 1.011437347, 1.999611676],
    ]
    knot_frames = [h2o.H2OFrame(python_obj=values) for values in knot_values]

    h2o_data = h2o.import_file(
        pyunit_utils.locate("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv"))
    response = "C11"
    predictors = ["C1", "C2"]
    for factor_column in ("C1", "C2", "C11"):
        h2o_data[factor_column] = h2o_data[factor_column].asfactor()
    knot_counts = [5, 5, 5]

    def train_and_get_coefs(**extra_params):
        # Same configuration for both models; only extra_params differs.
        gam = H2OGeneralizedAdditiveEstimator(family='multinomial', gam_columns=["C6", "C7", "C8"],
                                              scale=[1, 1, 1], num_knots=knot_counts, **extra_params)
        gam.train(x=predictors, y=response, training_frame=h2o_data)
        return gam.coef()

    coefs_with_keys = train_and_get_coefs(knot_ids=[frame.key for frame in knot_frames])
    coefs_default = train_and_get_coefs()

    for name in coefs_with_keys["coefficients"].keys():
        with_keys_value = coefs_with_keys["coefficients"][name]
        default_value = coefs_default["coefficients"][name]
        assert abs(with_keys_value - default_value) < 1e-6, \
            "expected coefficients: {0}. actual coefficients: {1}".format(
                coefs_with_keys["coefficients"][name], coefs_default["coefficients"][name])
    print("gam knot keys test completed successfully")
def import_gam_mojo_regression(family):
    """Check that a GAM MOJO reproduces the in-cluster model.

    A weighted GAM is trained on a 10-row random frame, saved as a MOJO and
    re-imported.  Predictions with and without the weights column must match
    between both models, and the MSE on cloned train/test frames must agree
    within 1e-6.

    :param family: family name passed to the GAM estimator.
    """
    np.random.seed(1234)
    n_rows = 10
    # Draw order matters for reproducibility: X1, X2, X3, then W, then Y.
    data = {}
    for feature in ("X1", "X2", "X3"):
        data[feature] = np.random.randn(n_rows)
    data["W"] = np.random.choice([10, 20], size=n_rows)
    data["Y"] = np.random.choice([0, 0, 0, 0, 0, 10, 20, 30], size=n_rows) + 0.1

    train = h2o.H2OFrame(pd.DataFrame(data))
    test = train.drop("W")
    print(train)

    h2o_model = H2OGeneralizedAdditiveEstimator(family=family, gam_columns=["X3"], weights_column="W",
                                                lambda_=0, bs=[2], tweedie_variance_power=1.5,
                                                tweedie_link_power=0)
    h2o_model.train(x=["X1", "X2"], y="Y", training_frame=train)
    print(h2o_model)

    def mse_of(performance):
        return performance._metric_json["MSE"]

    predict_w = h2o_model.predict(train)
    # scoring without weight column
    predict = h2o_model.predict(test)

    # get train perf on a cloned frame (to avoid re-using cached metrics - force to recalculate)
    train_clone = h2o.H2OFrame(train.as_data_frame(use_pandas=True))
    model_perf_on_train = h2o_model.model_performance(test_data=train_clone)

    # ditto on test
    test_clone = h2o.H2OFrame(test.as_data_frame(use_pandas=True))
    model_perf_on_test = h2o_model.model_performance(test_data=test_clone)

    # should produce same frame
    pyunit_utils.compare_frames_local(predict_w, predict, prob=1, tol=1e-6)

    # Save the MOJO to a temporary file
    mojo_directory = tempfile.mkdtemp()
    mojo_path = h2o_model.save_mojo(mojo_directory)

    # Load the model from the temporary file
    mojo_model = h2o.import_mojo(mojo_path)

    predict_mojo_w = mojo_model.predict(train)
    predict_mojo = mojo_model.predict(test)

    # Both should produce same results as in-H2O models
    pyunit_utils.compare_frames_local(predict_mojo_w, predict, prob=1, tol=1e-6)
    pyunit_utils.compare_frames_local(predict_mojo, predict, prob=1, tol=1e-6)

    mojo_perf_on_train = mojo_model.model_performance(test_data=train_clone)
    assert abs(mse_of(mojo_perf_on_train) - mse_of(model_perf_on_train)) < 1e-6

    mojo_perf_on_test = mojo_model.model_performance(test_data=test_clone)
    assert abs(mse_of(mojo_perf_on_test) - mse_of(model_perf_on_test)) < 1e-6
def buildModelCheckPredict(train_data, myy, gamX, family, searchLambda=False, stdardize=True):
    """Train and return a GAM model with three gam columns of five knots each.

    :param train_data: training frame containing "C1", "C2", ``myy`` and ``gamX``.
    :param myy: response column name.
    :param gamX: gam columns passed to the estimator.
    :param family: family name passed to the estimator.
    :param searchLambda: forwarded as ``lambda_search``.
    :param stdardize: forwarded as ``standardize``.
    :return: the trained estimator.
    """
    estimator_params = dict(
        family=family,
        gam_columns=gamX,
        scale=[0.1, 0.1, 0.1],
        num_knots=[5, 5, 5],
        lambda_search=searchLambda,
        standardize=stdardize,
    )
    h2o_model = H2OGeneralizedAdditiveEstimator(**estimator_params)
    h2o_model.train(x=["C1", "C2"], y=myy, training_frame=train_data)
    return h2o_model
def test_gam_beta_constraints():
    """Train two identical gaussian GAM models with beta constraints.

    Both models receive the same constraint frame (bounds for "C1" and "C13")
    and the same seed, so their coefficients must match within 1e-6.  In
    addition, every coefficient whose name contains "_is_" must be >= 0.
    """
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/gaussian_20cols_10000Rows.csv"))
    for factor_column in ("C1", "C2"):
        h2o_data[factor_column] = h2o_data[factor_column].asfactor()

    constraint_rows = [["C1", 0.0, 0.5], ["C13", 0.0, 0.7]]
    beta_constraints = h2o.H2OFrame(constraint_rows)
    beta_constraints.set_names(["names", "lower_bounds", "upper_bounds"])

    response = "C21"
    predictors = ["C1", "C2", "C13"]
    knot_counts = [5, 5, 5]

    def train_constrained_model():
        gam = H2OGeneralizedAdditiveEstimator(family='gaussian', gam_columns=["C11", "C12", "C13"],
                                              scale=[1, 1, 1], num_knots=knot_counts, bs=[2, 2, 0],
                                              beta_constraints=beta_constraints, seed=12)
        gam.train(x=predictors, y=response, training_frame=h2o_data)
        return gam

    h2o_model = train_constrained_model()
    first_coefs = h2o_model.coef()
    h2o_model2 = train_constrained_model()
    second_coefs = h2o_model2.coef()

    for name in first_coefs.keys():
        assert abs(first_coefs[name] - second_coefs[name]) < 1e-6, \
            "expected coefficients: {0}. actual coefficients: {1}".format(first_coefs[name],
                                                                         second_coefs[name])

    # check to make sure gam column coefficients are non-negative
    coef_dict = h2o_model.coef()
    for key in coef_dict.keys():
        if "_is_" not in key:
            continue
        assert coef_dict[key] >= 0
def test_gam_model_predict():
    """Check that a validation frame does not change multinomial GAM coefficients.

    Two cross-validated multinomial GAMs with identical settings are trained on
    a 90% split of the covtype data: one without and one with a validation
    frame.  Their ``'coefficients'`` entries must be equal.

    NOTE(review): another function with this same name is defined later in this
    file; confirm which definition is meant to be active.
    """
    covtype_df = h2o.import_file(pyunit_utils.locate("bigdata/laptop/covtype/covtype.full.csv"))
    train, valid = covtype_df.split_frame([0.9], seed=1234)

    # Prepare predictors and response columns
    column_names = covtype_df.col_names
    covtype_X = column_names[:-1]
    covtype_y = column_names[-1]  # last column is Cover_Type, our desired response variable

    def new_estimator():
        # Both models use exactly the same configuration; only train() differs.
        return H2OGeneralizedAdditiveEstimator(family='multinomial',
                                               solver='IRLSM',
                                               gam_columns=["Slope"],
                                               scale=[0.0001],
                                               num_knots=[5],
                                               standardize=True,
                                               nfolds=2,
                                               fold_assignment='modulo',
                                               alpha=[0.9, 0.5, 0.1],
                                               lambda_search=True,
                                               nlambdas=5,
                                               max_iterations=3)

    # build model with cross validation and no validation dataset
    gam_multi = new_estimator()
    gam_multi.train(covtype_X, covtype_y, training_frame=train)

    # build model with cross validation and with validation dataset
    gam_multi_valid = new_estimator()
    gam_multi_valid.train(covtype_X, covtype_y, training_frame=train, validation_frame=valid)

    # model should yield the same coefficients in both case
    coef_without_valid = gam_multi.coef()
    coef_with_valid = gam_multi_valid.coef()
    pyunit_utils.assertEqualCoeffDicts(coef_without_valid['coefficients'],
                                       coef_with_valid['coefficients'])
def test_gam_effective_parameters():
    """Check the input vs. actual values of AUTO parameters on a binomial GAM.

    With the default setting, ``solver`` is reported as ``IRLSM`` and
    ``fold_assignment`` as ``None`` in ``actual_value``.  After setting the
    ``sys.ai.h2o.algos.evaluate_auto_model_parameters`` property to "false",
    both keep ``AUTO`` as their actual value.  The property is set back to
    "true" afterwards, even if an assertion fails.
    """
    h2o_data = h2o.import_file(pyunit_utils.locate("smalldata/glm_test/binomial_20_cols_10KRows.csv"))
    for column in ("C1", "C2", "C21"):
        h2o_data[column] = h2o_data[column].asfactor()

    def train_gam():
        # Same configuration for both phases of the test.
        model = H2OGeneralizedAdditiveEstimator(family='binomial',
                                                gam_columns=["C11", "C12", "C13"],
                                                scale=[1, 1, 1],
                                                num_knots=[5, 6, 7],
                                                standardize=True,
                                                Lambda=[0],
                                                alpha=[0],
                                                max_iterations=3)
        model.train(x=["C1", "C2"], y="C21", training_frame=h2o_data)
        return model

    def set_auto_evaluation(flag):
        h2o.rapids("(setproperty \"{}\" \"{}\")".format(
            "sys.ai.h2o.algos.evaluate_auto_model_parameters", flag))

    gam = train_gam()
    solver = gam.parms['solver']
    fold_assignment = gam.parms['fold_assignment']
    assert solver['input_value'] == 'AUTO'
    assert solver['actual_value'] == "IRLSM"
    assert fold_assignment['input_value'] == 'AUTO'
    assert fold_assignment['actual_value'] is None

    try:
        set_auto_evaluation("false")
        gam = train_gam()
        solver = gam.parms['solver']
        fold_assignment = gam.parms['fold_assignment']
        assert solver['input_value'] == 'AUTO'
        assert solver['actual_value'] == 'AUTO'
        assert fold_assignment['input_value'] == 'AUTO'
        assert fold_assignment['actual_value'] == 'AUTO'
    finally:
        # Always restore the property so later tests see the default behaviour.
        set_auto_evaluation("true")
def test_gam_model_predict():
    """Run GAM scoring checks for gaussian, multinomial and binomial families.

    For each family a training frame and a frame of reference predictions are
    loaded and handed to ``buildModelCheckPredict``.  Finally a
    fractionalbinomial GAM is trained and scored just to make sure it runs.

    NOTE(review): ``buildModelCheckPredict`` is called here with six positional
    arguments; confirm this matches the signature of the definition that is in
    scope when this test runs.
    """
    def load_training_frame(relative_path):
        # Import a dataset and turn the C1/C2 predictors into categoricals.
        frame = h2o.import_file(pyunit_utils.locate(relative_path))
        frame["C1"] = frame["C1"].asfactor()
        frame["C2"] = frame["C2"].asfactor()
        return frame

    print("Checking model scoring for gaussian")
    h2o_data = load_training_frame("smalldata/glm_test/gaussian_20cols_10000Rows.csv")
    myY = "C21"
    model_test_data = h2o.import_file(pyunit_utils.locate("smalldata/gam_test/predictGaussianGAM1.csv"))
    buildModelCheckPredict(h2o_data, h2o_data, model_test_data, myY, ["C11", "C12", "C13"], 'gaussian')

    print("Checking model scoring for multinomial")
    h2o_data = load_training_frame("smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv")
    myY = "C11"
    h2o_data[myY] = h2o_data[myY].asfactor()
    model_test_data = h2o.import_file(pyunit_utils.locate("smalldata/gam_test/predictMultinomialGAM1.csv"))
    buildModelCheckPredict(h2o_data, h2o_data, model_test_data, myY, ["C6", "C7", "C8"], 'multinomial')

    print("Checking model scoring for binomial")
    h2o_data = load_training_frame("smalldata/glm_test/binomial_20_cols_10KRows.csv")
    myY = "C21"
    h2o_data[myY] = h2o_data[myY].asfactor()
    model_test_data = h2o.import_file(pyunit_utils.locate("smalldata/gam_test/predictBinomialGAMRPython.csv"))
    buildModelCheckPredict(h2o_data, h2o_data, model_test_data, myY, ["C11", "C12", "C13"], 'binomial')
    print("gam coeff/varimp test completed successfully")

    # add fractional binomial just to make sure it runs
    print("Checking model scoring for fractionalbinomial")
    frac_data = load_training_frame("smalldata/glm_test/binomial_20_cols_10KRows.csv")
    frac_model = H2OGeneralizedAdditiveEstimator(family="fractionalbinomial",
                                                 gam_columns=["C11", "C12", "C13"],
                                                 scale=[1, 1, 1],
                                                 num_knots=[5, 5, 5],
                                                 standardize=True,
                                                 solver="irlsm")
    frac_model.train(x=["C1", "C2"], y="C21", training_frame=frac_data)
    # The predictions are not inspected; a successful call is the whole check.
    predictTest = frac_model.predict(frac_data)