import sys
sys.path.insert(1, "../../../")
import os
import tempfile
import h2o
from tests import pyunit_utils
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


def test_gamma_dispersion_factor():
    training_data = h2o.import_file(
        "http://h2o-public-test-data.s3.amazonaws.com/smalldata/glm_test/gamma_dispersion_factor_9_10kRows.csv")
    weight = pyunit_utils.random_dataset_real_only(training_data.nrow, 1, realR=2, misFrac=0, randSeed=12345)
    weight = weight.abs()
    training_data = training_data.cbind(weight)
    Y = 'resp'
    x = ['abs.C1.', 'abs.C2.', 'abs.C3.', 'abs.C4.', 'abs.C5.']
    model_ml = H2OGeneralizedLinearEstimator(family='gamma', lambda_=0, compute_p_values=True,
                                             dispersion_factor_method="ml", weights_column="abs(C1)")
    model_ml.train(training_frame=training_data, x=x, y=Y)
    true_dispersion_factor = 9  # dispersion parameter used to generate the dataset
    R_dispersion_factor = 9.3  # reference estimate from R on the same data
    dispersion_factor_ml_estimated = model_ml._model_json["output"]["dispersion"]
    print("True dispersion parameter {0}. Estimated ml dispersion parameter {1}.".format(
        true_dispersion_factor, dispersion_factor_ml_estimated))
    assert abs(true_dispersion_factor - dispersion_factor_ml_estimated) <= abs(R_dispersion_factor - true_dispersion_factor), \
        "H2O dispersion parameter ml estimate {0} is worse than that of R {1}. True dispersion parameter is " \
        "{2}".format(dispersion_factor_ml_estimated, R_dispersion_factor, true_dispersion_factor)
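
# Illustrative sketch, not part of the original test: the same model refit with the Pearson
# dispersion estimator for a side-by-side comparison against the ML estimate above. The
# "pearson" method value and the helper name are assumptions made for illustration only.
def fit_gamma_dispersion_pearson(training_data, x, y):
    model_pearson = H2OGeneralizedLinearEstimator(family='gamma', lambda_=0, compute_p_values=True,
                                                  dispersion_factor_method="pearson",
                                                  weights_column="abs(C1)")
    model_pearson.train(training_frame=training_data, x=x, y=y)
    return model_pearson._model_json["output"]["dispersion"]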
def glm_gamma_offset_mojo():
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate_complete.csv.zip"))
    y = "DPROS"
    x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL"]
    x_offset = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "C1"]
    params = {'family': "gamma", 'offset_column': "C1"}
    offset = pyunit_utils.random_dataset_real_only(train.nrow, 1, realR=3, misFrac=0, randSeed=12345)
    train = train.cbind(offset)
    tmpdir = tempfile.mkdtemp()
    glm_gamma_model = pyunit_utils.build_save_model_generic(params, x, train, y, "glm", tmpdir)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glm_gamma_model._id)
    h2o.download_csv(train[x_offset], os.path.join(tmpdir, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glm_gamma_model, tmpdir, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(tmpdir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # compare mojo and model predict
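
# Illustrative sketch, not part of the original test: scoring the saved MOJO offline with
# h2o.mojo_predict_csv against the "in.csv" file the test writes out. The mojo_zip_path
# argument is left to the caller because the exact location produced by
# pyunit_utils.build_save_model_generic is not shown here.
def score_gamma_mojo_offline(tmpdir, mojo_zip_path):
    return h2o.mojo_predict_csv(input_csv_path=os.path.join(tmpdir, "in.csv"),
                                mojo_zip_path=mojo_zip_path)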
def random_dataset(nrow, ncol, realFrac=0.4, intFrac=0.3, enumFrac=0.3, factorR=10, integerR=100,
                   responseFactor=1, misFrac=0.01, randSeed=None):
    fractions = dict()
    if (ncol == 1) and (realFrac >= 1.0):
        # single real column requested: build it directly with create_frame
        fractions["real_fraction"] = 1
        fractions["categorical_fraction"] = 0
        fractions["integer_fraction"] = 0
        fractions["time_fraction"] = 0
        fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
        fractions["binary_fraction"] = 0
        return h2o.create_frame(rows=nrow, cols=ncol, missing_fraction=misFrac, has_response=True,
                                response_factors=responseFactor, integer_range=integerR,
                                seed=randSeed, **fractions)

    # build the real columns separately, then fill the rest with categorical/integer columns
    real_part = pyunit_utils.random_dataset_real_only(nrow, int(realFrac * ncol), misFrac=misFrac, randSeed=randSeed)
    cnames = ['c_' + str(ind) for ind in range(real_part.ncol)]
    real_part.set_names(cnames)
    enumFrac = enumFrac + (1 - realFrac) / 2
    intFrac = 1 - enumFrac
    fractions["real_fraction"] = 0
    fractions["categorical_fraction"] = enumFrac
    fractions["integer_fraction"] = intFrac
    fractions["time_fraction"] = 0
    fractions["string_fraction"] = 0  # Right now we are dropping string columns, so no point in having them.
    fractions["binary_fraction"] = 0
    df = h2o.create_frame(rows=nrow, cols=(ncol - real_part.ncol), missing_fraction=misFrac, has_response=True,
                          response_factors=responseFactor, integer_range=integerR, seed=randSeed, **fractions)
    return real_part.cbind(df)
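
# Illustrative sketch, not part of the original helpers: a hypothetical call showing how
# random_dataset above can be used to build a mixed-type frame (by default roughly 40% real,
# 30% integer and 30% categorical columns) for the MOJO tests.
def demo_random_dataset():
    demo_frame = random_dataset(1000, 10, misFrac=0.0, randSeed=42)
    demo_frame.describe()
    return demo_frame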
def gam_ordinal_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "multinomial"
    params = set_params()  # GAM model parameters; set_params() is defined alongside this test
    df1 = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001)  # generate random dataset
    df = pyunit_utils.random_dataset_real_only(nrow=df1.nrow, ncol=3)
    df.set_names(["gam_col1", "gam_col2", "gam_col3"])
    df = df1.cbind(df)
    dfnames = df.names
    # add GAM specific parameters
    params["gam_columns"] = []
    params["scale"] = []
    count = 0
    num_gam_cols = 3  # maximum number of gam columns
    for cname in dfnames:
        if not (cname == 'response') and (str(df.type(cname)) == "real"):
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            count = count + 1
            if count >= num_gam_cols:
                break
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    gamOrdinalModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamOrdinalModel._id)
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamOrdinalModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
    h2o.save_model(gamOrdinalModel, path=TMPDIR, force=True)  # save model for debugging
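
# Sketch of the usual pyunit runner boilerplate, assumed here for completeness: in the H2O
# test suite each of the functions above normally lives in its own module and is run through
# pyunit_utils.standalone_test; a single wrapper is used below purely for illustration.
def _run_all_tests():
    test_gamma_dispersion_factor()
    glm_gamma_offset_mojo()
    gam_ordinal_mojo()


if __name__ == "__main__":
    pyunit_utils.standalone_test(_run_all_tests)
else:
    _run_all_tests()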