def custom_distribution_mojo_test():
    """Train a GBM with a custom bernoulli distribution and check that H2O
    predict and MOJO predict produce identical results."""
    n_test_rows = 2000
    frame = random_dataset('binomial', verbose=False, NTESTROWS=n_test_rows)
    frame['response'] = frame['response'].asnumeric()
    train_frame = frame[n_test_rows:, :]
    test_frame = frame[:n_test_rows, :]
    predictors = list(set(frame.names) - {"response"})
    gbm_params = {
        'ntrees': 10,
        'max_depth': 4,
        'distribution': "custom",
        'custom_distribution_func': custom_distribution_bernoulli(),
    }
    gbm_model = build_save_model_GBM(gbm_params, predictors, train_frame, "response")
    mojo_name = getMojoName(gbm_model._id)
    # NOTE(review): realpath('__file__') resolves the literal string relative to
    # the CWD — kept byte-identical to preserve the existing results location.
    tmp_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", mojo_name))
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test_frame[predictors], os.path.join(tmp_dir, 'in.csv'))
    pred_h2o, pred_mojo = mojo_predict(gbm_model, tmp_dir, mojo_name)
    assert compare_frames_local(
        pred_h2o, pred_mojo, returnResult=True
    ), "Predictions from model and MOJO model are not the same."
def glm_binomial_mojo_pojo():
    """Build a binomial GLM on a random dataset and verify that H2O, MOJO and
    POJO predictions all agree.

    Bug fix: the debug save_model call referenced an undefined name
    ``glmOrdinalModel`` (NameError at runtime); it now saves the model that
    was actually built here.
    """
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "binomial"
    params = set_params()  # set model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    glmBinomialModel = pyunit_utils.build_save_model_generic(
        params, x, train, "response", "glm", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id)
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glmBinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    # was: h2o.save_model(glmOrdinalModel, ...) -> NameError; save the model built above
    h2o.save_model(glmBinomialModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def gam_gaussian_mojo():
    """Build a gaussian GAM on a random dataset and verify that H2O predict
    and MOJO predict agree.

    Bug fix: the debug save_model call referenced an undefined name
    ``glmOrdinalModel`` (NameError at runtime); it now saves the GAM model
    that was actually built here.
    """
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    PROBLEM = "gaussian"
    params = set_params()  # set model parameters
    df = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001)
    dfnames = df.names
    # add GAM-specific parameters: pick up to num_gam_cols real-typed columns
    params["gam_columns"] = []
    params["scale"] = []
    count = 0
    num_gam_cols = 3  # maximum number of gam columns
    for cname in dfnames:
        if cname != 'response' and str(df.type(cname)) == "real":
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            count += 1
            if count >= num_gam_cols:
                break
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    TMPDIR = tempfile.mkdtemp()
    gamGaussianModel = pyunit_utils.build_save_model_generic(
        params, x, train, "response", "gam", TMPDIR)  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(gamGaussianModel._id)
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        gamGaussianModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    # was: h2o.save_model(glmOrdinalModel, ...) -> NameError; save the model built above
    h2o.save_model(gamGaussianModel, path=TMPDIR, force=True)  # save model for debugging
def glrm_mojo():
    """Train a GLRM with a randomly chosen transform, save its MOJO, and check
    that MOJO reconstruction and the MOJO x factor both match the H2O model."""
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    frame = pyunit_utils.random_dataset("regression")  # generate random dataset
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    predictors = frame.names
    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms) - 1)]
    # build a GLRM model with the random dataset generated earlier
    glrm_model = H2OGeneralizedLowRankEstimator(
        k=3, transform=chosen_transform, max_iterations=10)
    glrm_model.train(x=predictors, training_frame=train)
    train_factor = h2o.get_frame(
        glrm_model._model_json['output']['representation_name'])
    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(
            train_factor.nrows, train.nrows)
    save_GLRM_mojo(glrm_model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(glrm_model._id)
    tmp_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", mojo_name))
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[predictors], os.path.join(tmp_dir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glrm_model, tmp_dir, mojo_name, glrmReconstruct=True)  # save mojo predict
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(
        glrm_model, tmp_dir, mojo_name, glrmReconstruct=False)  # save mojo XFactor
    test_factor = h2o.get_frame("GLRMLoading_" + frame_id)  # x factor for the test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(test_factor, mojo_x_factor, 1, tol=1e-10)
def glm_multinomial_mojo_pojo():
    """Build a multinomial GLM on a random dataset and verify that H2O, MOJO
    and POJO predictions all agree.

    Bug fix: the debug save_model call referenced an undefined name
    ``glmOrdinalModel`` (NameError at runtime); it now saves the model that
    was actually built here.
    """
    PROBLEM = "multinomial"
    NTESTROWS = 200  # number of test dataset rows
    params = set_params()  # set model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    glmMultinomialModel = pyunit_utils.build_save_model_GLM(
        params, x, train, "response")  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", MOJONAME))
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glmMultinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    # was: h2o.save_model(glmOrdinalModel, ...) -> NameError; save the model built above
    h2o.save_model(glmMultinomialModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def drf_mojo_reproducibility_info():
    """Build a DRF model on a random dataset and check that the reproducibility
    information table is populated in the model output.

    Bug fixes: the excluded column was misspelled "respose" (so the response
    column stayed in the predictor list), and the ``isinstance`` checks
    discarded their results — they are now asserted.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, len(problems) - 1)]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    x = list(set(df.names) - {"response"})  # was {"respose"}: typo kept response in x
    params = {'ntrees': 50, 'max_depth': 4}
    drfModel = pyunit_utils.build_save_model_generic(
        params, x, train, "response", "drf", tempfile.mkdtemp())
    repro = drfModel._model_json['output']['reproducibility_information_table']
    # was bare isinstance(...) expressions whose results were silently discarded
    assert isinstance(repro[1]['h2o_cluster_uptime'][0], float)
    assert isinstance(repro[0]['java_version'][0], str)
    assert repro[2]['input_frame'][0] == 'training_frame'
def glrm_mojo():
    """Train a seeded GLRM, save its MOJO, verify the MOJO x factor matches the
    model's at an overridden iteration count, and time MOJO scoring at 8000 vs
    2 glrm iterations."""
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    frame = pyunit_utils.random_dataset("regression", seed=1234)
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    predictors = frame.names
    transform_type = "STANDARDIZE"
    # build a GLRM model with the random dataset generated earlier
    glrm_model = H2OGeneralizedLowRankEstimator(
        k=3, transform=transform_type, max_iterations=10, seed=1234)
    glrm_model.train(x=predictors, training_frame=train)
    train_factor = h2o.get_frame(
        glrm_model._model_json['output']['representation_name'])
    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(
            train_factor.nrows, train.nrows)
    save_GLRM_mojo(glrm_model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(glrm_model._id)
    tmp_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", mojo_name))
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[predictors], os.path.join(tmp_dir, 'in.csv'))
    # make sure overriding the iteration number does not break prediction
    pred_id, pred_mojo = pyunit_utils.mojo_predict(
        glrm_model, tmp_dir, mojo_name, glrmIterNumber=100)  # save mojo predict
    pred_h2o = h2o.get_frame("GLRMLoading_" + pred_id)
    print("Comparing mojo x Factor and model x Factor for 100 iterations")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    # scoring with 2 iterations should be shorter than scoring with 8000 iterations
    start = time.time()
    runMojoPredictOnly(tmp_dir, mojo_name, glrmIterNumber=8000)
    elapsed_8000 = time.time() - start
    start = time.time()
    runMojoPredictOnly(tmp_dir, mojo_name, glrmIterNumber=2)
    elapsed_2 = time.time() - start
    print("Time taken for 2 iterations: {0}s. Time taken for 8000 iterations: {1}s."
          .format(elapsed_2, elapsed_8000))
def drf_leaf_node_assignment_mojo_test():
    """Build a DRF model and verify that H2O and MOJO leaf-node assignments
    agree.

    Bug fix: the excluded column was misspelled "respose", which left the
    response column in the predictor list.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, len(problems) - 1)]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    x = list(set(df.names) - {"response"})  # was {"respose"}: typo kept response in x
    params = {'ntrees': 50, 'max_depth': 4}
    TMPDIR = tempfile.mkdtemp()
    my_gbm = pyunit_utils.build_save_model_generic(
        params, x, train, "response", "DRF", TMPDIR)
    MOJONAME = pyunit_utils.getMojoName(my_gbm._id)
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        my_gbm, TMPDIR, MOJONAME,
        get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def gbm_leaf_node_assignment_mojo_test():
    """Build a GBM model and verify that H2O and MOJO leaf-node assignments
    agree.

    Bug fix: the excluded column was misspelled "respose", which left the
    response column in the predictor list.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, len(problems) - 1)]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    test = df[:TESTROWS, :]
    x = list(set(df.names) - {"response"})  # was {"respose"}: typo kept response in x
    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}
    my_gbm = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    MOJONAME = pyunit_utils.getMojoName(my_gbm._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", MOJONAME))
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        my_gbm, TMPDIR, MOJONAME,
        get_leaf_node_assignment=True)  # load model and perform predict
    pyunit_utils.compare_string_frames_local(pred_h2o, pred_mojo, 0.5)
def glrm_mojo():
    """Train a seeded GLRM with a randomly chosen transform; compare MOJO
    reconstruction against H2O predict (both direct and via a saved/reloaded
    model) and compare the MOJO x factor against the model's."""
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    frame = pyunit_utils.random_dataset("regression", seed=1234)
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    predictors = frame.names
    transforms = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    chosen_transform = transforms[randint(0, len(transforms) - 1)]
    # build a GLRM model with the random dataset generated earlier
    glrm_model = H2OGeneralizedLowRankEstimator(
        k=3, transform=chosen_transform, max_iterations=10, seed=1234)
    glrm_model.train(x=predictors, training_frame=train)
    train_factor = h2o.get_frame(
        glrm_model._model_json['output']['representation_name'])
    assert train_factor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(
            train_factor.nrows, train.nrows)
    save_GLRM_mojo(glrm_model)  # save mojo model
    mojo_name = pyunit_utils.getMojoName(glrm_model._id)
    tmp_dir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", mojo_name))
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[predictors], os.path.join(tmp_dir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glrm_model, tmp_dir, mojo_name, glrmReconstruct=True)  # save mojo predict
    h2o.save_model(glrm_model, tmp_dir)  # save GLRM model
    reloaded_model = h2o.load_model(os.path.join(tmp_dir, mojo_name))
    pred_reloaded = reloaded_model.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            pred_reloaded[col] = pred_reloaded[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_reloaded, 1, tol=1e-10)
    frame_id, mojo_x_factor = pyunit_utils.mojo_predict(
        glrm_model, tmp_dir, mojo_name, glrmReconstruct=False)  # save mojo XFactor
    test_factor = h2o.get_frame("GLRMLoading_" + frame_id)  # x factor for the test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(test_factor, mojo_x_factor, 1, tol=1e-10)
def pca_mojo():
    """For every dataset transform type, train a PCA model on a wide random
    dataset and verify that H2O predict and MOJO predict agree."""
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    frame = pyunit_utils.random_dataset(
        "regression", ncol_upper=8000, ncol_lower=5000,
        missing_fraction=0.001, seed=1234)
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    predictors = frame.names
    # pyunit test loop: compare H2O predict and mojo predict per transform type
    for transform_type in ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]:
        pca_model = H2OPrincipalComponentAnalysisEstimator(
            k=3, transform=transform_type, seed=1234,
            impute_missing=True, use_all_factor_levels=False)
        pca_model.train(x=predictors, training_frame=train)
        pyunit_utils.saveModelMojo(pca_model)  # save mojo model
        mojo_name = pyunit_utils.getMojoName(pca_model._id)
        tmp_dir = os.path.normpath(
            os.path.join(os.path.dirname(os.path.realpath('__file__')),
                         "..", "results", mojo_name))
        # h2o predict and mojo predict both read the same saved test file
        h2o.download_csv(test[predictors], os.path.join(tmp_dir, 'in.csv'))
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
            pca_model, tmp_dir, mojo_name)  # save mojo predict
        for col in range(pred_h2o.ncols):
            if pred_h2o[col].isfactor():
                pred_h2o[col] = pred_h2o[col].asnumeric()
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
def glm_multinomial_mojo_pojo():
    """Build a multinomial GLM on a random dataset and verify that H2O, MOJO
    and POJO predictions all agree.

    Bug fix: the debug save_model call referenced an undefined name
    ``glmOrdinalModel`` (NameError at runtime); it now saves the model that
    was actually built here.
    """
    PROBLEM = "multinomial"
    NTESTROWS = 200  # number of test dataset rows
    params = set_params()  # set model parameters
    df = pyunit_utils.random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})
    glmMultinomialModel = pyunit_utils.build_save_model_GLM(
        params, x, train, "response")  # build and save mojo model
    MOJONAME = pyunit_utils.getMojoName(glmMultinomialModel._id)
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", MOJONAME))
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        glmMultinomialModel, TMPDIR, MOJONAME)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv"))
    pred_pojo = pyunit_utils.pojo_predict(glmMultinomialModel, TMPDIR, MOJONAME)
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10)
    # was: h2o.save_model(glmOrdinalModel, ...) -> NameError; save the model built above
    h2o.save_model(glmMultinomialModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def test_gbm_mangus():
    """Train a GBM with early stopping and verify that printing the performance
    on a test frame without the response column does not raise.

    Bug fix: the bare ``except:`` also caught BaseException (e.g.
    KeyboardInterrupt/SystemExit); narrowed to ``except Exception``.
    """
    train = pyunit_utils.random_dataset("regression")  # generate random dataset
    test = train.drop("response")
    xname = list(set(train.names) - {"response"})
    model_original = H2OGradientBoostingEstimator(
        ntrees=10, max_depth=2, col_sample_rate=0.8, sample_rate=0.7,
        stopping_rounds=3, seed=1234, score_tree_interval=10,
        learn_rate=0.1, stopping_metric="rmse")
    model_original.train(x=xname, y="response", training_frame=train)
    score_original_h2o = model_original.model_performance(test)
    print("H2O score on original test frame:")
    try:
        print(score_original_h2o)
    except Exception:  # was a bare except: would also swallow KeyboardInterrupt
        assert False, "Should not have failed here with empty model metrics message."
def gam_gaussian_mojo():
    """Build a gaussian GAM on a random dataset with heavy missing data and
    verify that H2O predict and MOJO predict agree."""
    h2o.remove_all()
    n_test_rows = 200  # number of test dataset rows
    params = set_params()  # set model parameters
    frame = pyunit_utils.random_dataset("gaussian", seed=2, missing_fraction=0.5)
    # add GAM-specific parameters: use up to three real-typed columns as gam columns
    params["gam_columns"] = []
    params["scale"] = []
    max_gam_cols = 3  # maximum number of gam columns
    for cname in frame.names:
        if cname != 'response' and str(frame.type(cname)) == "real":
            params["gam_columns"].append(cname)
            params["scale"].append(0.001)
            if len(params["gam_columns"]) >= max_gam_cols:
                break
    train = frame[n_test_rows:, :]
    test = frame[:n_test_rows, :]
    excluded = {"response", params["gam_columns"][0]}
    x = list(set(frame.names) - excluded)
    tmp_dir = tempfile.mkdtemp()
    gam_model = pyunit_utils.build_save_model_generic(
        params, x, train, "response", "gam", tmp_dir)  # build and save mojo model
    mojo_name = pyunit_utils.getMojoName(gam_model._id)
    # h2o predict and mojo predict both read the same saved test file
    h2o.download_csv(test, os.path.join(tmp_dir, 'in.csv'))
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(
        gam_model, tmp_dir, mojo_name)  # load model and perform predict
    h2o.download_csv(pred_h2o, os.path.join(tmp_dir, "h2oPred.csv"))
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
def test_gbm_mangus():
    """Train a GBM with early stopping and verify that printing the performance
    on a test frame without the response column does not raise.

    Bug fix: the bare ``except:`` also caught BaseException (e.g.
    KeyboardInterrupt/SystemExit); narrowed to ``except Exception``.
    """
    train = pyunit_utils.random_dataset("regression")  # generate random dataset
    test = train.drop("response")
    xname = list(set(train.names) - {"response"})
    model_original = H2OGradientBoostingEstimator(
        ntrees=10, max_depth=2, col_sample_rate=0.8, sample_rate=0.7,
        stopping_rounds=3, seed=1234, score_tree_interval=10,
        learn_rate=0.1, stopping_metric="rmse")
    model_original.train(x=xname, y="response", training_frame=train)
    score_original_h2o = model_original.model_performance(test)
    print("H2O score on original test frame:")
    try:
        print(score_original_h2o)
    except Exception:  # was a bare except: would also swallow KeyboardInterrupt
        assert False, "Should not have failed here with empty model metrics message."
def gbm_mojo_reproducibility_info():
    """Check that reproducibility information is populated for a GBM model, and
    that it survives a MOJO download/upload round trip (including the
    calibration-frame entry for a calibrated model).

    Bug fixes: the excluded column was misspelled "respose" (so the response
    stayed in x); the ``isinstance`` checks discarded their results (now
    asserted); ``os.makedirs`` now tolerates a pre-existing results directory.
    """
    problems = ['binomial', 'multinomial', 'regression']
    PROBLEM = problems[randint(0, len(problems) - 1)]
    TESTROWS = 2000
    df = pyunit_utils.random_dataset(PROBLEM, verbose=False, NTESTROWS=TESTROWS)
    train = df[TESTROWS:, :]
    x = list(set(df.names) - {"response"})  # was {"respose"}: typo kept response in x
    params = {'ntrees': 50, 'learn_rate': 0.1, 'max_depth': 4}
    gbmModel = pyunit_utils.build_save_model_GBM(params, x, train, "response")
    repro = gbmModel._model_json['output']['reproducibility_information_table']
    # was bare isinstance(...) expressions whose results were silently discarded
    assert isinstance(repro[1]['h2o_cluster_uptime'][0], float)
    assert isinstance(repro[0]['java_version'][0], str)
    assert repro[2]['input_frame'][0] == 'training_frame'

    # second model: calibrated GBM, round-tripped through a MOJO file
    ecology = h2o.import_file(
        path=pyunit_utils.locate("smalldata/gbm_test/ecology_model.csv"))
    ecology['Angaus'] = ecology['Angaus'].asfactor()
    train, calib = ecology.split_frame(seed=12354)
    predictors = ecology.columns[3:13]
    w = h2o.create_frame(binary_fraction=1, binary_ones_fraction=0.5,
                         missing_fraction=0, rows=744, cols=1)
    w.set_names(["weight"])
    train = train.cbind(w)
    model = H2OGradientBoostingEstimator(ntrees=10, max_depth=5, min_rows=10,
                                         learn_rate=0.1,
                                         distribution="multinomial",
                                         weights_column="weight",
                                         calibrate_model=True,
                                         calibration_frame=calib)
    model.train(x=predictors, y="Angaus", training_frame=train)
    print("Downloading Java prediction model code from H2O")
    TMPDIR = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath('__file__')),
                     "..", "results", model._id))
    os.makedirs(TMPDIR, exist_ok=True)  # was os.makedirs(TMPDIR): raised on rerun
    mojo_path = model.download_mojo(path=TMPDIR)
    gbmModel = h2o.upload_mojo(mojo_path=mojo_path)
    repro = gbmModel._model_json['output']['reproducibility_information_table']
    assert isinstance(repro[1]['h2o_cluster_uptime'][0], float)
    assert isinstance(repro[0]['java_version'][0], str)
    assert repro[2]['input_frame'][0] == 'training_frame'
    assert repro[2]['input_frame'][2] == 'calibration_frame'