def setup_grid(): h2o.remove_all() hyper_parameters = OrderedDict() hyper_parameters["learn_rate"] = [0.1, 0.05, 0.01] hyper_parameters["ntrees"] = [1, 3, 5] gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters) return gs
def pca_pubdev_4167_OOM(): """ This pyunit is written to make sure PCA works with customer data. It is mainly used by the customer to verify PCA operations and is not to be used as a regular test since I do not want to expose customer data. """ h2o.remove_all() transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"] # make sure we check all transforms transformN = transform_types[randint(0, len(transform_types)-1)] print("transform used on dataset is {0}.\n".format(transformN)) training_data = h2o.import_file(path=pyunit_utils.locate("/Users/wendycwong/gitBackup/SDatasets/pubdev_4167_Avkash/m120K.tar")) # Nidhi: import may not work gramSVDPCA = H2OPCA(k=training_data.ncols, transform=transformN) gramSVDPCA.train(x=list(range(0, training_data.ncols)), training_frame=training_data) powerSVDPCA = H2OPCA(k=training_data.ncols, transform=transformN, pca_method="Power") powerSVDPCA.train(x=list(range(0, training_data.ncols)), training_frame=training_data) # compare singular values between Power and GramSVD methods print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n") pyunit_utils.assert_H2OTwoDimTable_equal(gramSVDPCA._model_json["output"]["importance"], powerSVDPCA._model_json["output"]["importance"], ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"], tolerance=1e-5, check_all=False) print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n") # compare singular vectors pyunit_utils.assert_H2OTwoDimTable_equal(gramSVDPCA._model_json["output"]["eigenvectors"], powerSVDPCA._model_json["output"]["eigenvectors"], powerSVDPCA._model_json["output"]["names"], tolerance=1e-1, check_sign=True)
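# The test above follows the usual h2o-3 pyunit layout; a minimal sketch of the
# typical entry-point boilerplate is shown below. The sys.path insert and the
# H2OPCA import path are assumptions that depend on the h2o version and on where
# the test file lives in the repo.
import sys
sys.path.insert(1, "../../")  # assumed relative path to the h2o-py package
import h2o
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as H2OPCA
from random import randint
from tests import pyunit_utils

if __name__ == "__main__":
    pyunit_utils.standalone_test(pca_pubdev_4167_OOM)
else:
    pca_pubdev_4167_OOM()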
def grid_resume(): train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) # Run GBM Grid Search ntrees_opts = [1,5] hyper_parameters = OrderedDict() hyper_parameters["ntrees"] = ntrees_opts print("GBM grid with the following hyper_parameters:", hyper_parameters) export_dir = pyunit_utils.locate("results") gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, export_checkpoints_dir=export_dir) gs.train(x=list(range(4)), y=4, training_frame=train) grid_id = gs.grid_id old_grid_model_count = len(gs.model_ids) print("Baseline grid has %d models" % old_grid_model_count) h2o.remove_all() train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) grid = h2o.load_grid(export_dir + "/" + grid_id) assert grid is not None assert len(grid.model_ids) == old_grid_model_count grid.train(x=list(range(4)), y=4, training_frame=train) assert len(grid.model_ids) == old_grid_model_count print("Newly trained grid has %d models" % len(grid.model_ids)) for model_id in grid.model_ids: model = h2o.get_model(model_id) assert model is not None
def deepwater_checkpoint(): if not H2ODeepWaterEstimator.available(): return ## build a model #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv")) frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) frame.drop(0) frame[1] = frame[1].asfactor() print(frame.head(5)) model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-5, stopping_rounds=0, score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0) model.train(y=1, training_frame=frame) ## save the model model_path = h2o.save_model(model) ## delete everything - simulate cluster shutdown and restart h2o.remove_all() ## reimport the model and the frame model = h2o.load_model(model_path) #frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv")) frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv")) frame.drop(0) frame[1] = frame[1].asfactor() ## delete the checkpoint file os.remove(model_path) ## continue training model2 = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-5, stopping_rounds=0,score_duty_cycle=1, train_samples_per_iteration=-1, score_interval=0, checkpoint=model.model_id) model2.train(y=1, training_frame=frame) model2.show()
def test_gam_transformed_frame_serialization(): h2o_data = h2o.import_file(path=pyunit_utils.locate( "smalldata/glm_test/multinomial_10_classes_10_cols_10000_Rows_train.csv" )) h2o_data["C1"] = h2o_data["C1"].asfactor() h2o_data["C2"] = h2o_data["C2"].asfactor() myX = ["C1", "C2"] myY = "C11" h2o_data["C11"] = h2o_data["C11"].asfactor() h2o_model = H2OGeneralizedAdditiveEstimator(family="multinomial", gam_columns=["C6", "C7", "C8"], keep_gam_cols=True, scale=[1, 1, 1], num_knots=[5, 5, 5]) h2o_model.train(x=myX, y=myY, training_frame=h2o_data) gam_frame = h2o.get_frame( h2o_model._model_json["output"]["gam_transformed_center_key"]) tmpdir = tempfile.mkdtemp() filename = os.path.join(tmpdir, "gamXFrame.csv") h2o.download_csv(gam_frame, filename) model_path = h2o.save_model(h2o_model, tmpdir) h2o.remove_all() loaded_model = h2o.load_model(model_path) gam_frame_loaded = h2o.get_frame( loaded_model._model_json["output"]["gam_transformed_center_key"]) gam_frame_original = h2o.import_file(filename) pyunit_utils.compare_frames_local(gam_frame_loaded[2:15], gam_frame_original[2:15], prob=1, tol=1e-6) print("Test completed.")
def glrm_mojo(): h2o.remove_all() NTESTROWS = 200 # number of test dataset rows df = pyunit_utils.random_dataset("regression") # generate random dataset train = df[NTESTROWS:, :] test = df[:NTESTROWS, :] x = df.names transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"] transformN = transform_types[randint(0, len(transform_types)-1)] # build a GLRM model with random dataset generated earlier glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10) glrmModel.train(x=x, training_frame=train) glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name']) assert glrmTrainFactor.nrows==train.nrows, \ "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows) save_GLRM_mojo(glrmModel) # save mojo model MOJONAME = pyunit_utils.getMojoName(glrmModel._id) TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME)) h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict for col in range(pred_h2o.ncols): if pred_h2o[col].isfactor(): pred_h2o[col] = pred_h2o[col].asnumeric() print("Comparing mojo predict and h2o predict...") pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10) frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID) # store the x Factor for new test dataset print("Comparing mojo x Factor and model x Factor ...") pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
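# save_GLRM_mojo is called above but not defined in this excerpt; a hedged
# sketch follows, assuming it only downloads the MOJO into the same
# results/<mojo name> directory that the test later reads back from. This is a
# hypothetical helper, not necessarily the project's exact implementation.
import os
from tests import pyunit_utils

def save_GLRM_mojo(model):
    mojo_name = pyunit_utils.getMojoName(model._id)
    tmpdir = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')),
                                           "..", "results", mojo_name))
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)
    # download_mojo is a standard H2O model method; it returns the path of the zip file
    return model.download_mojo(path=tmpdir)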
def grid_resume(): train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) # Run GBM Grid Search ntrees_opts = [1, 3] learn_rate_opts = [0.1, .05] hyper_parameters = OrderedDict() hyper_parameters["learn_rate"] = learn_rate_opts hyper_parameters["ntrees"] = ntrees_opts print("GBM grid with the following hyper_parameters:", hyper_parameters) export_dir = pyunit_utils.locate("results") gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters) gs.train(x=list(range(4)), y=4, training_frame=train) grid_id = gs.grid_id old_grid_model_count = len(gs.model_ids) print("Baseline grid has %d models" % old_grid_model_count) saved_path = h2o.save_grid(export_dir, grid_id) h2o.remove_all() train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) grid = h2o.load_grid(saved_path) assert grid is not None assert len(grid.model_ids) == old_grid_model_count # Modify the hyperspace - should add new models to the grid hyper_parameters["ntrees"] = [2,5] grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters, grid_id=grid.grid_id) grid.train(x=list(range(4)), y=4, training_frame=train) print("Newly trained grid has %d models" % len(grid.model_ids)) assert len(grid.model_ids) == 2 * old_grid_model_count for model_id in grid.model_ids: model = h2o.get_model(model_id) assert model is not None
def deeplearning_mojo_pojo(): h2o.remove_all() params = set_params() # set deeplearning model parameters df = random_dataset(PROBLEM) # generate random dataset train = df[NTESTROWS:, :] test = df[:NTESTROWS, :] x = list(set(df.names) - {"response"}) try: deeplearningModel = build_save_model(params, x, train) # build and save mojo model h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME) # load model and perform predict pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME) h2o.save_model(deeplearningModel, path=TMPDIR, force=True) # save model for debugging print("Comparing mojo predict and h2o predict...") pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10) # make sure operation sequence is preserved from Tomk print("Comparing pojo predict and h2o predict...") pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10) except Exception as ex: print("*************** ERROR and type is ") print(str(type(ex))) print(ex) if "AssertionError" in str(type(ex)): # only care if there is an AssertionError, ignore the others sys.exit(1)
def glrm_mojo(): h2o.remove_all() train = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_train.csv")) test = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_mojo_test.csv")) predict_10iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_10iter.csv")) predict_1iter = h2o.import_file(pyunit_utils.locate("smalldata/glrm_test/pubdev_5858_glrm_predict_1iter.csv")) x = train.names transformN = "STANDARDIZE" # build a GLRM model with random dataset generated earlier glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234, init="random") glrmModel.train(x=x, training_frame=train) glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name']) assert glrmTrainFactor.nrows==train.nrows, \ "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows) save_GLRM_mojo(glrmModel) # save mojo model MOJONAME = pyunit_utils.getMojoName(glrmModel._id) TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME)) h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file # test and make sure setting the iteration number did not screw up the prediction predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=100) # save mojo predict pred_h2o = h2o.get_frame("GLRMLoading_"+predID) print("Comparing mojo x Factor and model x Factor for 100 iterations") pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10) predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=1) # save mojo predict print("Comparing mojo x Factor and model x Factor for 1 iterations") pyunit_utils.compare_frames_local(predict_1iter, pred_mojo, 1, tol=1e-10) predID, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmIterNumber=10) # save mojo predict print("Comparing mojo x Factor and model x Factor for 10 iterations") pyunit_utils.compare_frames_local(predict_10iter, pred_mojo, 1, tol=1e-10)
def test_modelselection_backward_serialization(): d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv")) y = "GLEASON" x = ["ID","AGE","RACE","CAPSULE","DCAPS","PSA","VOL","DPROS"] # make sure duplicate runs produce same results model_backward = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5, lambda_=0, theta=0.01) model_backward.train(training_frame=d, x=x, y=y) model_backward2 = modelSelection(seed=12345, mode="backward", family='negativebinomial', link="log",alpha=0.5, lambda_=0, theta=0.01) model_backward2.train(training_frame=d, x=x, y=y) result = model_backward.result() # get result frame result2 = model_backward2.result() # get result frame pyunit_utils.compare_frames_local(result[2:5], result2[2:5], prob=1.0) # compare results from both models and they should be the same num_models = result.nrows # number of models built one_model = h2o.get_model(result["model_id"][num_models-1, 0]) predict_frame = one_model.predict(d) tmpdir = tempfile.mkdtemp() file_dir = os.path.join(tmpdir, "predict.csv") h2o.download_csv(predict_frame, file_dir) # save one scoring frame model_path_backward = model_backward.download_model(tmpdir) # store the model h2o.remove_all() d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv")) loaded_backward_model = h2o.load_model(model_path_backward) result_frame_backward = loaded_backward_model.result() model_from_frame_backward = h2o.get_model(result_frame_backward["model_id"][num_models-1, 0]) pred_frame_backward = model_from_frame_backward.predict(d) pred_frame_model = h2o.import_file(file_dir) pyunit_utils.compare_frames_local(pred_frame_backward, pred_frame_model, prob=1.0)
def check_big_merge(): h2o.remove_all() nrow = 1000000 ncol = 2 iRange = 100000 frame1 = h2o.create_frame(rows=nrow, cols=ncol, integer_fraction=1, seed=12345, integer_range=iRange, missing_fraction=0.0) frame2 = h2o.create_frame(rows=nrow, cols=ncol, integer_fraction=1, seed=54321, integer_range=iRange, missing_fraction=0.0) frame1.set_names(["C1", "C2"]) frame2.set_names(["C1", "C3"]) mergedExact = frame1.merge(frame2, by_x=["C1"], by_y=["C1"], all_x=False, all_y=False) mergedLeft = frame1.merge(frame2, by_x=["C1"], by_y=["C1"], all_x=True) assert mergedExact.nrow < mergedLeft.nrow, "Expected row numbers are wrong"
def glm_binomial_mojo_pojo(): h2o.remove_all() NTESTROWS = 200 # number of test dataset rows PROBLEM = "binomial" params = set_params() # set GLM model parameters df = pyunit_utils.random_dataset(PROBLEM) # generate random dataset train = df[NTESTROWS:, :] test = df[:NTESTROWS, :] x = list(set(df.names) - {"response"}) TMPDIR = tempfile.mkdtemp() glmBinomialModel = pyunit_utils.build_save_model_generic( params, x, train, "response", "glm", TMPDIR) # build and save mojo model MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id) h2o.download_csv(test[x], os.path.join( TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file pred_h2o, pred_mojo = pyunit_utils.mojo_predict( glmBinomialModel, TMPDIR, MOJONAME) # load model and perform predict h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv")) pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME) print("Comparing mojo predict and h2o predict...") pyunit_utils.compare_frames_local( pred_h2o, pred_mojo, 0.1, tol=1e-10 ) # make sure operation sequence is preserved from Tomk h2o.save_model(glmBinomialModel, path=TMPDIR, force=True) # save model for debugging print("Comparing pojo predict and h2o predict...") pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
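# set_params is referenced above but not included here; a hedged sketch of a
# plausible GLM-binomial parameter dict for build_save_model_generic. The
# specific values below are illustrative assumptions, not the original test's.
def set_params():
    params = {"family": "binomial",
              "alpha": 0.5,
              "lambda_": 1e-4,
              "standardize": True,
              "missing_values_handling": "MeanImputation"}
    print("GLM model parameters: {0}".format(params))
    return params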
def test_anovaglm_serialization(): train = h2o.import_file(path=pyunit_utils.locate( "smalldata/prostate/prostate_complete.csv.zip")) y = 'CAPSULE' x = ['AGE', 'VOL', 'DCAPS'] train[y] = train[y].asfactor() anovaglm_model = anovaglm(family='binomial', lambda_=0, missing_values_handling="skip") anovaglm_model.train(x=x, y=y, training_frame=train) tmpdir = tempfile.mkdtemp() model_path = anovaglm_model.download_model(tmpdir) result_frame_filename = os.path.join(tmpdir, "result_frame.csv") h2o.download_csv(anovaglm_model.result(), result_frame_filename) h2o.remove_all() result_frame_original = h2o.import_file(result_frame_filename) loaded_anovaglm_model = h2o.load_model(model_path) result_frame_loaded = loaded_anovaglm_model.result() for cind in list(range(0, result_frame_original.ncols)): for rind in list(range(0, result_frame_original.nrows)): if result_frame_original.type(cind) == 'real': assert abs(result_frame_original[rind, cind]-result_frame_loaded[rind, cind]) < 1e-6, \ "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind]) else: assert result_frame_original[rind, cind]==result_frame_loaded[rind, cind], \ "Expected: {0}. Actual: {1}".format(result_frame_original[rind, cind], result_frame_loaded[rind, cind])
def grid_export_with_cv(): train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) # Run GBM Grid Search hyper_parameters = OrderedDict() hyper_parameters["ntrees"] = [1, 2] # train with CV gs = H2OGridSearch(H2OGradientBoostingEstimator(nfolds=2, keep_cross_validation_predictions=True, seed=42), hyper_params=hyper_parameters) gs.train(x=list(range(4)), y=4, training_frame=train) holdout_frame_ids = list(map(lambda m: m.cross_validation_holdout_predictions().frame_id, gs.models)) # materialize the ids before the frames are removed export_dir = pyunit_utils.locate("results") saved_path = h2o.save_grid(export_dir, gs.grid_id, export_cross_validation_predictions=True) h2o.remove_all() grid = h2o.load_grid(saved_path) assert grid is not None for holdout_frame_id in holdout_frame_ids: assert h2o.get_frame(holdout_frame_id) is not None train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) stack = H2OStackedEnsembleEstimator(base_models=grid.model_ids) stack.train(x=list(range(4)), y=4, training_frame=train) predicted = stack.predict(train) assert predicted.nrow == train.nrow
def gam_gaussian_mojo(): h2o.remove_all() NTESTROWS = 200 # number of test dataset rows PROBLEM="gaussian" params = set_params() # set GAM model parameters df = pyunit_utils.random_dataset(PROBLEM, missing_fraction=0.001) # generate random dataset dfnames = df.names # add GAM specific parameters params["gam_columns"] = [] params["scale"] = [] count = 0 num_gam_cols = 3 # maximum number of gam columns for cname in dfnames: if not(cname == 'response') and (str(df.type(cname)) == "real"): params["gam_columns"].append(cname) params["scale"].append(0.001) count = count+1 if (count >= num_gam_cols): break train = df[NTESTROWS:, :] test = df[:NTESTROWS, :] x = list(set(df.names) - {"response"}) TMPDIR = tempfile.mkdtemp() gamGaussianModel = pyunit_utils.build_save_model_generic(params, x, train, "response", "gam", TMPDIR) # build and save mojo model MOJONAME = pyunit_utils.getMojoName(gamGaussianModel._id) h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file pred_h2o, pred_mojo = pyunit_utils.mojo_predict(gamGaussianModel, TMPDIR, MOJONAME) # load model and perform predict h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv")) print("Comparing mojo predict and h2o predict...") pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10) # make sure operation sequence is preserved from Tomk h2o.save_model(gamGaussianModel, path=TMPDIR, force=True) # save model for debugging
def test_frame_reload(): work_dir = tempfile.mkdtemp() iris = h2o.import_file( path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv")) df_key = iris.key df_pd_orig = iris.as_data_frame() iris.save(work_dir) try: iris.save(work_dir, force=False) # fails because file exists except H2OResponseError as e: assert e.args[0].exception_msg.startswith("File already exists") try: h2o.load_frame(df_key, work_dir, force=False) # fails because frame exists except H2OResponseError as e: assert e.args[ 0].exception_msg == "Frame Key<Frame> iris_wheader.hex already exists." df_loaded_force = h2o.load_frame(df_key, work_dir) h2o.remove(iris) df_loaded = h2o.load_frame(df_key, work_dir, force=False) df_pd_loaded_force = df_loaded_force.as_data_frame() df_pd_loaded = df_loaded.as_data_frame() assert df_pd_orig.equals(df_pd_loaded_force) assert df_pd_orig.equals(df_pd_loaded) # try running grid search on the frame h2o.remove_all() df_loaded = h2o.load_frame(df_key, work_dir) hyper_parameters = OrderedDict() hyper_parameters["ntrees"] = [5, 10, 20, 30] grid_small = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_parameters) grid_small.train(x=list(range(4)), y=4, training_frame=df_loaded) assert len(grid_small.models) == 4
def check_story(story_name, paragraphs): h2o.remove_all() h2o.log_and_echo("------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("CHECKING: {0}".format(story_name)) h2o.log_and_echo("") h2o.log_and_echo("------------------------------------------------------------") # 1. Combine the related, individual code paragraphs into a single, coherent python story story = [] for p in paragraphs: with open(p, "r") as f: story = story + f.readlines() # 2. Execute the story # first, remove any h2o.init calls remove_lines = [] for idx, l in enumerate(story): if "h2o.init" in l: remove_lines.append(idx) story = [i for j, i in enumerate(story) if j not in remove_lines] # write the story that will be executed to the results directory for future reference story_file = os.path.join(results_dir(), test_name()+"."+story_name+".code") with open(story_file, 'w') as f: f.writelines(story) # run it with open(story_file, "r") as s: booklet = s.read() booklet_c = compile(booklet, '<string>', 'exec') p = {} exec(booklet_c, p)
def save_load_model_with_cv(): prostate = h2o.import_file( pyunit_utils.locate("smalldata/prostate/prostate.csv")) prostate["CAPSULE"] = prostate["CAPSULE"].asfactor() prostate_gbm = H2OGradientBoostingEstimator( nfolds=2, keep_cross_validation_predictions=True) prostate_gbm.train(x=["AGE", "RACE", "PSA", "DCAPS"], y="CAPSULE", training_frame=prostate) path = pyunit_utils.locate("results") model_path = h2o.save_model(prostate_gbm, path=path, force=True, export_cross_validation_predictions=True) assert os.path.isfile( model_path ), "Expected model artifact {0} to exist, but it does not.".format( model_path) h2o.remove_all() prostate_gbm_reloaded = h2o.load_model(model_path) assert isinstance(prostate_gbm_reloaded, H2OGradientBoostingEstimator), \ "Expected H2OGradientBoostingEstimator, but got {0}".format(prostate_gbm_reloaded) holdout_frame_id = prostate_gbm.cross_validation_holdout_predictions( ).frame_id assert h2o.get_frame(holdout_frame_id) is not None
def deeplearning_mojo_pojo(): h2o.remove_all() params = set_params() # set deeplearning model parameters df = random_dataset(PROBLEM) # generate random dataset train = df[NTESTROWS:, :] test = df[:NTESTROWS, :] x = list(set(df.names) - {"response"}) try: deeplearningModel = build_save_model( params, x, train) # build and save mojo model h2o.download_csv(test[x], os.path.join( TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file pred_h2o, pred_mojo = pyunit_utils.mojo_predict( deeplearningModel, TMPDIR, MOJONAME) # load model and perform predict # pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME) h2o.save_model(deeplearningModel, path=TMPDIR, force=True) # save model for debugging print("Comparing mojo predict and h2o predict...") pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-6) # print("Comparing pojo predict and h2o predict...") # pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-6) except Exception as ex: print("*************** ERROR and type is ") print(str(type(ex))) print(ex) if "AssertionError" in str( type(ex) ): # only care if there is an AssertionError, ignore the others sys.exit(1)
def train_with_h2o_full_data(time): import h2o from h2o.automl import H2OAutoML h2o.init() h2o.remove_all() raw_train_df = h2o.import_file( path='data/train_dataset/train_dataset_temp.csv') raw_test_df = h2o.import_file( path='data/test_dataset/test_dataset_temp.csv') train_df = raw_train_df[:, 1:] y = 'var_29' col_list = train_df.columns print(col_list) x = col_list[:-1] print('x:', x) print('y:', y) splits = train_df.split_frame(ratios=[0.9], seed=1) train = splits[0] test = splits[1] # partial-data training on 50000 rows (commented out) # aml1 = H2OAutoML(max_runtime_secs=time,balance_classes=False,stopping_tolerance=0.005,stopping_rounds=50,sort_metric='MAE',stopping_metric='MAE',seed=2019, project_name="part_data_train") # aml1.train(x=x,y=y,training_frame=train,leaderboard_frame=test) # full data train aml2 = H2OAutoML(max_runtime_secs=time, balance_classes=False, stopping_tolerance=0.005, stopping_rounds=50, sort_metric='MAE', stopping_metric='MAE', seed=2019, project_name="full_data_train") aml2.train(x=x, y=y, training_frame=train_df) # path1=h2o.save_model(model=aml1,path='models',force=True) # path2=h2o.save_model(model=aml2,path='models',force=True) # print(aml1.leaderboard) print('++++++++++++++++++++++') print(aml2.leaderboard) # ans1=aml1.predict(raw_test_df[:,1:]) ans2 = aml2.predict(raw_test_df[:, 1:]) # print(ans1) print(ans2) # ans1=ans1.as_data_frame() ans2 = ans2.as_data_frame() # ans1.to_csv('data/ans1.csv',index=False) ans2.to_csv('data/ans2_time_{}.csv'.format(str(time)), index=False) # res1=pd.DataFrame() temp = pd.read_csv('data/test_dataset/test_dataset_temp.csv') # res1['id']=temp['var_0'] # res1['score']=ans1.values # res1.to_csv('data/h2o_pred_submission_v1.csv',index=False) res2 = pd.DataFrame() res2['id'] = temp['var_0'] res2['score'] = ans2.values res2['score'] = res2['score'].apply(lambda x: int(x) if (x - int(x)) < 0.5 else int(x) + 1) res2.to_csv('data/h2o_pred_submission_int_v2_time_{}.csv'.format(time), index=False)
def impute_missing_values(data_frame, columns): if not isinstance(data_frame, pd.DataFrame): return if not isinstance(columns, list): return # Impute summary impute_summary = {} # result frame result_frame = pd.DataFrame(data_frame) # Start h2o server h2o.init(max_mem_size_GB=5) for column in columns: print("Processing :", column) # Defining columns response_column = column training_columns = list(data_frame.columns) training_columns.remove(response_column) # Creating h2o frame training_frame = h2o.H2OFrame(data_frame) training_frame.set_names(list(data_frame.columns)) # Defining model model = H2ORandomForestEstimator(ntrees=75, max_depth=25, nbins=25, binomial_double_trees=True, nfolds=10) model.train(x=training_columns, y=response_column, training_frame=training_frame) # Predict values predictions = model.predict(test_data=training_frame) predictions = list(map(float, h2OColumnToList(predictions))) # Add predictions to the result frame result_frame[column] = predictions actual = data_frame[column] predicted = result_frame[column] rmse = sqrt(mean_squared_error(actual, predicted)) impute_summary[column] = ('RMSE', rmse) # Removing all processes h2o.remove_all() # Displaying impute summary for key in impute_summary: print(impute_summary[key], key) return result_frame
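# h2OColumnToList is used above but not defined in this excerpt; a minimal
# sketch, assuming it simply flattens the single-column prediction H2OFrame
# into a plain Python list.
def h2OColumnToList(h2o_frame):
    # as_data_frame() pulls the frame into pandas; take the first column's values
    return h2o_frame.as_data_frame().iloc[:, 0].tolist()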
def test_stacked_ensemble_is_able_to_use_imported_base_models(): import tempfile, shutil, glob train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv")) test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv")) x = train.columns y = "species" x.remove(y) nfolds = 2 gbm = H2OGradientBoostingEstimator(nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True) gbm.train(x=x, y=y, training_frame=train) drf = H2ORandomForestEstimator(nfolds=nfolds, fold_assignment="Modulo", keep_cross_validation_predictions=True) drf.train(x=x, y=y, training_frame=train) se = H2OStackedEnsembleEstimator(training_frame=train, validation_frame=test, base_models=[gbm.model_id, drf.model_id]) se.train(x=x, y=y, training_frame=train) assert len(se.base_models) == 2 TMP_DIR = tempfile.mkdtemp() try: h2o.save_model(gbm, TMP_DIR + "/gbm.model") h2o.save_model(drf, TMP_DIR + "/drf.model") gbm_holdout_id = gbm.cross_validation_holdout_predictions().frame_id drf_holdout_id = drf.cross_validation_holdout_predictions().frame_id h2o.export_file(gbm.cross_validation_holdout_predictions(), TMP_DIR + "/gbm.holdout") h2o.export_file(drf.cross_validation_holdout_predictions(), TMP_DIR + "/drf.holdout") h2o.remove_all() h2o.import_file(TMP_DIR + "/gbm.holdout", gbm_holdout_id) h2o.import_file(TMP_DIR + "/drf.holdout", drf_holdout_id) gbm = h2o.upload_model(glob.glob(TMP_DIR + "/gbm.model/*")[0]) drf = h2o.upload_model(glob.glob(TMP_DIR + "/drf.model/*")[0]) train = h2o.import_file(pu.locate("smalldata/iris/iris_train.csv"), "some_other_name_of_training_frame") test = h2o.import_file(pu.locate("smalldata/iris/iris_test.csv"), "some_other_name_of_test_frame") x = train.columns y = "species" x.remove(y) se_loaded = H2OStackedEnsembleEstimator(training_frame=train, validation_frame=test, base_models=[gbm.model_id, drf.model_id]) se_loaded.train(x=x, y=y, training_frame=train) assert len(se_loaded.base_models) == 2 finally: shutil.rmtree(TMP_DIR)
def validacion_r(modelo, hyper_parameters, datos, variables, semilla=1234): h2o.init(max_mem_size=14) train = h2o.H2OFrame(datos[0]) tipificar_h2o(train) splits = train.split_frame(ratios=[0.7], seed=semilla) gs = H2OGridSearch(modelo, hyper_params=hyper_parameters) gs.train(x=variables, y="Tendencia", training_frame=splits[0]) resultados=procesamiento_resultados_binario(gs,splits,datos) h2o.remove_all() return(resultados)
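# tipificar_h2o and procesamiento_resultados_binario come from the surrounding
# project and are not shown here; a hedged sketch of the former, assuming it
# only casts the binary target column to a factor before the grid search.
def tipificar_h2o(frame):
    # H2OFrame column assignment mutates the frame in place
    frame["Tendencia"] = frame["Tendencia"].asfactor()
    return frame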
def deeplearning_mojo_pojo(): h2o.remove_all() problemtypes = ["regression", "binomial", "multinomial"] autoEncoderOn = [True, False] for encoderOn in autoEncoderOn: for problem in problemtypes: print("AutoEncoderOn is: {0} and problem type is: {1}".format(encoderOn, problem)) random.seed(9876) # set python random seed runComparisonTests(encoderOn, problem)
def h2o_test_setup(sys_args): h2o_py_dir = os.path.realpath( os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")) h2o_docs_dir = os.path.realpath( os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "h2o-docs")) parse_args(sys_args) sys.path.insert(1, h2o_py_dir) import h2o from tests import pyunit_utils, pydemo_utils, pybooklet_utils for pkg in (pyunit_utils, pybooklet_utils): setattr(pkg, '__on_hadoop__', _ON_HADOOP_) setattr(pkg, '__hadoop_namenode__', _HADOOP_NAMENODE_) setattr(pkg, '__test_name__', _TEST_NAME_) setattr(pkg, '__results_dir__', _RESULTS_DIR_) if _IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_: pass else: raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: " "{0}".format(_TEST_NAME_)) print("[{0}] {1}\n".format( strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_))) auth = None if _LDAP_USER_NAME_ is not None and _LDAP_PASSWORD_ is not None: auth = (_LDAP_USER_NAME_, _LDAP_PASSWORD_) h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False, auth=auth) h2o.utils.config.H2OConfigReader.get_config( )["general.allow_breaking_changes"] = True #rest_log = os.path.join(_RESULTS_DIR_, "rest.log") #h2o.start_logging(rest_log) #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log)) h2o.log_and_echo( "------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_) h2o.log_and_echo("") h2o.log_and_echo( "------------------------------------------------------------") h2o.remove_all() if _IS_IPYNB_: pydemo_utils.ipy_notebook_exec(_TEST_NAME_) elif _IS_PYUNIT_: pyunit_utils.pyunit_exec(_TEST_NAME_) elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_) elif _IS_PYDEMO_: pydemo_utils.pydemo_exec(_TEST_NAME_)
def standalone_test(test): h2o.init(strict_version_check=False) h2o.remove_all() h2o.log_and_echo("------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("STARTING TEST") h2o.log_and_echo("") h2o.log_and_echo("------------------------------------------------------------") test()
def sort(): h2o.remove_all() df = h2o.import_file( pyunit_utils.locate( "bigdata/laptop/jira/PUBDEV_6829_srot_bug_bigKey_part.csv.zip")) t1 = time.time() df1 = df.sort([1]) assert df1[0, 1] <= df1[1, 1], "Test failed: Sort bug." print("Time taken to perform sort is {0}".format(time.time() - t1)) pyunit_utils.check_sorted_1_column(df1, 1, prob=0.00001, ascending=True) # check some rows
def deepwater_lenet(): if not H2ODeepWaterEstimator.available(): return frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv")) print(frame.head(5)) model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, network='lenet', score_interval=0, train_samples_per_iteration=1000) model.train(x=[0],y=1, training_frame=frame) model.show() error = model.model_performance(train=True).mean_per_class_error() h2o.remove_all() assert error < 0.1, "mean classification error is too high : " + str(error)
def test_rbind_summary(): h2o.remove_all() df = h2o.H2OFrame([1, 2, 5.5], destination_frame="df") # original frame dfr = h2o.H2OFrame([5.5, 1, 2], destination_frame="dfr") # reversed row content df1 = df[2, :] df2 = df[:2, :] summary = df1.summary(return_data=True) df3 = df1.rbind(df2) # fixed df3r = df2.rbind(df1) compareFramesLocal(dfr, df3) # should contain 5.5, 1, 2 compareFramesLocal(df, df3r) # should contain 1,2,5.5 df1 = df[3, :] # this will result in an NA since we do not have 4 rows in df. dfr[0, 0] = float('nan') df4 = df1.rbind(df2) compareFramesLocal(df4, dfr) # should contain NA, 1, 2 # performing the same test with an additional categorical column per Michalk request. h2o.remove_all() df = h2o.H2OFrame([[1, "a"], [2, "b"], [5.5, "c"]], destination_frame="dfc") # original frame df[1] = df[1].asfactor() dfr = h2o.H2OFrame([[5.5, "c"], [1, "a"], [2, "b"]], destination_frame="dfrc") # reversed row content dfr[1] = df[1].asfactor() # this somehow switches the row content of the factor column to be alphabetical dfr[0, 1] = 'c' dfr[1, 1] = 'a' dfr[2, 1] = 'b' df1 = df[2, :] df2 = df[:2, :] summary = df1.summary(return_data=True) df3 = df1.rbind(df2) # fixed df3r = df2.rbind(df1) compareFramesLocal(dfr, df3) # should contain 5.5, 1, 2 compareFramesLocal(df, df3r) # should contain 1,2,5.5 # copying test from Michalk df1 = h2o.H2OFrame([[1, "a"], [2, "b"]]) df1[1] = df1[1].asfactor() df2 = h2o.H2OFrame([[2.2, "b"], [1.1, "a"]]) df2[1] = df2[1].asfactor() print(df1.summary()) print(df2.summary()) df3 = df1.rbind(df2) assert df3.nrow==(df1.nrow+df2.nrow), "Expected rbind rows: {0}, actual rows: " \ "{1}".format(df1.nrow+df2.nrow, df3.nrow)
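# compareFramesLocal is asserted against above but not defined in this excerpt;
# a minimal sketch, assuming it simply delegates to the pyunit frame comparison
# utility with an exhaustive row sample and a tight tolerance.
from tests import pyunit_utils

def compareFramesLocal(frame1, frame2, tol=1e-10):
    pyunit_utils.compare_frames_local(frame1, frame2, prob=1, tol=tol)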
def h2o_test_setup(sys_args): h2o_py_dir = os.path.realpath( os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")) h2o_docs_dir = os.path.realpath( os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "h2o-docs")) parse_args(sys_args) sys.path.insert(1, h2o_py_dir) import h2o from tests import pyunit_utils, pydemo_utils, pybooklet_utils set_pyunit_pkg_attrs(pyunit_utils) set_pybooklet_pkg_attrs(pybooklet_utils) if _IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_: pass else: raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: " "{0}".format(_TEST_NAME_)) print("[{0}] {1}\n".format( strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_))) h2o.init(ip=_H2O_IP_, port=_H2O_PORT_, strict_version_check=False, force_connect=_FORCE_CONNECT_) h2o.utils.config.H2OConfigReader.get_config( )["general.allow_breaking_changes"] = True #rest_log = os.path.join(_RESULTS_DIR_, "rest.log") #h2o.start_logging(rest_log) #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log)) h2o.log_and_echo( "------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_) h2o.log_and_echo("") h2o.log_and_echo( "------------------------------------------------------------") h2o.remove_all() if _IS_IPYNB_: pydemo_utils.ipy_notebook_exec(_TEST_NAME_) elif _IS_PYUNIT_: pyunit_utils.pyunit_exec(_TEST_NAME_) elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_) elif _IS_PYDEMO_: pydemo_utils.pydemo_exec(_TEST_NAME_)
def javamunge_assembly(): h2o.remove_all() train = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv") test = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv") # lending-club munging assembly print("Import and Parse data") # Add "earliest_cr_line" and "issue_d" and cast as strings to aide Cliff's PR on 7/13 types = {"int_rate": "string", "revol_util": "string", "emp_length": "string", "earliest_cr_line": "string", "issue_d": "string", "last_credit_pull_d": "factor"} data = h2o.import_file(path=train, col_types=types) test = h2o.import_file(path=test, col_types=data.types) ## use the same data types as the training set for the test set test = test[[1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123],:] test = h2o.assign(test,"test") assembly = H2OAssembly( steps=[ # munge int_rate column in place # strip %, trim ws, convert to double ("intrate_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="int_rate", inplace=True, pattern="%", replacement="")), # strip % ("intrate_trim_ws", H2OColOp(op=H2OFrame.trim, col="int_rate", inplace=True)), # trim ws ("intrate_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="int_rate", inplace=True)), # string -> double # munge the revol_util in the same way as the int_rate column ("revol_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="revol_util", inplace=True, pattern="%", replacement="")), # strip % ("revol_trim_ws", H2OColOp(op=H2OFrame.trim, col="revol_util", inplace=True)), # trim ws ("revol_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="revol_util", inplace=True)), # string -> double # munge earliest_cr_line column (mm-YYYY format) # split into Month and Year columns ("earliest_cr_line_split", H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False, new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")), # split on '-' ("earliest_cr_line_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="earliest_cr_line_Year", inplace=True)), # string -> double # munge issue_d column in same way as earliest_cr_line column ("issue_date_split", H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False, new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")), # split on '-' ("issue_d_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="issue_d_Year", inplace=True)), # string -> double # do some munging of the emp_length column ("emp_length_rm_years", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")), # remove " year" and " years", also translate n/a to "" ("emp_length_trim", H2OColOp(op=H2OFrame.trim, col="emp_length", inplace=True)), # trim all the WS off ("emp_length_lt1_point5",H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="< 1", replacement="0.5")), # translate < 1 => 0.5 ("emp_length_10plus", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="10\\+", replacement="10")), # translate 10+ to 10 ("emp_length_as_numeric",H2OColOp(op=H2OFrame.asnumeric, col="emp_length", inplace=True)), # string -> double # compute credit length ("credit_length", H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False, new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year"))) ]) res = assembly.fit(data) pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
def __ml_train(X, extra_crispr_df, y, train_index, test_index): logger.debug("Creating h2o working environment") # ### Start H2O # Start up a 1-node H2O cloud on your local machine, and allow it to use all CPU cores and up to 2GB of memory: h2o.init(max_mem_size="2G") h2o.remove_all() logger.debug("Created h2o working environment successfully") from h2o.estimators import H2ORandomForestEstimator rf_crispr = H2ORandomForestEstimator(model_id="rf_crispr", categorical_encoding="enum", nfolds=5, ntrees=30, stopping_rounds=30, score_each_iteration=True, seed=10) seq_data = X.iloc[:, :config.seq_len] seq_data.columns = ['pos_' + str(i) for i in range(len(seq_data.columns))] pre_h2o_df = pd.concat([seq_data, extra_crispr_df, y], axis=1) h2o_crispr_df_train = h2o.H2OFrame(pre_h2o_df.loc[train_index, :]) h2o_crispr_df_test = h2o.H2OFrame(pre_h2o_df.loc[test_index, :]) logger.debug("Training machine learning model") rf_crispr.train(x=h2o_crispr_df_train.col_names[:-1], y=h2o_crispr_df_train.col_names[-1], training_frame=h2o_crispr_df_train) logger.debug("Trained successfully. Output feature importance") feature_importance = rf_crispr._model_json['output'][ 'variable_importances'].as_data_frame()[['variable', 'percentage']] feature_importance.to_csv(config.feature_importance_path, index=False) logger.debug("Predicting training data") test_prediction_train = rf_crispr.predict(h2o_crispr_df_train[:-1]) performance = spearmanr(test_prediction_train.as_data_frame()['predict'], h2o_crispr_df_train.as_data_frame()['log2fc'])[0] logger.debug( "spearman correlation coefficient for training dataset is: %f" % performance) logger.debug("Predicting test data") test_prediction = rf_crispr.predict(h2o_crispr_df_test[:-1]) performance = spearmanr(test_prediction.as_data_frame()['predict'], h2o_crispr_df_test.as_data_frame()['log2fc'])[0] logger.debug("spearman correlation coefficient for test dataset is: %f" % performance) logger.debug("Saving model") h2o.save_model(rf_crispr, config.ml_model_path) logger.debug("Saved model to disk")
def init(): global sr, fr, share_cols # <hack> regarding output to make h2o work in IDLE class PseudoTTY(object): def __init__(self, underlying): underlying.encoding = 'cp437' self.__underlying = underlying def __getattr__(self, name): return getattr(self.__underlying, name) def isatty(self): return True import sys sys.stdout = PseudoTTY(sys.stdout) # </hack> h2o.init(nthreads=-1, max_mem_size="58G") h2o.remove_all() init() femq12 = pd.read_csv( r"H:\Ashwin\dta\features\All_return_features_sample.csv") # femq12['fold'] = (femq12['TIN_hash_byte']/32).astype(int) fr = h2o.H2OFrame(python_obj=femq12) print('setting factors...') fr = set_return_factors(fr) fr = set_profile_factors(fr) fr = set_match_factors(fr) fr = set_transaction_factors(fr) fr = set_purchasenetwork_factors(fr) fr = set_salenetwork_factors(fr) fr = set_downstream_factors(fr) return fr fr['Missing_SalesDSUnTaxProp'] = fr['Missing_SalesDSUnTaxProp'].asfactor() fr['Missing_SalesDSCreditRatio'] = fr[ 'Missing_SalesDSCreditRatio'].asfactor() fr['Missing_SalesDSVatRatio'] = fr['Missing_SalesDSVatRatio'].asfactor() fr['Missing_MaxSalesProp'] = fr['Missing_MaxSalesProp'].asfactor() fr['Missing_MaxPurchaseProp'] = fr['Missing_MaxPurchaseProp'].asfactor() fr['Missing_PurchaseDSUnTaxProp'] = fr[ 'Missing_PurchaseDSUnTaxProp'].asfactor() fr['Missing_PurchaseDSCreditRatio'] = fr[ 'Missing_PurchaseDSCreditRatio'].asfactor() fr['Missing_PurchaseDSVatRatio'] = fr[ 'Missing_PurchaseDSVatRatio'].asfactor()
def __init__(self, model, X_test, feature_names, max_depth, model_id='surrogate_mojo'): self.model = model self.X_test = np.array(X_test) self.y_test = np.array(model.predict(X_test)) self.feature_names = feature_names self.max_depth = max_depth self.model_id = model_id h2o.init(max_mem_size='2G') # start h2o h2o.remove_all() # remove any existing data structures from h2o memory
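    # The constructor above only stores the data and boots H2O; a hedged sketch of
    # how the surrogate itself might be fit and exported as a MOJO follows.
    # train_surrogate is a hypothetical method name and an assumed continuation of
    # the class, not the original implementation.
    def train_surrogate(self):
        import pandas as pd
        import h2o
        from h2o.estimators.gbm import H2OGradientBoostingEstimator
        # assemble one frame holding the test inputs plus the black-box predictions
        df = pd.DataFrame(self.X_test, columns=self.feature_names)
        df["prediction"] = self.y_test.ravel()
        frame = h2o.H2OFrame(df)
        # a single shallow tree approximates the original model's behavior
        surrogate = H2OGradientBoostingEstimator(ntrees=1, max_depth=self.max_depth,
                                                 model_id=self.model_id)
        surrogate.train(x=self.feature_names, y="prediction", training_frame=frame)
        return surrogate.download_mojo(path=".")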
def javamunge_assembly(): h2o.remove_all() train = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3a.csv") test = pyunit_utils.locate("bigdata/laptop/lending-club/LoanStats3b.csv") # lending-club munging assembly print("Import and Parse data") types = {"int_rate":"String", "revol_util":"String", "emp_length":"String"} data = h2o.import_file(path=train, col_types=types) test = h2o.import_file(path=test, col_types=types) test = test[[1,5,19,23,45,66,99,590,8903,9999,10001,23892,23893,50123],:] test = h2o.assign(test,"test") assembly = H2OAssembly( steps=[ # munge int_rate column in place # strip %, trim ws, convert to double ("intrate_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="int_rate", inplace=True, pattern="%", replacement="")), # strip % ("intrate_trim_ws", H2OColOp(op=H2OFrame.trim, col="int_rate", inplace=True)), # trim ws ("intrate_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="int_rate", inplace=True)), # string -> double # munge the revol_util in the same way as the int_rate column ("revol_rm_junk_char", H2OColOp(op=H2OFrame.gsub, col="revol_util", inplace=True, pattern="%", replacement="")), # strip % ("revol_trim_ws", H2OColOp(op=H2OFrame.trim, col="revol_util", inplace=True)), # trim ws ("revol_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="revol_util", inplace=True)), # string -> double # munge earliest_cr_line column (mm-YYYY format) # split into Month and Year columns ("earliest_cr_line_split", H2OColOp(H2OFrame.strsplit, col="earliest_cr_line", inplace=False, new_col_name=["earliest_cr_line_Month","earliest_cr_line_Year"], pattern="-")), # split on '-' ("earliest_cr_line_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="earliest_cr_line_Year", inplace=True)), # string -> double # munge issue_d column in same way as earliest_cr_line column ("issue_date_split", H2OColOp(op=H2OFrame.strsplit, col="issue_d", inplace=False, new_col_name=["issue_d_Month", "issue_d_Year"], pattern="-")), # split on '-' ("issue_d_Year_as_numeric", H2OColOp(op=H2OFrame.asnumeric, col="issue_d_Year", inplace=True)), # string -> double # do some munging of the emp_length column ("emp_length_rm_years", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")), # remove " year" and " years", also translate n/a to "" ("emp_length_trim", H2OColOp(op=H2OFrame.trim, col="emp_length", inplace=True)), # trim all the WS off ("emp_length_lt1_point5",H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="< 1", replacement="0.5")), # translate < 1 => 0.5 ("emp_length_10plus", H2OColOp(op=H2OFrame.gsub, col="emp_length", inplace=True, pattern="10\\+", replacement="10")), # translate 10+ to 10 ("emp_length_as_numeric",H2OColOp(op=H2OFrame.asnumeric, col="emp_length", inplace=True)), # string -> double # compute credit length ("credit_length", H2OBinaryOp(op=H2OAssembly.minus, col="issue_d_Year",inplace=False, new_col_name="longest_credit_length",right=H2OCol("earliest_cr_line_Year"))) ]) res = assembly.fit(data) pyunit_utils.javamunge(assembly, "AssemblyMungingDemoPojo", test)
def setup_and_train(param_enabled=None): h2o.remove_all() target, train, _, _ = prepare_data() state = 'enabled' if param_enabled is True else 'disabled' if param_enabled is False else 'default' if param_enabled is None: aml = H2OAutoML(project_name='keep_cross_validation_predictions_'+state, nfolds=nfolds, max_models=3, seed=1) else: aml = H2OAutoML(project_name='keep_cross_validation_predictions_'+state, nfolds=nfolds, max_models=8, seed=1, keep_cross_validation_predictions=param_enabled) aml.train(y=target, training_frame=train) # print(aml.leaderboard) return aml
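# prepare_data and nfolds are defined elsewhere in the original test; a minimal
# sketch is given below under the assumption that the test uses a small binary
# classification set and returns (target, train, valid, test). The dataset and
# the nfolds value are illustrative assumptions.
nfolds = 3  # assumed value

def prepare_data():
    df = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    target = "CAPSULE"
    df[target] = df[target].asfactor()
    train, valid, test = df.split_frame(ratios=[0.8, 0.1], seed=1)
    return target, train, valid, test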
def run_test(sys_args, test_to_run): global _IPYNB_ parse_args(sys_args) h2o.init(ip=_H2O_IP_, port=_H2O_PORT_, strict_version_check=False) h2o.log_and_echo("------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("STARTING TEST: "+str(h2o.ou())) h2o.log_and_echo("") h2o.log_and_echo("------------------------------------------------------------") # num_keys = h2o.store_size() try: if _IPYNB_: utils.ipy_notebook_exec(_IPYNB_, save_and_norun=False) else: test_to_run() finally: h2o.remove_all()
def h2o_test_setup(sys_args): h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..")) h2o_docs_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","h2o-docs")) parse_args(sys_args) sys.path.insert(1, h2o_py_dir) import h2o from tests import pyunit_utils, pydemo_utils, pybooklet_utils for pkg in (pyunit_utils, pybooklet_utils): setattr(pkg, '__on_hadoop__', _ON_HADOOP_) setattr(pkg, '__hadoop_namenode__', _HADOOP_NAMENODE_) setattr(pkg, '__test_name__', _TEST_NAME_) setattr(pkg, '__results_dir__', _RESULTS_DIR_) if _IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_ or _IS_PYDEMO_: pass else: raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: " "{0}".format(_TEST_NAME_)) print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_))) auth = None if _LDAP_USER_NAME_ is not None and _LDAP_PASSWORD_ is not None: auth = (_LDAP_USER_NAME_, _LDAP_PASSWORD_) elif _KERB_PRINCIPAL_ is not None: from h2o.auth import SpnegoAuth auth = SpnegoAuth(service_principal=_KERB_PRINCIPAL_) h2o.connect(ip=_H2O_IP_, port=_H2O_PORT_, verbose=False, auth=auth, **_H2O_EXTRA_CONNECT_ARGS_) h2o.utils.config.H2OConfigReader.get_config()["general.allow_breaking_changes"] = True #rest_log = os.path.join(_RESULTS_DIR_, "rest.log") #h2o.start_logging(rest_log) #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log)) h2o.log_and_echo("------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_) h2o.log_and_echo("") h2o.log_and_echo("------------------------------------------------------------") h2o.remove_all() if _IS_IPYNB_: pydemo_utils.ipy_notebook_exec(_TEST_NAME_) elif _IS_PYUNIT_: pyunit_utils.pyunit_exec(_TEST_NAME_) elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_) elif _IS_PYDEMO_: pydemo_utils.pydemo_exec(_TEST_NAME_)
def sort(): df = h2o.create_frame(rows=10, cols=3, factors=10, categorical_fraction=1.0/3, time_fraction=1.0/3, real_fraction=1.0/3, real_range=100, missing_fraction=0.0, seed=123) df1 = df.sort("C1") assert df1[0,0] == 433225652950 # 1983-09-24 04:27:32 assert df1[9,0] == 1532907020199 # 2018-07-29 23:30:20 df2 = df.sort("C2") assert df2[0,1] == "c1.l1" assert df2[9,1] == "c1.l9" h2o.remove_all()
def start_h2o(self, thread_count=-1, gb_ram_count=26): """Initializes a connection to H2O instance and clear it out, if needed. :param thread_count: the number of threads that H2O may use or -1 if all available (Default value = -1) :param gb_ram_count: the number of gigabytes of RAM that H2O may use (Default value = 26) """ h2o.init(nthreads=thread_count, max_mem_size=gb_ram_count) # clear out cluster return(h2o.remove_all())
def run_test(sys_args, test_to_run): # import pkg_resources # ver = pkg_resources.get_distribution("h2o").version # print "H2O PYTHON PACKAGE VERSION: " + str(ver) ip, port = sys_args[2].split(":") h2o.init(ip,port,strict_version_check=False) h2o.log_and_echo("------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("STARTING TEST: "+str(h2o.ou())) h2o.log_and_echo("") h2o.log_and_echo("------------------------------------------------------------") num_keys = h2o.store_size() try: if len(sys_args) > 3 and sys_args[3] == "--ipynb": utils.ipy_notebook_exec(sys_args[4],save_and_norun=False) else: test_to_run(ip, port) finally: h2o.remove_all() if h2o.keys_leaked(num_keys): print("Leaked Keys!")
def deepwater_lenet(): if not H2ODeepWaterEstimator.available(): return frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv")) print(frame.head(5)) model = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-3, network='lenet', score_interval=0, train_samples_per_iteration=1000) model.train(x=[0],y=1, training_frame=frame) extracted = model.deepfeatures(frame, "pooling1_output") #print(extracted.describe()) print(extracted.ncols) assert extracted.ncols == 800, "extracted frame doesn't have 800 columns" extracted = model.deepfeatures(frame, "activation2_output") #print(extracted.describe()) print(extracted.ncols) assert extracted.ncols == 500, "extracted frame doesn't have 500 columns" h2o.remove_all()
def glrm_mojo(): h2o.remove_all() NTESTROWS = 200 # number of test dataset rows df = pyunit_utils.random_dataset("regression", seed=1234) # generate random dataset train = df[NTESTROWS:, :] test = df[:NTESTROWS, :] x = df.names transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"] transformN = transform_types[randint(0, len(transform_types)-1)] # build a GLRM model with random dataset generated earlier glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234) glrmModel.train(x=x, training_frame=train) glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name']) assert glrmTrainFactor.nrows==train.nrows, \ "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows) save_GLRM_mojo(glrmModel) # save mojo model MOJONAME = pyunit_utils.getMojoName(glrmModel._id) TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME)) h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True) # save mojo predict h2o.save_model(glrmModel, TMPDIR) # save GLRM model glrmModel2 = h2o.load_model(os.path.join(TMPDIR,MOJONAME)) predict_model = glrmModel2.predict(test) for col in range(pred_h2o.ncols): if pred_h2o[col].isfactor(): pred_h2o[col] = pred_h2o[col].asnumeric() predict_model[col] = predict_model[col].asnumeric() print("Comparing mojo predict and h2o predict...") pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10) print("Comparing mojo predict and h2o predict from saved model...") pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10) frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False) # save mojo XFactor glrmTestFactor = h2o.get_frame("GLRMLoading_"+frameID) # store the x Factor for new test dataset print("Comparing mojo x Factor and model x Factor ...") pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def h2o_test_setup(sys_args): h2o_py_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..")) h2o_docs_dir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"..","..","h2o-docs")) parse_args(sys_args) sys.path.insert(1, h2o_py_dir) import h2o from tests import pyunit_utils, pydemo_utils, pybooklet_utils set_pyunit_pkg_attrs(pyunit_utils) set_pybooklet_pkg_attrs(pybooklet_utils) if _IS_PYUNIT_ or _IS_IPYNB_ or _IS_PYBOOKLET_: pass elif _IS_PYDEMO_: raise NotImplementedError("pydemos are not supported at this time") else: raise EnvironmentError("Unrecognized test type. Must be of type ipynb, pydemo, pyunit, or pybooklet, but got: " "{0}".format(_TEST_NAME_)) print("[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Connect to h2o on IP: {0} PORT: {1}".format(_H2O_IP_, _H2O_PORT_))) h2o.init(ip=_H2O_IP_, port=_H2O_PORT_, strict_version_check=False) #rest_log = os.path.join(_RESULTS_DIR_, "rest.log") #h2o.start_logging(rest_log) #print "[{0}] {1}\n".format(strftime("%Y-%m-%d %H:%M:%S", gmtime()), "Started rest logging in: {0}".format(rest_log)) h2o.log_and_echo("------------------------------------------------------------") h2o.log_and_echo("") h2o.log_and_echo("STARTING TEST: " + _TEST_NAME_) h2o.log_and_echo("") h2o.log_and_echo("------------------------------------------------------------") h2o.remove_all() if _IS_IPYNB_: pydemo_utils.ipy_notebook_exec(_TEST_NAME_) elif _IS_PYUNIT_: pyunit_utils.pyunit_exec(_TEST_NAME_) elif _IS_PYBOOKLET_: pybooklet_utils.pybooklet_exec(_TEST_NAME_)
def glm_binomial_mojo_pojo(): h2o.remove_all() NTESTROWS = 200 # number of test dataset rows PROBLEM="binomial" params = set_params() # set GLM model parameters df = pyunit_utils.random_dataset(PROBLEM) # generate random dataset train = df[NTESTROWS:, :] test = df[:NTESTROWS, :] x = list(set(df.names) - {"response"}) glmBinomialModel = pyunit_utils.build_save_model_GLM(params, x, train, "response") # build and save mojo model MOJONAME = pyunit_utils.getMojoName(glmBinomialModel._id) TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME)) h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv')) # save test file, h2o predict/mojo use same file pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glmBinomialModel, TMPDIR, MOJONAME) # load model and perform predict h2o.download_csv(pred_h2o, os.path.join(TMPDIR, "h2oPred.csv")) pred_pojo = pyunit_utils.pojo_predict(glmBinomialModel, TMPDIR, MOJONAME) print("Comparing mojo predict and h2o predict...") pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 0.1, tol=1e-10) # make sure operation sequence is preserved from Tomk h2o.save_model(glmBinomialModel, path=TMPDIR, force=True) # save model for debugging print("Comparing pojo predict and h2o predict...") pyunit_utils.compare_frames_local(pred_mojo, pred_pojo, 0.1, tol=1e-10)
def pca_wideDataset_rotterdam():
    h2o.remove_all()
    print("Importing Rotterdam.csv data...")
    rotterdamH2O = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/jira/rotterdam.csv.zip"))
    y = set(["relapse"])
    x = list(set(rotterdamH2O.names) - y)

    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]
    print("transform used on dataset is {0}.\n".format(transformN))
    buildModel = [False, False, False]
    buildModel[randint(0, len(buildModel)-1)] = True

    expNum = 0
    if buildModel[expNum]:
        # special test with GLRM.  Need use_all_factor_levels to be true
        print("------ Testing GLRM PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345, use_all_factor_levels=True)
        gramSVD.train(x=x, training_frame=rotterdamH2O)

        glrmPCA = H2OGeneralizedLowRankEstimator(k=8, transform=transformN, seed=12345, init="Random",
                                                 max_iterations=10, recover_svd=True,
                                                 regularization_x="None", regularization_y="None")
        glrmPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@ Comparing eigenvalues between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 glrmPCA._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                                 tolerance=1, check_all=False)

        # compare singular vectors
        print("@@@@@@ Comparing eigenvectors between GramSVD and GLRM...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 glrmPCA._model_json["output"]["eigenvectors"],
                                                 glrmPCA._model_json["output"]["names"],
                                                 tolerance=1e-6, check_sign=True, check_all=False)
        h2o.remove(gramSVD)
        h2o.remove(glrmPCA)

    expNum = expNum + 1
    if buildModel[expNum]:
        print("------ Testing Power PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        powerPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Power", seed=12345)  # power
        powerPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@ Comparing eigenvalues between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 powerPCA._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                                 tolerance=1e-6, check_all=False)

        # compare singular vectors
        print("@@@@@@ Comparing eigenvectors between GramSVD and Power...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 powerPCA._model_json["output"]["eigenvectors"],
                                                 powerPCA._model_json["output"]["names"],
                                                 tolerance=1e-6, check_sign=True, check_all=False)

    expNum = expNum + 1
    if buildModel[expNum]:
        print("------ Testing Randomized PCA --------")
        gramSVD = H2OPCA(k=8, impute_missing=True, transform=transformN, seed=12345)
        gramSVD.train(x=x, training_frame=rotterdamH2O)
        randomizedPCA = H2OPCA(k=8, impute_missing=True, transform=transformN, pca_method="Randomized",
                               seed=12345, max_iterations=5)  # randomized
        randomizedPCA.train(x=x, training_frame=rotterdamH2O)

        # compare singular values and stuff with GramSVD
        print("@@@@@@ Comparing eigenvalues between GramSVD and Randomized...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["importance"],
                                                 randomizedPCA._model_json["output"]["importance"],
                                                 ["Standard deviation", "Cumulative Proportion", "Cumulative Proportion"],
                                                 tolerance=1e-1, check_all=False)

        # compare singular vectors
        print("@@@@@@ Comparing eigenvectors between GramSVD and Randomized...\n")
        pyunit_utils.assert_H2OTwoDimTable_equal(gramSVD._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["eigenvectors"],
                                                 randomizedPCA._model_json["output"]["names"],
                                                 tolerance=1e-6, check_sign=True, check_all=False)
    h2o.remove_all()
def setup_dataset():
    h2o.remove_all()
    train = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    return train
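# Illustrative usage of setup_dataset() -- a minimal, hypothetical sketch that is
# not part of the original tests; the estimator and column choices are assumptions.
def example_gbm_on_iris():
    train = setup_dataset()
    gbm = H2OGradientBoostingEstimator(ntrees=5, seed=42)
    gbm.train(x=list(range(4)), y=4, training_frame=train)   # first four columns predict the class
    print(gbm.model_performance(train))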
# coding: utf-8

# In[ ]:

import h2o
import numpy as np
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
import os


# In[ ]:

h2o.init()
h2o.remove_all()   # Clean slate - just in case the cluster was already running


# In[ ]:

from h2o.h2o import _locate   # private function. used to find files within h2o git project directory.

# Import walking gait data
gait = h2o.import_file(path=os.path.realpath("../data/subject01_walk1.csv"))
gait.describe()


# In[ ]:

# Plot first row of data on x- vs. y-coordinate features
gait_row = gait[1,:].drop("Time")
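# In[ ]:

# The cell below is NOT part of the original notebook. It is a hypothetical sketch
# of how a GLRM could be fit to the gait frame imported above; the rank, loss, and
# regularization choices are illustrative assumptions only.
glrm_sketch = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic",
                                             regularization_x="None", regularization_y="None",
                                             max_iterations=100)
glrm_sketch.train(x=gait.names, training_frame=gait)
glrm_sketch.show()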
def s3timings(ip, port):
    t = time.time()

    # connect to cluster
    h2o.init(ip, port)

    # defining timers
    air_run = timeit.Timer(stmt='h2o.import_frame("s3n://h2o-airlines-unpacked/allyears.1987.2013.csv")',
                           setup='import h2o')
    bigx_run = timeit.Timer(stmt='h2o.import_frame("s3://h2o-public-test-data/bigdata/server/flow-tests/BigCross.data")',
                            setup='import h2o')
    higg_run = timeit.Timer(stmt='h2o.import_frame("s3://h2o-public-test-data/bigdata/server/HIGGS.csv")',
                            setup='import h2o')
    citi_run = timeit.Timer(stmt='h2o.import_frame(path = big_citi)', setup='import h2o;\
        big_citi = ["s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-07.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-08.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-09.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-11.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-12.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-01.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-02.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-03.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-04.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-05.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-06.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-07.csv",\
                    "s3://h2o-public-test-data/bigdata/laptop/citibike-nyc/2014-08.csv"]')
    mils_run = timeit.Timer(stmt='h2o.import_frame(path = mill_songs)', setup='import h2o;\
        mill_songs = ["s3://h2o-public-test-data/bigdata/server/milsongs/milsongs-test.csv",\
                      "s3://h2o-public-test-data/bigdata/server/milsongs/milsongs-train.csv"]')
    cup_run = timeit.Timer(stmt='h2o.import_frame(path = cup98)', setup='import h2o;\
        cup98 = ["s3://h2o-public-test-data/bigdata/laptop/usecases/cup98LRN_z.csv",\
                 "s3://h2o-public-test-data/bigdata/laptop/usecases/cup98VAL_z.csv"]')
    mnist_run = timeit.Timer(stmt='h2o.import_frame(path = mnist)', setup='import h2o;\
        mnist = ["s3://h2o-public-test-data/bigdata/laptop/mnist/test.csv.gz",\
                 "s3://h2o-public-test-data/bigdata/laptop/mnist/train.csv.gz"]')
    arc_run = timeit.Timer(stmt='h2o.import_frame(path = arcene)', setup='import h2o;\
        arcene = ["s3://h2o-public-test-data/smalldata/arcene/arcene_test.data",\
                  "s3://h2o-public-test-data/smalldata/arcene/arcene_train.data",\
                  "s3://h2o-public-test-data/smalldata/arcene/arcene_valid.data"]')

    # Running with timers
    air_first = air_run.timeit(number=1)
    bigx_first = bigx_run.timeit(number=1)
    higg_first = higg_run.timeit(number=1)
    citi_first = citi_run.timeit(number=1)
    mils_first = mils_run.timeit(number=1)
    cup_first = cup_run.timeit(number=1)
    mnist_first = mnist_run.timeit(number=1)
    arc_first = arc_run.timeit(number=1)

    # Clear kvstore and run again
    s = time.time()
    h2o.remove_all()
    print("Elapsed Time for RemoveAll: " + str(time.time() - s) + " (s).")

    air_second = air_run.timeit(number=1)
    bigx_second = bigx_run.timeit(number=1)
    higg_second = higg_run.timeit(number=1)
    citi_second = citi_run.timeit(number=1)
    mils_second = mils_run.timeit(number=1)
    cup_second = cup_run.timeit(number=1)
    mnist_second = mnist_run.timeit(number=1)
    arc_second = arc_run.timeit(number=1)

    print("Airlines: " + str(air_first) + " vs " + str(air_second))
    print("BigCross: " + str(bigx_first) + " vs " + str(bigx_second))
    print("Higgs: " + str(higg_first) + " vs " + str(higg_second))
    print("Citi_bikes: " + str(citi_first) + " vs " + str(citi_second))
    print("Million Songs: " + str(mils_first) + " vs " + str(mils_second))
    print("KDD Cup98: " + str(cup_first) + " vs " + str(cup_second))
    print("Mnist: " + str(mnist_first) + " vs " + str(mnist_second))
    print("Arcene: " + str(arc_first) + " vs " + str(arc_second))

    s = time.time()
    h2o.remove_all()
    print("Elapsed Time for RemoveAll: " + str(time.time() - s) + " (s).")
    print("Exiting scope... Test elapsed time: " + str(time.time() - t) + " (s).")
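# Hypothetical entry point for the timing script above -- the original invocation
# is not shown; the IP and port below are placeholders only.
if __name__ == "__main__":
    s3timings("localhost", 54321)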
# # Load the H2O Python module.

# In[ ]:

import h2o
import os


# ### Start H2O
# Start up a 1-node H2O cloud on your local machine, and allow it to use all CPU cores and up to 2GB of memory:

# In[ ]:

h2o.init(max_mem_size_GB = 2)   # uses all cores by default
h2o.remove_all()                # clean slate, in case the cluster was already running

# To learn more about the h2o package itself, we can use Python's built-in help() function.

# In[ ]:

help(h2o)

# help() can also be used on H2O functions and models. Jupyter's built-in shift-tab functionality works as well.

# In[ ]:

from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
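# In[ ]:

# Not part of the original notebook: a small illustration of calling help() on a
# specific H2O function and on one of the estimator classes imported above.
help(h2o.import_file)
help(H2OGradientBoostingEstimator)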