def test_load_glrm():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()

    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)

    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..", "results"))
    try:
        TMPDIR = pyunit_utils.locate("results")  # find directory path to results folder
    except Exception:
        os.makedirs(TMPDIR)
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)

    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])

    # assert differences between old and new are close; archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps=1e-4, tolerance=1e-10)
    print("glrm model successfully loaded...")
def run_comparison_tests(auto_encoder, act_fun, missing_values_handling, set_all_factor, train, test, x):
    # set deeplearning model parameters
    params = set_params(act_fun, missing_values_handling, set_all_factor, auto_encoder)

    if auto_encoder:
        try:
            # build and save mojo model
            deeplearning_model = build_save_model(params, x, train)
        except Exception as err:
            if "Trying to predict with an unstable model" not in err.args[0]:
                raise Exception('Deeplearning autoencoder model failed to build. Fix it.')
            return
    else:
        # build and save mojo model
        deeplearning_model = build_save_model(params, x, train)

    # save test file; h2o predict and mojo predict use the same file
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))
    # load model and perform predict
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    pred_pojo = pyunit_utils.pojo_predict(deeplearning_model, TMPDIR, MOJONAME)
    # save model for debugging
    h2o.save_model(deeplearning_model, path=TMPDIR, force=True)

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def deeplearning_mojo_pojo():
    h2o.remove_all()
    params = set_params()  # set deeplearning model parameters
    df = random_dataset(PROBLEM)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    try:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file; h2o predict/mojo use same file
        pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
        pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
        h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
        print("Comparing mojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_h2o, pred_mojo, 0.1, tol=1e-10)  # make sure operation sequence is preserved from Tomk
        print("Comparing pojo predict and h2o predict...")
        pyunit_utils.compare_numeric_frames(pred_mojo, pred_pojo, 0.1, tol=1e-10)
    except Exception as ex:
        print("*************** ERROR and type is ")
        print(str(type(ex)))
        print(ex)
        if "AssertionError" in str(type(ex)):  # only care if there is an AssertionError; ignore the others
            sys.exit(1)
def impute_data(method="mean", to_impute=to_impute, predictors=predictors):
    if method == "mean":
        print("Mean imputing missing data for predictors:", to_impute)
        # Find the mean for each time period for each predictor and save them in a
        # matrix with a column of mean values per predictor. On the holdout set, use
        # this table to fill in all missing values based on the time period (row)
        # and the variable (column) of this matrix.
        # If using python module h2o-3.1.0.3131:
        # grouped = data.group_by(["time_period"])
        # gm = [grouped.mean(predictor, na="rm").get_frame() for predictor in to_impute]
        gm = d["time_period"].unique()
        print("Finding means...")
        for predictor in to_impute:
            gm = gm.cbind(d.group_by(["time_period"], {predictor: ["mean", d.names().index(predictor), "rm"]}, order_by=0))
        gm.show()
        print("Saving the imputation means to disk...")
        h2o.download_csv(gm, filename=saving_means_fp)
        # df_py = h2o.as_list(gm)
        # Now that the means are stored for the holdout data, do this a faster way in Java for the training data:
        for predictor in to_impute:
            d.impute(predictor, method='mean', by=['time_period'], inplace=True)
            print("Done imputing", predictor)
        print("Saving the final mean imputed data to disk...")
        h2o.export_file(frame=d, path=saving_meanImputed_fp, force=True)

    if method == "model":
        # Sequentially impute 'newdata', not 'data', so the order of the predictor
        # variables in the loop does not matter; otherwise you would be using
        # increasingly imputed data to make predictions as the loop progresses.
        newdata = d
        # With the training data, build a model for each column and predict its missing
        # data; save the models so they can predict all missing data on the holdout set.
        for predictor in to_impute:
            print("Building model for imputing " + predictor)
            print("Subsetting the data into missing values for predictor and no missing values for predictor")
            na_ind = d[predictor].isna()
            not_na_ind = na_ind != 1.0
            to_train = d[not_na_ind]
            to_predict = d[na_ind]
            these_var = [var for var in predictors if var != predictor]
            trained = h2o.gbm(x=to_train[these_var], y=to_train[[predictor]], ntrees=300, max_depth=6, learn_rate=0.2)
            print("Saving the imputation tree model for " + predictor)
            h2o.save_model(trained, dir=saving_models_fp, name="dl_imputation_model_" + predictor)
            print("Imputing the missing " + predictor + " data by predicting with the model...")
            predicted = trained.predict(to_predict[these_var])
            tofillin = newdata[predictor]
            assert len(predicted) == len(tofillin[na_ind])
            tofillin[na_ind] = predicted  # mutate the column in place
            newdata[predictor] = tofillin
        print("Saving the final model-imputed data to disk...")
        h2o.export_file(frame=d, path=saving_modelImputed_fp, force=True)
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=range(1, 10), y=0, training_frame=h2o_data)  # don't need to train on all features
    hdfs_model_path = os.getenv("MODEL_PATH")
    saved_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_model_path)
    new_model = h2o.load_model(saved_model_path)
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")

    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Splitting data")
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, 0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, 1] - new_test[:, 1]).sum() == 0

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=range(1, 10), y=0, training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)

    print("Loading back model")
    new_model = h2o.load_model(new_model_path)

    print("Running predictions")
    preds = new_model.predict(test)
def _save_internal(self, **kwargs):
    loc = kwargs.pop('location')
    model_loc = kwargs.pop('model_location')

    # first, save the estimator... if it's there
    ends_in_h2o = isinstance(self._final_estimator, H2OEstimator)
    if ends_in_h2o:
        force = kwargs.pop('force', False)
        self.model_loc_ = h2o.save_model(model=self._final_estimator, path=model_loc, force=force)

        # set the _final_estimator to None just for pickling
        self.est_name_ = self.steps[-1][0]

        # let's keep a pointer to the last step, so
        # after the pickling we can reassign it to retain state
        last_step_ = self.steps[-1]
        self.steps[-1] = None

    # now save the rest of things...
    with open(loc, 'wb') as output:
        pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

    # after pickle, we can add the last_step_ back in.
    # this allows re-use/re-predict after saving to disk
    if ends_in_h2o:
        self.steps[-1] = last_step_
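# A possible counterpart to the _save_internal above: unpickle the pipeline and
# reattach the H2O estimator from disk. A minimal sketch only -- the attribute
# names (model_loc_, est_name_) mirror the save path above, while the function
# name and the getattr default are assumptions, not the library's actual API.
def _load_internal_sketch(location):
    import pickle
    import h2o
    with open(location, 'rb') as f:
        pipe = pickle.load(f)  # steps[-1] was set to None before pickling
    model_loc = getattr(pipe, 'model_loc_', None)
    if model_loc is not None:
        est = h2o.load_model(model_loc)         # reload the saved H2O model
        pipe.steps[-1] = (pipe.est_name_, est)  # reattach it as the final step
    return pipe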
def deepwater_checkpoint():
    if not H2ODeepWaterEstimator.available():
        return

    ## build a model
    # frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()
    print(frame.head(5))
    model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-5, stopping_rounds=0,
                                  score_duty_cycle=1, train_samples_per_iteration=-1,
                                  score_interval=0)
    model.train(y=1, training_frame=frame)

    ## save the model
    model_path = h2o.save_model(model)

    ## delete everything - simulate cluster shutdown and restart
    h2o.remove_all()

    ## reimport the model and the frame
    model = h2o.load_model(model_path)
    # frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()

    ## delete the checkpoint file
    os.remove(model_path)

    ## continue training
    model2 = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-5, stopping_rounds=0,
                                   score_duty_cycle=1, train_samples_per_iteration=-1,
                                   score_interval=0, checkpoint=model.model_id)
    model2.train(y=1, training_frame=frame)
    model2.show()
def milsong_checkpoint(ip, port):
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1, max_depth=max_depth1,
                     min_rows=min_rows1, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                     min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                     min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])
def test_hdfs_io():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")

    print("Splitting data")
    for c in ["Month", "DayofMonth", "IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month", "DayofMonth", "Distance"]
    train, test = h2o_data.split_frame(ratios=[0.9])

    print("Exporting file to hdfs")
    h2o.export_file(test[:, ["Year", "DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")

    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, "DayOfWeek"] - new_test[:, "DayOfWeek"]).sum() == 0

    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)

    print("Loading back model")
    new_model = h2o.load_model(new_model_path)

    print("Running predictions")
    preds = new_model.predict(test)
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1, max_depth=max_depth1,
                                          min_rows=min_rows1, distribution=distribution)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2,
                                          min_rows=min_rows2, distribution=distribution,
                                          checkpoint=restored_model.model_id)
    model2.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2,
                                          min_rows=min_rows2, distribution=distribution)
    model3.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)
def _save_internal(self, **kwargs):
    check_is_fitted(self, 'best_estimator_')
    best_estimator = self.best_estimator_
    estimator = self.estimator

    # where we'll save things
    loc = kwargs.pop('location')
    model_loc = kwargs.pop('model_location')

    # need to save the h2o est before anything else. Note that since
    # we verify pre-fit that the _final_estimator is of type H2OEstimator,
    # we can assume nothing has changed internally...
    is_pipe = False
    if isinstance(best_estimator, H2OPipeline):
        self.est_name_ = best_estimator.steps[-1][0]

        # don't need to duplicate--can use for base
        the_h2o_est = best_estimator._final_estimator
        the_base_est = estimator._final_estimator
        is_pipe = True
    else:
        # otherwise it's the H2OEstimator
        the_h2o_est = best_estimator
        the_base_est = estimator

    # get the key that will map to the new H2OEstimator
    self.est_type_ = _get_estimator_string(the_base_est)

    # first, save the best estimator's H2O piece...
    force = kwargs.pop('force', False)
    self.model_loc_ = h2o.save_model(model=the_h2o_est, path=model_loc, force=force)

    # set to none for pickling, and then restore state for scoring
    if is_pipe:
        last_step_ = best_estimator.steps[-1]
        best_estimator.steps[-1] = None
        base_last_step_ = estimator.steps[-1]
        estimator.steps[-1] = None
        self.base_estimator_parms_ = base_last_step_[1]._parms  # it's a tuple...
    else:
        last_step_ = self.best_estimator_
        base_last_step_ = self.estimator
        self.best_estimator_ = None
        self.estimator = None
        self.base_estimator_parms_ = base_last_step_._parms

    # now save the rest of things...
    with open(loc, 'wb') as output:
        pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)

    # restore state for re-use
    if is_pipe:
        best_estimator.steps[-1] = last_step_
        estimator.steps[-1] = base_last_step_
    else:
        self.best_estimator_ = last_step_
        self.estimator = base_last_step_
def train_grid_classifier():
    global train_dataset
    train_dataset = get_train_dataset_path()
    training_data = h2o.import_file(train_dataset)
    test_data = h2o.import_file(train_dataset.replace('train', 'test'))

    for mtries, sample_rate in shuffled(list(itertools.product([1, 2, 3, 5, 6, 7],
                                                               [0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 1.0]))):
        features_string = '_'.join(map(str, ['mtries', mtries, 'sample_rate', sample_rate]))
        model_path = sdir_path(get_classifier_name() + '_v7_' + features_string + '.h2o')
        if path.exists(model_path):
            continue
        print(features_string)
        try:
            classifier = h2o.estimators.H2ORandomForestEstimator(build_tree_one_node=True,
                                                                 mtries=mtries,
                                                                 sample_rate=sample_rate)
            classifier.train(x=training_data.columns[1:], y=training_data.columns[0],
                             training_frame=training_data, validation_frame=test_data)
            h2o.save_model(classifier, model_path)
            print(classifier)
        except Exception:
            print(traceback.format_exc())
            continue
def save_model(model_id, dest_dir='.', mformat='json'):
    model = h2o.get_model(model_id)
    if mformat == 'mojo':
        return model.save_mojo(path=dest_dir)
        # model.download_mojo(path=dest_dir, get_genmodel_jar=True)
    elif mformat == 'binary':
        return h2o.save_model(model, path=dest_dir)
        # return h2o.download_model(model, path=dest_dir)
    else:
        return model.save_model_details(path=dest_dir)
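# Example call for the dispatcher above (the model id and directory are hypothetical):
#
#   saved_path = save_model("GBM_model_python_1626_1", dest_dir="/tmp/models", mformat="binary")
#
# mformat="mojo" writes a MOJO artifact, "binary" a cluster-version-bound binary
# model, and any other value only the model-details JSON.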
def trainmodel():
    h2o.init()
    from h2o.estimators.glm import H2OGeneralizedLinearEstimator as glme
    trainingdf = h2o.import_file(path=abspath('./trainingset.csv'))
    trainingdf["city"] = trainingdf["city"].asfactor()
    trainingdf["country"] = trainingdf["country"].asfactor()
    glm_classifier = glme(family="gaussian")
    glm_classifier.train(x=['amount', 'cost', 'ratio', 'duration', 'city', 'country', 'ontime',
                            'notontime', 'history', 'posvote', 'negvote', 'fees', 'feeratio', 'pastscore'],
                         y='score', training_frame=trainingdf)
    savedir = h2o.save_model(glm_classifier, path=curdir, force=True)
    rename(basename(savedir), "model")
def h2osave_model():
    """
    Python API test: h2o.save_model(model, path=u'', force=False)
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]

    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    model.train(x=X, y=Y, training_frame=training_data)
    try:
        results_dir = pyunit_utils.locate("results")  # find directory path to results folder
        h2o.save_model(model, path=results_dir, force=True)  # save model
        assert os.path.isfile(os.path.join(results_dir, model._id)), "h2o.save_model() command is not working."
    except Exception as e:
        if 'File not found' in e.args[0]:
            print("Directory is not writable. h2o.save_model() command is not tested.")
        else:
            assert False, "h2o.save_model() command is not working."
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1, max_depth=max_depth1,
                                          min_rows=min_rows1, distribution=distribution)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2,
                                          min_rows=min_rows2, distribution=distribution)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)
def milsong_checkpoint(ip, port):
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1, max_depth=max_depth1,
                     min_rows=min_rows1, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                     min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2, max_depth=max_depth2,
                     min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])
def save_model(model, path, model_type=None):
    """
    Save a model.

    :param model: the model to be saved
    :param path: the directory in which to save the model
    :param model_type: 'mojo' to save a MOJO; otherwise a binary model is saved
    :return: saved path (dir + saved filename)
    """
    if model_type == 'mojo':
        return model.save_mojo(path=path, force=True)
    else:
        return h2o.save_model(model, path)
def save(self):
    print("Saving artifacts")
    if "metadata" in self.artifact_config:
        print("SAVING METADATA")
        save_artifact(self.metadata, self.artifact_config["metadata_path"])
    if "model_path" in self.artifact_config:
        model_artifact_name = h2o.save_model(self.automl_pipeline.leader, path=".")
        with open(model_artifact_name, "rb") as fname:
            serialized_model = fname.read()
        write(serialized_model, self.artifact_config["model_path"])
def save(self, path):
    """
    Save the rulefit model.

    :param path: The path to the directory where the models should be saved.

    :examples:

    >>> rulefit = H2ORuleFit()
    >>> training_data = h2o.import_file("smalldata/gbm_test/titanic.csv",
    ...                                 col_types={'pclass': "enum", 'survived': "enum"})
    >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
    >>> rulefit.train(x=x, y="survived", training_frame=training_data)
    >>> rulefit.save(path="/home/user/my_rulefit/")
    """
    # save random forest models
    for rf_model in self.rf_models.values():
        h2o.save_model(rf_model, path=path)

    # save glm model
    h2o.save_model(self.glm, path=path)
    return path
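# A hedged restore counterpart for the save() above. h2o.save_model writes each
# model under its model_id, so the load paths can be rebuilt from the ids; the
# method name and the id attributes (rf_model_ids, glm_id) are illustrative
# assumptions, not part of the original class.
def load(self, path):
    self.rf_models = {name: h2o.load_model(os.path.join(path, model_id))
                      for name, model_id in self.rf_model_ids.items()}  # hypothetical id map
    self.glm = h2o.load_model(os.path.join(path, self.glm_id))          # hypothetical id
    return self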
def RandomForest(estimators=10):
    model = H2ORandomForestEstimator(model_id="rf_steel_plates" + str(random.randint(1, 10000)),
                                     ntrees=200,
                                     stopping_rounds=2,
                                     score_each_iteration=True,
                                     seed=1000000)
    model.train(X_train, Y_train, training_frame=df)
    path = h2o.save_model(model, path=os.getcwd())
    result = model.predict(test[:-8])
    print(model.model_performance(test))
def save(self, dst):
    try:
        import h2o
    except ImportError:
        raise MissingDependencyException(
            "h2o package is required to use H2oModelArtifact"
        )

    h2o_saved_path = h2o.save_model(model=self._model, path=dst, force=True)
    shutil.move(h2o_saved_path, self._model_file_path(dst))
    return
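# The matching load path hands the stored file back to h2o.load_model -- a
# minimal sketch assuming the same _model_file_path layout as save() above,
# and that the artifact exposes a pack() hook as its load counterpart; the
# actual library may differ in details.
def load(self, path):
    try:
        import h2o
    except ImportError:
        raise MissingDependencyException(
            "h2o package is required to use H2oModelArtifact"
        )
    return self.pack(h2o.load_model(self._model_file_path(path)))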
def save_load_model(ip, port):
    # Connect to h2o
    h2o.init(ip, port)

    prostate = h2o.import_frame(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])

    model_path = h2o.save_model(prostate_glm, name="delete_model", force=True)
    the_model = h2o.load_model(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def train_model(config, pt_bin, yaml_file, prefix):
    train_f = definitions.PROCESSING_FOLDER + config + '/ml-dataset/ml_sample_train_' + str(pt_bin) + '.parquet'
    train = h2o.import_file(train_f)

    d_cuts = configyaml.ConfigYaml(yaml_file)

    # Configuration of the GRID Search
    features = d_cuts.values['model_building']['features']
    target = d_cuts.values['model_building']['target']
    parameters = d_cuts.values['model_building']['model_parameters']

    train[target] = train[target] > -1
    train[target] = train[target].asfactor()

    model = H2OXGBoostEstimator(**parameters)
    model.train(features, target, training_frame=train)

    place_to_save = definitions.PROCESSING_FOLDER + config + '/ml-dataset/'
    file_list_saved = list()

    # Save main model
    path_main = h2o.save_model(model, place_to_save, force=True)
    path_main_rename = ''.join([x + '/' for x in path_main.split('/')[:-1]]) + prefix + 'model_pt' + str(pt_bin) + '_main'
    os.rename(path_main, path_main_rename)
    file_list_saved.append(path_main_rename)

    # Save the cross-validation models
    model_list = model.cross_validation_models()
    for model_cv, i in zip(model_list, range(len(model_list))):
        path = h2o.save_model(model_cv, place_to_save, force=True)
        path_new = ''.join([x + '/' for x in path.split('/')[:-1]]) + prefix + 'model_pt' + str(pt_bin) + '_cv' + str(i)
        os.rename(path, path_new)
        file_list_saved.append(path_new)

    return model, model_list, file_list_saved
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types)-1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", MOJONAME))
    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file; h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict

    h2o.save_model(glrmModel, TMPDIR)  # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR, MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()

    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)

    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def create_model(params):
    """Creates model based on parameters.

    Args:
        params (dict): Model parameters.

    Returns:
        None

    """
    # Create and train model
    model = H2ODeepLearningEstimator(**params)
    model.train(x='x', y='y', training_frame=train, validation_frame=val)

    # Run model prediction
    pred_train_val = model.predict(train_val).as_data_frame()
    pred_test = model.predict(test).as_data_frame()

    # Plot real data
    plt.plot(df_train_val['x'], df_train_val['y'], color='orange')
    plt.plot(df_test['x'][:len(df_test) // 2], df_test['y'][:len(df_test) // 2], color='orange')
    plt.plot(df_test['x'][len(df_test) // 2:], df_test['y'][len(df_test) // 2:], color='orange')

    # Plot model predictions
    plt.plot(df_train_val['x'], pred_train_val, color='blue')
    plt.plot(df_test['x'][:len(df_test) // 2], pred_test[:len(pred_test) // 2], color='blue')
    plt.plot(df_test['x'][len(df_test) // 2:], pred_test[len(pred_test) // 2:], color='blue')

    # Get model metrics
    test_rmse = '{:.1f}'.format(model.model_performance(test).rmse())
    train_val_rmse = '{:.1f}'.format(model.model_performance(train_val).rmse())
    samples_per_iteration = int(round(model.train_samples_per_iteration * 1.258 / len(df_train_val)))

    # Save model and plot
    name = '{0}_{1}_{2}_{3}'.format(test_rmse, train_val_rmse, model.activation, samples_per_iteration)
    h2o.save_model(model, os.path.join(save_dir, name))
    plt.savefig('{}.svg'.format(os.path.join(save_dir, name)))

    # Close plot
    plt.close()
def save_load_model():
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])
    model_path = h2o.save_model(prostate_glm, force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def save_load_model(ip, port):
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])
    model_path = h2o.save_model(prostate_glm, force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def save_load_model():
    prostate = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "DCAPS"], y="CAPSULE", training_frame=prostate)

    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)

    the_model = h2o.load_model(model_path)
    assert isinstance(the_model, H2OEstimator), "Expected an H2OEstimator, but got {0}".format(the_model)
def explore_classification(label_var='bogus_online', create_plots=False):
    """
    See also file shekhar_bogus_ml.py for the source of some of this.
    """
    feature_sets = FEATURE_SETS
    rf_models = [H2ORandomForestEstimator(model_id="rf_v{}".format(i),
                                          ntrees=200,
                                          stopping_rounds=2,
                                          score_each_iteration=True,
                                          seed=1000000)
                 for i in range(1, 8)]
    input('train all models - takes an hour! Are you sure? [Ctrl-C to abort]')
    for i in range(len(feature_sets)):
        print("Building Model {}".format(i + 1))
        rf_models[i].train(feature_sets[i], label_var, training_frame=TrainData, validation_frame=ValidData)
    if create_plots:
        legends = ["Return features", "Profile features", "Network features", "1 + 2", "1 + 3", "2 + 3", "1 + 2 + 3"]
        # legends = ["return_features", "ds_features", "return_features+ds_features"]
        plot = compare_models(rf_models, legends,
                              of=r'D:\shekhar_code_github\BogusFirmCatching\Graphs\{label}_comparison_plot_AllCombinations_minusq12_numericmerge_withds.html'.format(label=label_var),
                              title='Comparing All Models, {label}'.format(label=label_var))
        show(plot)
    for i in range(len(rf_models)):
        h2o.save_model(rf_models[i], path=r'D:\shekhar_code_github\BogusFirmCatching\Models\diff_feature_sets\20170523')
    for i in range(7):
        show(analyze_model(rf_models[i],
                           of=r"D:\shekhar_code_github\BogusFirmCatching\Graphs\{}_model{}_v2_numericmerge_withds.html".format(label_var, i + 1),
                           n_rows=30))
    file_name = r'Z:\Predictions_{label}_v2_numericmerge_withDS.csv'.format(label=label_var)
    generate_predictions(rf_models, ValidData, file_name, '{label}_Model'.format(label=label_var))
    predictions = pd.read_csv(file_name)
def runComparisonTests(autoEncoder, problemType):
    params = set_params(autoEncoder)  # set deeplearning model parameters
    df = random_dataset(problemType)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = list(set(df.names) - {"response"})

    if autoEncoder:
        try:
            deeplearningModel = build_save_model(params, x, train)  # build and save mojo model
        except Exception as err:
            if "Trying to predict with an unstable model" not in err.args[0]:
                raise Exception('Deeplearning autoencoder model failed to build. Fix it.')
            return
    else:
        deeplearningModel = build_save_model(params, x, train)  # build and save mojo model

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file; h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(deeplearningModel, TMPDIR, MOJONAME)  # load model and perform predict
    pred_pojo = pyunit_utils.pojo_predict(deeplearningModel, TMPDIR, MOJONAME)
    h2o.save_model(deeplearningModel, path=TMPDIR, force=True)  # save model for debugging
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_h2o, pred_mojo, prob=1, tol=1e-10)
    print("Comparing pojo predict and h2o predict...")
    pyunit_utils.compare_frames_local_onecolumn_NA(pred_mojo, pred_pojo, prob=1, tol=1e-10)
def best_case_model(self, path: str = None) -> str:
    """
    The best model that can be produced for future predictions.

    Args:
        path (str): Optional location to save the model.

    Returns:
        str: Path of the saved model.
    """
    if self._best_case_model is None:
        self.model_shell.calibrate_thresholds(self.data)
        if path is None:
            path = os.path.join(self.dir_path, 'models/')
        self._best_case_model = h2o.save_model(self.model_shell.model, path=path, force=True)
        print(type(self._best_case_model))
    return self._best_case_model
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = H2ORandomForestEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, seed=1234)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2,
                                      checkpoint=restored_model._id, seed=1234)
    model2.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, seed=1234)
    model3.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True),
                                                                              model3.mse(valid=True))
def save_load_model():
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])

    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results"))
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)
    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)

    the_model = h2o.load_model(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def save_model_filename():
    fr = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    # Default filename is model_id; it should be saved in the server working directory
    model_path = h2o.save_model(model)
    assert model_path.endswith(model.model_id), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Default filename is model_id
    tmpdir = tempfile.mkdtemp()
    model_path = h2o.save_model(model, tmpdir)
    assert_equals(os.path.join(tmpdir, model.model_id), model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = h2o.save_model(model, tmpdir, filename="gbm_prostate")
    assert_equals(os.path.join(tmpdir, "gbm_prostate"), model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = h2o.save_model(model, tmpdir, filename="gbm_prostate.model")
    assert_equals(os.path.join(tmpdir, "gbm_prostate.model"), model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename in a nested, not-yet-existing folder
    model_path = h2o.save_model(model, tmpdir, filename=os.path.join("not-existing-folder", "gbm_prostate.model"))
    assert_equals(os.path.join(tmpdir, "not-existing-folder", "gbm_prostate.model"), model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with default path
    model_path = h2o.save_model(model, filename="gbm_prostate_saved.model")
    assert model_path.endswith("gbm_prostate_saved.model"), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)
def isolation_forest_save_and_load():
    print("Isolation Forest Smoke Test")

    train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))

    if_model = H2OIsolationForestEstimator(ntrees=7, seed=12, sample_size=5)
    if_model.train(training_frame=train)

    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(if_model, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)

    reloaded = h2o.load_model(model_path)
    assert isinstance(reloaded, H2OIsolationForestEstimator), \
        "Expected an H2OIsolationForestEstimator, but got {0}".format(reloaded)
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")

    print("Splitting data")
    train, test = h2o_data.split_frame(ratios=[0.1])

    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    print("Training")
    h2o_glm.train(x=range(1, 10), y=0, training_frame=train)  # don't need to train on all features

    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)

    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
def extended_isolation_forest_save_and_load():
    print("Extended Isolation Forest Save Load Test")

    train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/single_blob.csv"))

    eif_model = H2OExtendedIsolationForestEstimator(ntrees=7, seed=12, sample_size=5)
    eif_model.train(training_frame=train)
    anomaly_score = eif_model.predict(train)
    anomaly = anomaly_score['anomaly_score'].as_data_frame(use_pandas=True)["anomaly_score"]

    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(eif_model, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)

    reloaded = h2o.load_model(model_path)
    anomaly_score_reloaded = reloaded.predict(train)
    anomaly_reloaded = anomaly_score_reloaded['anomaly_score'].as_data_frame(use_pandas=True)["anomaly_score"]

    assert isinstance(reloaded, H2OExtendedIsolationForestEstimator), \
        "Expected an H2OExtendedIsolationForestEstimator, but got {0}".format(reloaded)
    for row in [0, 5, 33, 256, 499]:
        assert anomaly[row] == anomaly_reloaded[row], "Output is not the same after reload"
def main():
    h2o.init()

    # df = h2o.import_file(path="smalldata/logreg/prostate.csv")
    prostate = h2o.load_dataset("prostate")
    prostate.describe()
    train, test = prostate.split_frame(ratios=[0.70])
    train["CAPSULE"] = train["CAPSULE"].asfactor()
    test["CAPSULE"] = test["CAPSULE"].asfactor()

    # Train model
    from h2o.estimators import H2OGeneralizedLinearEstimator
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE", training_frame=train)
    prostate_glm.show()

    predictions = prostate_glm.predict(test)
    predictions.show()

    performance = prostate_glm.model_performance(test)
    performance.show()

    # Export model
    model_path = h2o.save_model(prostate_glm, path="./h2o_model", force=True)
    print(model_path)

    model = prostate_glm
    predictions = model.predict(test)
    predictions.show()

    performance = model.model_performance(test)
    performance.show()

    # Export test data
    df = test.as_data_frame()
    with open("data.json", "w") as f:
        # json.dump(df.to_json(orient='records'), f)
        # json.dump(df.to_json(orient='columns'), f)
        json.dump(df.to_json(orient='index'), f)
def train_h2o(X: pd.DataFrame, y: pd.Series, config: Config):
    h2o.init()

    X["target"] = y
    train = h2o.H2OFrame(X)

    train_x = train.columns
    train_y = "target"
    train_x.remove(train_y)

    if config["mode"] == "classification":
        train[train_y] = train[train_y].asfactor()

    aml = H2OAutoML(max_runtime_secs=60)
    aml.train(x=train_x, y=train_y, training_frame=train)

    config["model_h2o"] = h2o.save_model(model=aml.leader, path=config.model_dir + "/h2o.model", force=True)
    print(aml.leaderboard)

    X.drop("target", axis=1, inplace=True)
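# A minimal prediction counterpart for train_h2o above, reloading the leader
# from the path stored in config["model_h2o"]. The function name and the use of
# H2O's default "predict" output column are assumptions, not from the source.
def predict_h2o(X: pd.DataFrame, config: Config) -> np.ndarray:
    model = h2o.load_model(config["model_h2o"])             # reload saved AutoML leader
    preds = model.predict(h2o.H2OFrame(X)).as_data_frame()  # score and pull back to pandas
    return preds["predict"].values                          # "predict" is H2O's default output column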
def save_model(h2o_model, path, conda_env=None, mlflow_model=Model(), settings=None):
    """
    Save an H2O model to a path on the local file system.

    :param h2o_model: H2O model to be saved.
    :param path: Local path where the model is to be saved.
    :param mlflow_model: MLflow model config this flavor is being added to.
    """
    import h2o

    path = os.path.abspath(path)
    if os.path.exists(path):
        raise Exception("Path '{}' already exists".format(path))
    model_dir = os.path.join(path, "model.h2o")
    os.makedirs(model_dir)

    # Save h2o-model
    h2o_save_location = h2o.save_model(model=h2o_model, path=model_dir, force=True)
    model_file = os.path.basename(h2o_save_location)

    # Save h2o-settings
    if settings is None:
        settings = {}
    settings['full_file'] = h2o_save_location
    settings['model_file'] = model_file
    settings['model_dir'] = model_dir
    with open(os.path.join(model_dir, "h2o.yaml"), 'w') as settings_file:
        yaml.safe_dump(settings, stream=settings_file)

    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.h2o", data="model.h2o", env=conda_env)
    mlflow_model.add_flavor("h2o", saved_model=model_file, h2o_version=h2o.__version__)
    mlflow_model.save(os.path.join(path, "MLmodel"))
def save_model(h2o_model, path, conda_env=None, mlflow_model=Model(), settings=None):
    """
    Save an H2O model to a path on the local file system.

    :param h2o_model: H2O model to be saved.
    :param path: Local path where the model is to be saved.
    :param conda_env: Path to a Conda environment file. If provided, this describes the
                      environment this model should be run in. At minimum, it should specify
                      the dependencies contained in ``mlflow.h2o.DEFAULT_CONDA_ENV``. If
                      `None`, the default ``mlflow.h2o.DEFAULT_CONDA_ENV`` environment will
                      be added to the model.
    :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to.
    """
    path = os.path.abspath(path)
    if os.path.exists(path):
        raise Exception("Path '{}' already exists".format(path))
    model_data_subpath = "model.h2o"
    model_data_path = os.path.join(path, model_data_subpath)
    os.makedirs(model_data_path)

    # Save h2o-model
    h2o_save_location = h2o.save_model(model=h2o_model, path=model_data_path, force=True)
    model_file = os.path.basename(h2o_save_location)

    # Save h2o-settings
    if settings is None:
        settings = {}
    settings['full_file'] = h2o_save_location
    settings['model_file'] = model_file
    settings['model_dir'] = model_data_path
    with open(os.path.join(model_data_path, "h2o.yaml"), 'w') as settings_file:
        yaml.safe_dump(settings, stream=settings_file)

    conda_env_subpath = "conda.yaml"
    if conda_env:
        shutil.copyfile(conda_env, os.path.join(path, conda_env_subpath))
    else:
        with open(os.path.join(path, conda_env_subpath), "w") as f:
            yaml.safe_dump(DEFAULT_CONDA_ENV, stream=f, default_flow_style=False)

    pyfunc.add_to_model(mlflow_model, loader_module="mlflow.h2o", data=model_data_subpath, env=conda_env_subpath)
    mlflow_model.add_flavor(FLAVOR_NAME, h2o_version=h2o.__version__, data=model_data_subpath)
    mlflow_model.save(os.path.join(path, "MLmodel"))
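# Sketch of the matching load path, assuming the h2o.yaml layout written by the
# save_model variants above; mlflow.h2o's actual loader may differ in details,
# and the function name here is an assumption.
def _load_model_sketch(model_data_path):
    import h2o
    with open(os.path.join(model_data_path, "h2o.yaml")) as f:
        params = yaml.safe_load(f)
    # "model_file" is the basename recorded at save time; rejoin it with the
    # data directory to get the loadable path.
    return h2o.load_model(os.path.join(model_data_path, params["model_file"]))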
def log_artifacts(grid_model, script, coeff=False):
    # Check whether the jar file exists; if it does, delete it, since it will
    # already be backed up in the mlruns folder.
    if os.path.isfile(os.getcwd() + "/h2o-genmodel.jar"):
        os.remove(os.getcwd() + "/h2o-genmodel.jar")
    if os.path.isfile(os.getcwd() + "/var_imp.png"):
        os.remove(os.getcwd() + "/var_imp.png")
    if os.path.isfile(os.getcwd() + "/var_imp.csv"):
        os.remove(os.getcwd() + "/var_imp.csv")

    model_python = h2o.save_model(model=grid_model, path=os.getcwd(), force=True)
    mlflow.log_artifact(model_python)

    mojo_path = grid_model.download_mojo(path=os.getcwd(), get_genmodel_jar=True)
    grid_model.varimp_plot(server=True)
    plt.savefig('var_imp.png', bbox_inches='tight')
    var_imps = pd.DataFrame(grid_model.varimp(),
                            columns=['name', 'relative_importance', 'scaled_importance', 'percentage'])
    var_imps.to_csv("var_imp.csv", index=False)

    # Log mojo file and jar file
    mlflow.log_artifact(mojo_path)
    mlflow.log_artifact(os.getcwd() + "/h2o-genmodel.jar")
    mlflow.log_artifact(os.getcwd() + "/var_imp.png")
    mlflow.log_artifact(os.getcwd() + "/var_imp.csv")
    mlflow.log_artifact(os.getcwd() + "/" + script)

    if coeff:
        if os.path.isfile(os.getcwd() + "/coefficients-{}.csv".format(datetime.utcnow().date())):
            os.remove(os.getcwd() + "/coefficients-{}.csv".format(datetime.utcnow().date()))
        grid_model._model_json['output']['coefficients_table'].as_data_frame().to_csv(
            "coefficients-{}.csv".format(datetime.utcnow().date()))
        mlflow.log_artifact(os.getcwd() + "/coefficients-{}.csv".format(datetime.utcnow().date()))

    return "Finished logging artifacts"
def run_auto_ml(_df_train, _max_runtime_secs=3600, _nfolds=10, _stopping_metric='mse',
                _sort_metric='mae', _exclude_algos=['DeepLearning']):
    """
    :param _df_train: training DataFrame with all features
    :param _max_runtime_secs: int, number of seconds AutoML is allowed to train
    :param _nfolds: int, number of cross-validation folds
    :param _stopping_metric: stopping metric for the algorithms
    :param _sort_metric: sort metric for the leaderboard
    :param _exclude_algos: excluded algorithms, by default DeepLearning
    """
    print('>>>>>>>>>>>>>> Preparing model and data for model: --{0}min--'.format(_max_runtime_secs / 60))
    hf_train = h2o.H2OFrame(_df_train)
    aml = H2OAutoML(max_runtime_secs=_max_runtime_secs, nfolds=_nfolds,
                    exclude_algos=_exclude_algos, sort_metric=_sort_metric)

    list_train_columns = list(_df_train.columns)
    list_train_columns.remove(features.PRICE)

    print('>>>>>>>>>>>>>> All set, starting training')
    aml.train(x=list_train_columns, y=features.PRICE, training_frame=hf_train)
    print('>>>>>>>>>>>>>> Finished training')

    saved_path = h2o.save_model(model=aml.leader, path=paths.DIR_MODELS, force=True)
    print('>>>>>>>>>>>>>> Model saved on path: {}'.format(saved_path))
    print('>>>>>>>>>>>>>> Models saved')
    print('\n\n')
    print(aml.leaderboard.head())
def save(self):
    import h2o

    model_folder_path = os.path.join(SAVED_MODELS_PATH, self.model_id)
    metadata_path = os.path.join(model_folder_path, 'metadata.json')

    if not os.path.exists(metadata_path):
        os.makedirs(model_folder_path)

    model_path = h2o.save_model(self.model, path=model_folder_path, force=True)

    with open(metadata_path, 'w') as metadata_file:
        json.dump({
            'system': self.system,
            'model_id': self.model_id,
            'search_id': self.search_id,
            'model_filename': os.path.basename(model_path),
            'predictors': self.predictors,
            'targets': self.targets,
            'train_specification': self.train_specification,
            'task': self.task
        }, metadata_file)
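# A matching restore sketch for the save() above: read metadata.json to locate
# the model file and hand it to h2o.load_model. The field names follow the
# dictionary written above; the function name itself is an assumption.
def load_saved_model(model_id):
    import h2o
    model_folder_path = os.path.join(SAVED_MODELS_PATH, model_id)
    with open(os.path.join(model_folder_path, 'metadata.json')) as metadata_file:
        metadata = json.load(metadata_file)
    return h2o.load_model(os.path.join(model_folder_path, metadata['model_filename']))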
def milsong_checkpoint():
    milsong_train = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                               max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    # save the model, then load the model
    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "results"))
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0], seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True),
                                                                              model3.mse(valid=True))
def fit_transform(self, X: dt.Frame, y: np.array = None):
    h2o.init()
    model = H2OAutoEncoderEstimator(activation='tanh',
                                    epochs=1,
                                    hidden=[50, 50],
                                    reproducible=True,
                                    seed=1234)
    frame = h2o.H2OFrame(X.to_pandas())
    model_path = None
    try:
        model.train(x=list(range(X.ncols)), training_frame=frame, model_id=self.id)
        model_path = h2o.save_model(model=model)
        with open(model_path, "rb") as f:
            self.raw_model_bytes = f.read()
        return model.anomaly(frame).as_data_frame(header=False)
    finally:
        if model_path is not None:
            os.remove(model_path)
        h2o.remove(self.id)
def test_hdfs_io(): ''' Test H2O read and write to hdfs ''' hdfs_name_node = os.getenv("NAME_NODE") print("Importing hdfs data") h2o_data = h2o.import_file("maprfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv") print("Splitting data") for c in ["Month", "DayofMonth", "IsArrDelayed"]: h2o_data[c] = h2o_data[c].asfactor() myX = ["Month", "DayofMonth", "Distance"] train, test = h2o_data.split_frame(ratios=[0.9]) print("Exporting file to hdfs") h2o.export_file(test[:, ["Year", "DayOfWeek"]], "maprfs://" + hdfs_name_node + "/datasets/exported.csv") print("Reading the file back in and checking that the data is the same") new_test = h2o.import_file("maprfs://" + hdfs_name_node + "/datasets/exported.csv") assert ((test[:, "DayOfWeek"] - new_test[:, "DayOfWeek"]).sum() == 0) print("Training") h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01) h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train) # don't need to train on all features hdfs_model_path = os.getenv("MODEL_PATH") print("Saving model") # save_model does not appear to understand the maprfs:// scheme, so use hdfs:// instead new_model_path = h2o.save_model( h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path) print("Loading back model") new_model = h2o.load_model(new_model_path) print("Running predictions") preds = new_model.predict(test)
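# If maprfs:// really is unsupported for model saving, a local fallback is possible;
# a sketch, assuming an h2o-py release that has h2o.download_model / h2o.upload_model
# (added around 3.30), untested against MapR.
local_path = h2o.download_model(h2o_glm, path="/tmp")  # binary model on the client disk
restored_glm = h2o.upload_model(local_path)            # push it back into the cluster
preds_local = restored_glm.predict(test)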
def fit_transform(self, X: dt.Frame, y: np.array = None): h2o.init(port=config.h2o_recipes_port) model = H2OAutoEncoderEstimator(activation='tanh', epochs=1, hidden=[50, 50], reproducible=True, seed=1234) frame = h2o.H2OFrame(X.to_pandas()) model_path = None try: model.train(x=list(range(X.ncols)), training_frame=frame) self.id = model.model_id model_path = os.path.join(temporary_files_path, "h2o_model." + str(uuid.uuid4())) model_path = h2o.save_model(model=model, path=model_path) with open(model_path, "rb") as f: self.raw_model_bytes = f.read() return model.anomaly(frame).as_data_frame(header=False) finally: if model_path is not None: remove(model_path) h2o.remove(model)
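# Restoring from the captured bytes is not shown above; a minimal sketch, assuming
# the Python client and the H2O node share a filesystem (as in this single-node
# recipe). `restore_path` is a hypothetical name.
restore_path = os.path.join(temporary_files_path, "h2o_model_restore." + str(uuid.uuid4()))
with open(restore_path, "wb") as f:
    f.write(self.raw_model_bytes)  # bytes captured in fit_transform above
restored = h2o.load_model(restore_path)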
def milsong_checkpoint(): milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz")) milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz")) distribution = "gaussian" # build first model ntrees1 = random.sample(list(range(50,100)),1)[0] max_depth1 = random.sample(list(range(2,6)),1)[0] min_rows1 = random.sample(list(range(10,16)),1)[0] print("ntrees model 1: {0}".format(ntrees1)) print("max_depth model 1: {0}".format(max_depth1)) print("min_rows model 1: {0}".format(min_rows1)) model1 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1, distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0]) # save the model, then load the model path = pyunit_utils.locate("results") assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path) model_path = h2o.save_model(model1, path=path, force=True) assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path) restored_model = h2o.load_model(model_path) # continue building the model ntrees2 = ntrees1 + 50 max_depth2 = max_depth1 min_rows2 = min_rows1 print("ntrees model 2: {0}".format(ntrees2)) print("max_depth model 2: {0}".format(max_depth2)) print("min_rows model 2: {0}".format(min_rows2)) model2 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0], checkpoint=restored_model.model_id) # build the equivalent of model 2 in one shot model3 = h2o.gbm(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,validation_x=milsong_valid[1:],validation_y=milsong_valid[0])
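# This GBM variant stops after building model3; its random-forest siblings end by
# asserting that the checkpoint-continued model matches the one built in one shot.
# Presumably the same checks belong here (a sketch, not in the original):
assert isinstance(model2, type(model3))
assert model2.mse(valid=True) == model3.mse(valid=True), \
    "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(
        model2.mse(valid=True), model3.mse(valid=True))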
def test_saved_binary_model_produces_same_predictions_as_original(): ds = prepare_data(blending) base_models = train_base_models(ds) se_model = train_stacked_ensemble(ds, base_models) # Predict with the ensemble in the Py client preds_py = se_model.predict(ds.test) tmp_dir = tempfile.mkdtemp() try: bin_file = h2o.save_model(se_model, tmp_dir) # Load the binary model and predict bin_model = h2o.load_model(pu.locate(bin_file)) preds_bin = bin_model.predict(ds.test) finally: shutil.rmtree(tmp_dir) # Predictions from the in-memory model and the reloaded binary model should match pred_diff = preds_bin - preds_py assert pred_diff["p0"].max() < 1e-11 assert pred_diff["p1"].max() < 1e-11 assert pred_diff["p0"].min() > -1e-11 assert pred_diff["p1"].min() > -1e-11
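# Binary models are tied to the H2O version that wrote them; for version-independent
# persistence the usual alternative is a MOJO. A sketch of the same round trip via
# MOJO, assuming an h2o-py release with h2o.upload_mojo (3.30+):
mojo_path = se_model.download_mojo(path=tmp_dir)  # export the ensemble as a MOJO
mojo_model = h2o.upload_mojo(mojo_path)           # import it back as a generic model
preds_mojo = mojo_model.predict(ds.test)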
def milsong_checkpoint(ip,port): milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz")) milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz")) # build first model ntrees1 = random.sample(range(50,100),1)[0] max_depth1 = random.sample(range(2,6),1)[0] min_rows1 = random.sample(range(10,16),1)[0] print("ntrees model 1: {0}".format(ntrees1)) print("max_depth model 1: {0}".format(max_depth1)) print("min_rows model 1: {0}".format(min_rows1)) model1 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees1,max_depth=max_depth1, min_rows=min_rows1, validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234) # save the model, then load the model model_path = h2o.save_model(model1,force=True) restored_model = h2o.load_model(model_path) shutil.rmtree(model_path) # continue building the model ntrees2 = ntrees1 + 50 max_depth2 = max_depth1 min_rows2 = min_rows1 print("ntrees model 2: {0}".format(ntrees2)) print("max_depth model 2: {0}".format(max_depth2)) print("min_rows model 2: {0}".format(min_rows2)) model2 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2, validation_x=milsong_valid[1:],validation_y=milsong_valid[0], checkpoint=restored_model._id,seed=1234) # build the equivalent of model 2 in one shot model3 = h2o.random_forest(x=milsong_train[1:],y=milsong_train[0],ntrees=ntrees2,max_depth=max_depth2, min_rows=min_rows2, validation_x=milsong_valid[1:],validation_y=milsong_valid[0],seed=1234) assert isinstance(model2,type(model3)) assert model2.mse(valid=True)==model3.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(model2.mse(valid=True), model3.mse(valid=True))
def cars_checkpoint(): cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv")) s = cars.runif() train = cars[s > .2] valid = cars[s <= .2] print("\n*** Description (chunk distribution, etc) of training frame:") train.describe() print("\n*** Description (chunk distribution, etc) of validation frame:") valid.describe() # choose the type of model-building exercise. 0: regression, 1: binomial, 2: multinomial problem = random.sample(list(range(3)),1)[0] # pick the predictors and response column, along with the correct distribution predictors = ["displacement","power","weight","acceleration","year"] if problem == 1 : response_col = "economy_20mpg" distribution = "bernoulli" train[response_col] = train[response_col].asfactor() valid[response_col] = valid[response_col].asfactor() elif problem == 2 : response_col = "cylinders" distribution = "multinomial" train[response_col] = train[response_col].asfactor() valid[response_col] = valid[response_col].asfactor() else : response_col = "economy" distribution = "gaussian" print("\n*** Distribution: {0}".format(distribution)) print("\n*** Response column: {0}".format(response_col)) # build first model ntrees1 = 5 max_depth1 = random.sample(list(range(2,6)),1)[0] min_rows1 = random.sample(list(range(10,16)),1)[0] print("\n*** Building model 1 with the following parameters:") print("*** ntrees model 1: {0}".format(ntrees1)) print("*** max_depth model 1: {0}".format(max_depth1)) print("*** min_rows model 1: {0}".format(min_rows1)) from h2o.estimators.gbm import H2OGradientBoostingEstimator model1 = H2OGradientBoostingEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, score_each_iteration=True, distribution=distribution) model1.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid) # save the model, then load the model model_path = h2o.save_model(model1, path="delete_model", force=True) restored_model = h2o.load_model(model_path) shutil.rmtree("delete_model") # continue building the model ntrees2 = ntrees1 + 5 max_depth2 = max_depth1 min_rows2 = min_rows1 print("\n*** Continuing to build model 1 (now called model 2) with the following parameters:") print("*** ntrees model 2: {0}".format(ntrees2)) print("*** max_depth model 2: {0}".format(max_depth2)) print("*** min_rows model 2: {0}".format(min_rows2)) model2 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution, score_each_iteration=True, checkpoint=restored_model._id) model2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid) # continue building the model, but with a different number of trees ntrees3 = ntrees2 + 50 max_depth3 = max_depth1 min_rows3 = min_rows1 print("\n*** Continuing to build model 1 (now called model 3) with the following parameters:") print("*** ntrees model 3: {0}".format(ntrees3)) print("*** max_depth model 3: {0}".format(max_depth3)) print("*** min_rows model 3: {0}".format(min_rows3)) model3 = H2OGradientBoostingEstimator(ntrees=ntrees3, max_depth=max_depth3, min_rows=min_rows3, distribution=distribution, score_each_iteration=True, checkpoint=restored_model._id) model3.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid) # build the equivalent of model 2 in one shot print("\n*** Building the equivalent of model 2 (called model 4) in one shot:") model4 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, distribution=distribution, score_each_iteration=True) model4.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid) print("\n*** Model Summary for model 2:") print(model2.summary()) print("\n*** Model Summary for model 3:") print(model3.summary()) print("\n*** Model Summary for model 4:") print(model4.summary()) print("\n*** Score History for model 2:") print(model2.scoring_history()) print("\n*** Score History for model 3:") print(model3.scoring_history()) print("\n*** Score History for model 4:") print(model4.scoring_history()) # checks if problem == 0: assert isinstance(model2,type(model4)) assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True)) #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True)) elif problem == 1: assert isinstance(model2,type(model4)) assert model2.auc(valid=True)==model4.auc(valid=True), "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True)) #assert model3.auc(valid=True)!=model4.auc(valid=True), "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(model3.auc(valid=True), model4.auc(valid=True)) assert model2.logloss(valid=True)==model4.logloss(valid=True), "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True)) #assert model3.logloss(valid=True)!=model4.logloss(valid=True), "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(model3.logloss(valid=True), model4.logloss(valid=True)) assert model2.giniCoef(valid=True)==model4.giniCoef(valid=True), "Expected Model 2 Gini Coef {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True)) #assert model3.giniCoef(valid=True)!=model4.giniCoef(valid=True), "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(model3.giniCoef(valid=True), model4.giniCoef(valid=True)) else: assert isinstance(model2,type(model4)) assert model2.mse(valid=True)==model4.mse(valid=True), "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True)) #assert model3.mse(valid=True)!=model4.mse(valid=True), "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True)) assert model2.r2(valid=True)==model4.r2(valid=True), "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(model2.r2(valid=True), model4.r2(valid=True))
test_y = test[y].as_data_frame()[0][1:] print("Testing error is %f" % (sum(map(lambda t: t[0] != t[1], zip(test_pred, test_y))) * 1.0 / len(test_y))) # Take a look at the first 10 predictions pred.head() # Show prediction and original test data Y values in list # pred_list = pred.as_data_frame()[0] # test_list = test.as_data_frame()[-1] # the model object cannot be pickled to a file because it wraps instance methods and a live cluster reference # Show top 20 variable importances model.varimp()[:20] model_path = h2o.save_model(model, './mnist_dp_model/') # one can save it to local disk, s3, or hdfs h2o.load_model(model_path) # the model is saved in the folder specified by model_path ############# ############################################################################## # Perform 5-fold cross-validation on training_frame model_cv = H2ODeepLearningEstimator(distribution="multinomial", activation="RectifierWithDropout", hidden=[32,32,32], input_dropout_ratio=0.2, sparse=True, l1=1e-5, epochs=10, nfolds=5)
features = list(range(0,784)) target = 784 train[target] = train[target].asfactor() valid[target] = valid[target].asfactor() # Build model model = H2ODeepWaterEstimator(epochs=20, activation="Rectifier", hidden=[200,200], ignore_const_cols=False, mini_batch_size=256, input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5,0.5], stopping_rounds=3, stopping_tolerance=0.05, stopping_metric="misclassification", score_interval=2, score_duty_cycle=0.5, score_training_samples=1000, score_validation_samples=1000, gpu=True, seed=1234) model.train(x=features, y=target, training_frame=train, validation_frame=valid) # Evaluate model model.show() print(model.scoring_history()) # Checkpoint model model_path = h2o.save_model(model=model, force=True) # Load model model_ckpt = h2o.load_model(model_path) # Start training from checkpoint model_warm = H2ODeepWaterEstimator(checkpoint=model_ckpt.model_id, epochs=100, activation="Rectifier", hidden=[200,200], ignore_const_cols=False, mini_batch_size=256, input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5,0.5], stopping_rounds=3, stopping_tolerance=0.05, stopping_metric="misclassification", score_interval=2, score_duty_cycle=0.5, score_training_samples=1000, score_validation_samples=1000, gpu=True, seed=1234) model_warm.train(x=features, y=target, training_frame=train, validation_frame=valid) # Evaluate checkpointed model model_warm.show() print(model_warm.scoring_history())
h2o.init() data = h2o.import_file(path='featurized_data') y_labels = np.load('labels') y_labels = (y_labels > 5) + 0 # cast the boolean mask to 0/1 integers y_labels = np.reshape(y_labels, (len(y_labels), 1)) data = data.cbind(h2o.H2OFrame(y_labels)) rand = data.runif() train = data[rand < 0.6] valid = data[(rand >= 0.6) & (rand < 0.9)] test = data[rand >= 0.9] y = data.col_names()[-1] x = data.col_names()[:-1] gbm = h2o.gbm(x = x, y = y, training_frame = train, validation_frame = valid, max_depth = 5, ntrees=500, learn_rate=0.2, distribution="bernoulli") print(gbm) r = int(random.random() * len(y_labels)) sample = data[r,:] prediction = gbm.predict(sample) print(prediction) model_path = h2o.save_model(gbm, "model")
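# A reload counterpart for the GBM saved above (a sketch, not in the original);
# `model_path` is the value returned by h2o.save_model().
gbm_restored = h2o.load_model(model_path)
test_preds = gbm_restored.predict(test)  # re-score the held-out rows
print(test_preds.head())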