def test_hdfs_io():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/airlines/airlines_all.05p.csv")
    print("Splitting data")
    for c in ["Month", "DayofMonth", "IsArrDelayed"]:
        h2o_data[c] = h2o_data[c].asfactor()
    myX = ["Month", "DayofMonth", "Distance"]
    train, test = h2o_data.split_frame(ratios=[0.9])
    print("Exporting file to hdfs")
    h2o.export_file(test[:, ["Year", "DayOfWeek"]], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, "DayOfWeek"] - new_test[:, "DayOfWeek"]).sum() == 0
    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=myX, y="IsArrDelayed", training_frame=train)  # don't need to train on all features
    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def test_load_glrm():
    print("Importing iris_wheader.csv data...")
    irisH2O = h2o.upload_file(pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    irisH2O.describe()
    g_model = H2OGeneralizedLowRankEstimator(k=3)
    g_model.train(x=irisH2O.names, training_frame=irisH2O)
    yarch_old = g_model.archetypes()
    x_old = h2o.get_frame(g_model._model_json["output"]["representation_name"])
    predOld = g_model.predict(irisH2O)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "../..", "results"))
    try:
        TMPDIR = pyunit_utils.locate("results")  # find directory path to results folder
    except:
        os.makedirs(TMPDIR)
    h2o.save_model(g_model, path=TMPDIR, force=True)  # save model
    full_path_filename = os.path.join(TMPDIR, g_model._id)
    h2o.remove(g_model)
    model_reloaded = h2o.load_model(full_path_filename)
    pred = model_reloaded.predict(irisH2O)
    yarch = model_reloaded.archetypes()
    x = h2o.get_frame(model_reloaded._model_json["output"]["representation_name"])
    # assert difference between old and new are close, archetypes should be the same
    pyunit_utils.compare_frames_local(x, x_old, tol=1e-6)
    pyunit_utils.compare_frames_local(pred[0], predOld[0], tol=1)
    for k in range(3):
        pyunit_utils.equal_two_arrays(yarch_old[k], yarch[k], eps=1e-4, tolerance=1e-10)
    print("glrm model successfully loaded...")
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")
    print("Splitting data")
    train, test = h2o_data.split_frame(ratios=[0.9])
    print("Exporting file to hdfs")
    h2o.export_file(test[:, 0:2], "hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    print("Reading file back in and comparing if data is the same")
    new_test = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/exported.csv")
    assert (test[:, 1] - new_test[:, 1]).sum() == 0
    print("Training")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=range(1, 10), y=0, training_frame=train)  # don't need to train on all features
    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
    print("Running predictions")
    preds = new_model.predict(test)
def milsong_checkpoint(ip, port):
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                     max_depth=max_depth1, min_rows=min_rows1, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                     max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                     max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])
def __init__(self, model_name, model_base_path, verbose=False):
    """
    Initialize the service.

    Args:
      model_name: The name of the model.
      model_base_path: The file path of the model.

    Return:
      None
    """
    super(H2oInferenceService, self).__init__()
    self.model_name = model_name
    self.model_base_path = model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "H2o"
    self.verbose = verbose

    import h2o
    logging.info("Try to initialize and connect the h2o server")
    h2o.init()
    logging.info("Try to load the h2o model")
    model = h2o.load_model(model_base_path)
    self.model = model
    # TODO: Update the signature with readable string
    self.model_graph_signature = "{}".format(self.model.full_parameters)
def deepwater_checkpoint():
    if not H2ODeepWaterEstimator.available():
        return

    ## build a model
    # frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()
    print(frame.head(5))
    model = H2ODeepWaterEstimator(epochs=50, learning_rate=1e-5, stopping_rounds=0,
                                  score_duty_cycle=1, train_samples_per_iteration=-1,
                                  score_interval=0)
    model.train(y=1, training_frame=frame)

    ## save the model
    model_path = h2o.save_model(model)

    ## delete everything - simulate cluster shutdown and restart
    h2o.remove_all()

    ## reimport the model and the frame
    model = h2o.load_model(model_path)
    # frame = h2o.import_file(pyunit_utils.locate("bigdata/laptop/deepwater/imagenet/cat_dog_mouse.csv"))
    frame = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    frame.drop(0)
    frame[1] = frame[1].asfactor()

    ## delete the checkpoint file
    os.remove(model_path)

    ## continue training
    model2 = H2ODeepWaterEstimator(epochs=100, learning_rate=1e-5, stopping_rounds=0,
                                   score_duty_cycle=1, train_samples_per_iteration=-1,
                                   score_interval=0, checkpoint=model.model_id)
    model2.train(y=1, training_frame=frame)
    model2.show()
def stackedensemble_binary_test():
    # Import a sample binary outcome train/test set into H2O
    train = h2o.import_file(pyunit_utils.locate("smalldata/higgs/higgs_train_10k.csv"))
    test = h2o.import_file(pyunit_utils.locate("smalldata/testng/higgs_test_5k.csv"))

    # Identify predictors and response
    x = train.columns
    y = "response"
    x.remove(y)

    # For binary classification, response should be a factor
    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Number of CV folds (to generate level-one data for stacking)
    nfolds = 5

    # 1. Generate a 2-model ensemble (GBM + RF)

    # Train and cross-validate a GBM
    my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10, max_depth=3,
                                          min_rows=2, learn_rate=0.2, nfolds=nfolds,
                                          fold_assignment="Modulo",
                                          keep_cross_validation_predictions=True, seed=1)
    my_gbm.train(x=x, y=y, training_frame=train)

    # Train and cross-validate a RF
    my_rf = H2ORandomForestEstimator(ntrees=50, nfolds=nfolds, fold_assignment="Modulo",
                                     keep_cross_validation_predictions=True, seed=1)
    my_rf.train(x=x, y=y, training_frame=train)

    # Train a stacked ensemble using the GBM and DRF above
    ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_binomial",
                                           base_models=[my_gbm.model_id, my_rf.model_id])
    ensemble.train(x=x, y=y, training_frame=train)

    # Predict in ensemble in Py client
    preds_py = ensemble.predict(test)

    # Load binary model and predict
    bin_model = h2o.load_model(pyunit_utils.locate("smalldata/binarymodels/stackedensemble/ensemble_higgs"))
    preds_bin = bin_model.predict(test)

    # Predictions from model in Py and binary model should be the same
    pred_diff = preds_bin - preds_py
    assert pred_diff["p0"].max() < 1e-11
    assert pred_diff["p1"].max() < 1e-11
    assert pred_diff["p0"].min() > -1e-11
    assert pred_diff["p1"].min() > -1e-11
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1, max_depth=max_depth1,
                                          min_rows=min_rows1, distribution=distribution)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2,
                                          min_rows=min_rows2, distribution=distribution,
                                          checkpoint=restored_model.model_id)
    model2.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2,
                                          min_rows=min_rows2, distribution=distribution)
    model3.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)
def _load_model(path, init=False):
    import h2o
    path = os.path.abspath(path)
    with open(os.path.join(path, "h2o.yaml")) as f:
        params = yaml.safe_load(f.read())
    if init:
        h2o.init(**(params["init"] if "init" in params else {}))
    h2o.no_progress()
    return h2o.load_model(os.path.join(path, params['model_file']))
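# --- Hedged companion sketch (not from the source): _load_model above expects an
# "h2o.yaml" file holding a "model_file" key and an optional "init" mapping next
# to the saved model. A matching save step could look like this; the helper name
# is illustrative, while h2o.save_model and yaml.safe_dump are real APIs.
def _save_model_sketch(model, path, init_params=None):
    import os
    import h2o
    import yaml
    os.makedirs(path, exist_ok=True)
    model_file = h2o.save_model(model, path=path, force=True)  # returns <path>/<model_id>
    params = {"model_file": os.path.basename(model_file)}
    if init_params:
        params["init"] = init_params  # forwarded to h2o.init(**...) on load
    with open(os.path.join(path, "h2o.yaml"), "w") as f:
        yaml.safe_dump(params, f)
    return path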
def post(self, request):
    try:
        # Read the data sent with the request
        model_id = request.POST.get('model_id')
        csv_prever = request.FILES['csv_prever']
        if model_id:
            # Look up the model by model_id through the Django ORM
            modelo_processado = ModeloMachineLearningProcessado.objects.get(model_id=model_id)
            # Fetch the processing run linked to the processed model through the Django ORM
            processamento = modelo_processado.processamentos.first()
        else:
            # If no model_id was given, fall back to the best model of the most recent run
            processamento = ProcessamentoModeloMachineLearning.objects.all().first()  # the most recent run
            modelo_processado = processamento.modelos_processados.all().first()  # the first model is always the best

        # Read the prediction file with pandas
        teste = pd.read_csv(csv_prever, sep=";")
        colunas_enviadas = ','.join(teste.columns.tolist())
        if processamento.variaveis_independentes != colunas_enviadas:
            raise Exception(
                'Prediction file layout error: this model requires the columns '
                '"{variaveis_independentes}", but you sent the columns: "{colunas_enviadas}"'
                .format(variaveis_independentes=processamento.variaveis_independentes,
                        colunas_enviadas=colunas_enviadas))

        # Initialize the h2o connection
        h2o.init()
        teste = h2o.H2OFrame(teste)

        # Load the model binary
        modelo_automl = h2o.load_model(modelo_processado.binario_modelo.name)
        prever = modelo_automl.predict(teste)
        data_frame = prever.as_data_frame()

        # Reshape the data into a simpler structure to iterate over later in JavaScript
        previsoes = list()
        for i in range(len(data_frame['predict'])):
            previsoes.append({
                'predict': data_frame['predict'][i],
                'p0': data_frame['p0'][i],
                'p1': data_frame['p1'][i]
            })

        return Response(status=201, data={'previsoes': previsoes})
    except Exception as e:
        return Response(status=401, data={'Erro': str(e)})
def __init__(self):
    print('Starting Java virtual machine')
    h2o.init(nthreads=-1, max_mem_size=8)
    print('Machine started!')
    print('Loading model from %s...' % MODEL_PATH)
    self.model = h2o.load_model(MODEL_PATH)
    print('Model Loaded')
def run_h2o(train_file_path: str, test_file_path: str, task: MachineLearningTasksEnum,
            case_name='h2o_default'):
    config_data = get_models_hyperparameters()['H2O']
    max_models = config_data['MAX_MODELS']
    max_runtime_secs = config_data['MAX_RUNTIME_SECS']
    result_filename = f'{case_name}_m{max_models}_rs{max_runtime_secs}_{task.name}'
    exported_model_path = os.path.join(CURRENT_PATH, result_filename)

    # TODO Regression
    if result_filename not in os.listdir(CURRENT_PATH):
        train_data = InputData.from_csv(train_file_path)
        best_model = fit_h2o(train_data)
        temp_exported_model_path = h2o.save_model(model=best_model, path=CURRENT_PATH)
        os.renames(temp_exported_model_path, exported_model_path)

    ip, port = get_h2o_connect_config()
    h2o.init(ip=ip, port=port, name='h2o_server')

    imported_model = h2o.load_model(exported_model_path)

    test_frame = InputData.from_csv(test_file_path)
    true_target = test_frame.target

    predictions = predict_h2o(imported_model, test_frame)

    if task is MachineLearningTasksEnum.classification:
        train_roc_auc_value = round(imported_model.auc(train=True), 3)
        valid_roc_auc_value = round(imported_model.auc(valid=True), 3)
        test_roc_auc_value = round(roc_auc_score(true_target, predictions), 3)
        metrics = {
            'H2O_ROC_AUC_train': train_roc_auc_value,
            'H2O_ROC_AUC_valid': valid_roc_auc_value,
            'H2O_ROC_AUC_test': test_roc_auc_value
        }
        print(f"H2O_ROC_AUC_train: {metrics['H2O_ROC_AUC_train']}")
        print(f"H2O_ROC_AUC_valid: {metrics['H2O_ROC_AUC_valid']}")
        print(f"H2O_ROC_AUC_test: {metrics['H2O_ROC_AUC_test']}")
    else:
        mse_train = imported_model.mse()
        rmse_train = imported_model.rmse()
        metrics = {'H2O_MSE_train': mse_train, 'H2O_RMSE_train': rmse_train}
        print(f"H2O_MSE_train: {metrics['H2O_MSE_train']}")
        print(f"H2O_RMSE_train: {metrics['H2O_RMSE_train']}")

    h2o.shutdown(prompt=False)
    return metrics
def test_modelselection_serialization():
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    my_y = "GLEASON"
    my_x = ["AGE", "RACE", "CAPSULE", "DCAPS", "PSA", "VOL", "DPROS"]
    allsubsets_model = modelSelection(seed=12345, max_predictor_number=7, mode="allsubsets")
    allsubsets_model.train(training_frame=d, x=my_x, y=my_y)
    tmpdir = tempfile.mkdtemp()
    model_path_allsubsets = allsubsets_model.download_model(tmpdir)
    maxr_model = modelSelection(seed=12345, max_predictor_number=7, mode="maxr")
    maxr_model.train(training_frame=d, x=my_x, y=my_y)
    model_path_maxr = maxr_model.download_model(tmpdir)
    h2o.remove_all()
    d = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv"))
    loaded_allsubsets_model = h2o.load_model(model_path_allsubsets)
    result_frame_allsubsets = loaded_allsubsets_model.result()
    numRows = result_frame_allsubsets.nrows
    modelIDs_allsubsets = loaded_allsubsets_model._model_json["output"]["best_model_ids"]
    loaded_maxr_model = h2o.load_model(model_path_maxr)
    modelIDs_maxr = loaded_maxr_model._model_json["output"]["best_model_ids"]
    for ind in list(range(numRows)):
        model_from_frame_allsubsets = h2o.get_model(result_frame_allsubsets["model_id"][ind, 0])
        pred_frame_allsubsets = model_from_frame_allsubsets.predict(d)
        model_from_id_allsubsets = h2o.get_model(modelIDs_allsubsets[ind]['name'])
        pred_id_allsubsets = model_from_id_allsubsets.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_id_allsubsets, prob=1)
        model_from_id_maxr = h2o.get_model(modelIDs_maxr[ind]['name'])
        pred_id_maxr = model_from_id_maxr.predict(d)
        pyunit_utils.compare_frames_local(pred_frame_allsubsets, pred_id_maxr, prob=1)
def h2o_pred(test):
    h2oTest = h2o.H2OFrame(test)
    saved_model = h2o.load_model(os.getcwd() + "/XGBoost_1_AutoML_20181105_211213")
    preds = saved_model.predict(h2oTest)
    preds = preds.as_data_frame()
    print(saved_model.model_performance(h2oTest))
    print("========================================================")
    print("Saving prediction into csv file")
    preds.to_csv("test_predict.csv")
    return preds
def load(self, path):
    try:
        import h2o
    except ImportError:
        raise MissingDependencyException("h2o package is required to use H2oModelArtifact")
    h2o.init()
    model = h2o.load_model(self._model_file_path(path))
    return self.pack(model)
def _read(self):
    # need to init h2o context so as to use h2o.load_model
    utils.getH2oContext()
    # Load the first file under _file_path, which should be the model file.
    # We can do this because if we change the model file, the hash of the hash
    # would change the _file_path, and then we would save the model under a
    # different path.
    h2o_model_file = os.listdir(self._file_path)[0]
    h2o_model_path = os.path.join(self._file_path, h2o_model_file)
    import h2o
    return h2o.load_model(h2o_model_path)
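# --- Hedged sketch (assumption, not from the source): the write counterpart
# implied by _read above would place exactly one saved model file under
# self._file_path, so that os.listdir(...)[0] finds it. The method name and
# attributes mirror the class above but are illustrative only.
def _write_sketch(self, model):
    import os
    import h2o
    if not os.path.isdir(self._file_path):
        os.makedirs(self._file_path)
    # h2o.save_model writes <path>/<model_id> and returns the full path
    return h2o.save_model(model, path=self._file_path, force=True)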
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1, max_depth=max_depth1,
                                          min_rows=min_rows1, distribution=distribution)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees2, max_depth=max_depth2,
                                          min_rows=min_rows2, distribution=distribution)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)
def milsong_checkpoint(ip, port):
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                     max_depth=max_depth1, min_rows=min_rows1, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])

    # save the model, then load the model
    model_path = h2o.save_model(model1, force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                     max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                     checkpoint=restored_model._id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                     max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])
def glrm_mojo():
    h2o.remove_all()
    NTESTROWS = 200  # number of test dataset rows
    df = pyunit_utils.random_dataset("regression", seed=1234)  # generate random dataset
    train = df[NTESTROWS:, :]
    test = df[:NTESTROWS, :]
    x = df.names
    transform_types = ["NONE", "STANDARDIZE", "NORMALIZE", "DEMEAN", "DESCALE"]
    transformN = transform_types[randint(0, len(transform_types) - 1)]

    # build a GLRM model with random dataset generated earlier
    glrmModel = H2OGeneralizedLowRankEstimator(k=3, transform=transformN, max_iterations=10, seed=1234)
    glrmModel.train(x=x, training_frame=train)
    glrmTrainFactor = h2o.get_frame(glrmModel._model_json['output']['representation_name'])

    assert glrmTrainFactor.nrows == train.nrows, \
        "X factor row number {0} should equal training row number {1}.".format(glrmTrainFactor.nrows, train.nrows)
    save_GLRM_mojo(glrmModel)  # save mojo model

    MOJONAME = pyunit_utils.getMojoName(glrmModel._id)
    TMPDIR = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath('__file__')), "..", "results", MOJONAME))

    h2o.download_csv(test[x], os.path.join(TMPDIR, 'in.csv'))  # save test file, h2o predict/mojo use same file
    pred_h2o, pred_mojo = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=True)  # save mojo predict
    h2o.save_model(glrmModel, TMPDIR)  # save GLRM model
    glrmModel2 = h2o.load_model(os.path.join(TMPDIR, MOJONAME))
    predict_model = glrmModel2.predict(test)
    for col in range(pred_h2o.ncols):
        if pred_h2o[col].isfactor():
            pred_h2o[col] = pred_h2o[col].asnumeric()
            predict_model[col] = predict_model[col].asnumeric()
    print("Comparing mojo predict and h2o predict...")
    pyunit_utils.compare_frames_local(pred_h2o, pred_mojo, 1, tol=1e-10)
    print("Comparing mojo predict and h2o predict from saved model...")
    pyunit_utils.compare_frames_local(pred_mojo, predict_model, 1, tol=1e-10)
    frameID, mojoXFactor = pyunit_utils.mojo_predict(glrmModel, TMPDIR, MOJONAME, glrmReconstruct=False)  # save mojo XFactor
    glrmTestFactor = h2o.get_frame("GLRMLoading_" + frameID)  # store the x Factor for new test dataset
    print("Comparing mojo x Factor and model x Factor ...")
    pyunit_utils.compare_frames_local(glrmTestFactor, mojoXFactor, 1, tol=1e-10)
def transform(self, X: dt.Frame):
    h2o.init()
    model_path = os.path.join(temporary_files_path, self.id)
    with open(model_path, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(model_path)
    os.remove(model_path)
    frame = h2o.H2OFrame(X.to_pandas())
    try:
        return model.anomaly(frame).as_data_frame(header=False)
    finally:
        h2o.remove(self.id)
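# --- Hedged sketch (assumption): raw_model_bytes as consumed by transform above
# can be produced by saving the fitted H2O model to disk and reading the file
# back. The helper name and temporary_files_path argument are illustrative.
def read_model_bytes_sketch(model, temporary_files_path):
    import os
    import h2o
    saved_path = h2o.save_model(model, path=temporary_files_path, force=True)
    with open(saved_path, "rb") as f:
        raw_model_bytes = f.read()
    os.remove(saved_path)  # keep only the in-memory copy
    return raw_model_bytes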
def load_model_predict(col_to_predict, test_data):
    if col_to_predict == 'animal':
        model_path = "mymodel_animal"
    elif col_to_predict == "item":
        model_path = "mymodel_item"
    elif col_to_predict == "destino":
        model_path = "mymodel_dest"
    else:
        # guard against an unmatched column, which would otherwise raise a NameError below
        raise ValueError("Unknown column to predict: {0}".format(col_to_predict))
    h2o_test = h2o.H2OFrame(test_data)
    model = h2o.load_model(model_path)
    predictions = model.predict(h2o_test)
    return predictions.columns
def load_h2o_model(local_dir, filename, extension=""):
    """
    Loads a saved H2O Model

    :param string local_dir: Local directory where the model is saved
    :param string filename: Filename with which the model is saved
    :param string extension: Extension to the filename with which the model is saved
    :return: The loaded H2O model
    """
    from h2o import load_model
    return load_model(local_dir + "/" + filename + extension)
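# Usage sketch for load_h2o_model (the paths and frame below are hypothetical,
# and h2o.init() must already have been called so the cluster can deserialize
# the model):
#   model = load_h2o_model("/tmp/models", "GBM_model_python_1", "")
#   preds = model.predict(test_frame)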
def save_load_model(ip, port):
    # Connect to h2o
    h2o.init(ip, port)
    prostate = h2o.import_frame(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])
    model_path = h2o.save_model(prostate_glm, name="delete_model", force=True)
    the_model = h2o.load_model(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def predict_activities(data_dict):
    """
    food_hours, food_calories, active_hours, active_rating,
    sleep_hours, sleep_duration, comfort_rating
    """
    h2o.init()
    h2o_model = h2o.load_model(os.path.join(BASE_DIR, os.path.join('h2o_models', 'xgboost-activities')))
    data_row = h2o.H2OFrame(data_dict,
                            column_names=['food_hours', 'food_calories', 'active_hours',
                                          'active_rating', 'sleep_hours', 'sleep_duration',
                                          'comfort_rating'])
    data_prediction = h2o_model.predict(data_row)
    return data_prediction
def save_load_model(ip, port):
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])
    model_path = h2o.save_model(prostate_glm, force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def save_load_model():
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])
    model_path = h2o.save_model(prostate_glm, force=True)
    the_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    h2o_glm.train(x=range(1, 10), y=0, training_frame=h2o_data)  # don't need to train on all features
    hdfs_model_path = os.getenv("MODEL_PATH")
    h2o.save_model(h2o_glm, "hdfs://" + hdfs_model_path)
    new_model = h2o.load_model("hdfs://" + hdfs_model_path)
def plot():
    model = h.load_model('D:/AI/AI_Hub/Head Unit Data/rf_covType_v1')
    var_im = model.varimp(1)
    var_im = var_im[:5]
    sns_plot = sns.barplot(x=var_im.variable, y=var_im.percentage, palette="Blues_d")
    sns_plot.set_xticklabels(sns_plot.get_xticklabels(), rotation=15, fontsize=8)
    sns_plot.set(xlabel='Features', ylabel='Variable Importance')
    fig = sns_plot.get_figure()
    fig.savefig("D:/AI/static/plot.jpg")
    return render_template('plot.html')
def get_model(cls):
    """Get the model object for this instance, loading it if it's not already loaded."""
    if cls.model is None:
        for file in os.listdir(model_path):
            # Assumes that 'GBM' is somewhere in the filename of a
            # model that's been generated. We just load the first model
            # that satisfies this constraint, so caveat emptor if you've
            # run the 'train' script multiple times - this may still load
            # the first model. An obvious to-do is to improve this :-)
            if 'GBM' in file:
                cls.model = h2o.load_model(os.path.join(model_path, file))
                break
    return cls.model
def predict(self):
    # Reuse the saved Random Forest model
    model = h2o.load_model('Misc/models/RF_Insurance_model/DRF_model_python_1591600273347_1')

    # Standardize the customer record and convert it to a Spark DataFrame
    customer = self.standardize(self.customer)
    org_df = self.sc.parallelize([customer]).toDF()
    df = self.transform_data(org_df)

    # Obtain a result using the saved model
    pred = model.predict(self.hc.asH2OFrame(df))
    prediction_df = self.hc.asSparkFrame(pred)
    prediction_df = prediction_df.withColumn("predict", functions.round("predict", 0))
    prediction_df = prediction_df.withColumn("predict", prediction_df["predict"].cast(types.IntegerType()))
    result = prediction_df.collect()[0].predict
    return result
def test_auto_ml(_model, _df_test):
    print('>>>>>>>>>>>>>> Import model and test set')
    model_path = join(paths.DIR_MODELS, _model)
    model = h2o.load_model(model_path)
    hf_test = h2o.H2OFrame(_df_test)

    print('>>>>>>>>>>>>>> Predict results for test set')
    df_pred = model.predict(hf_test).as_data_frame()

    print('>>>>>>>>>>>>>> Calculate mean absolute error')
    m_a_e = mean_absolute_error(df_pred.values, _df_test[features.PRICE].values)
    print('Mean absolute error: {}'.format(m_a_e))
def save_load_model():
    prostate = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=[0.5])
    prostate_glm.train(x=["AGE", "RACE", "PSA", "DCAPS"], y="CAPSULE", training_frame=prostate)

    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)

    the_model = h2o.load_model(model_path)
    assert isinstance(the_model, H2OEstimator), "Expected an H2OEstimator, but got {0}".format(the_model)
def get_prediction(model_path, iters, data, batch_size):
    h2o.init(nthreads=-1)
    model = h2o.load_model(model_path)
    prediction = []
    warnings.filterwarnings("ignore")
    t1 = datetime.datetime.now()
    for i in range(0, iters + 1):
        _data = data.iloc[(i * batch_size):((i + 1) * batch_size)]
        _data_h20 = h2o.H2OFrame(_data)
        pred = model.predict(_data_h20)
        prediction += list(pred.as_data_frame(use_pandas=True)['predict'])
    t2 = datetime.datetime.now()
    print("total run time :", round((t2 - t1).total_seconds() / 60, 2))
    h2o.shutdown(prompt=False)
    return prediction
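# Usage sketch for get_prediction (the data file and model path below are
# hypothetical): scores a pandas DataFrame in chunks of 1000 rows.
#   import pandas as pd
#   data = pd.read_csv("scoring_data.csv")
#   preds = get_prediction("/models/GBM_model_python_1", len(data) // 1000, data, 1000)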
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = H2ORandomForestEstimator(ntrees=ntrees1, max_depth=max_depth1, min_rows=min_rows1, seed=1234)
    model1.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2,
                                      checkpoint=restored_model._id, seed=1234)
    model2.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    # build the equivalent of model 2 in one shot
    model3 = H2ORandomForestEstimator(ntrees=ntrees2, max_depth=max_depth2, min_rows=min_rows2, seed=1234)
    model3.train(x=range(1, milsong_train.ncol), y=0,
                 training_frame=milsong_train, validation_frame=milsong_valid)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(
            model2.mse(valid=True), model3.mse(valid=True))
def clv_clustering_scoring(new_customer_details):
    # Reading data from the sample csv file
    print("Processing Step 1 --> Reading in the sample data")
    all_data = pd.read_csv('..\\..\\99_sample_data\\custclv.csv')

    # h2o.shutdown()
    h2o.init(ip="127.0.0.1", https=True, insecure=True)  # initializing h2o

    predictors = ['rfm_score', 'Gender', 'Age', 'HouseType', 'ContactAvailability',
                  'HomeCountry', 'CreditScore', 'CLV', 'MonthlyValue', 'ActiveMonths']

    # Data Cleaning
    all_data['Gender'] = pd.Categorical(all_data.Gender).codes
    all_data['ContactAvailability'] = pd.Categorical(all_data.ContactAvailability).codes
    all_data['HouseType'] = pd.Categorical(all_data.HouseType).codes
    all_data['HomeCountry'] = pd.Categorical(all_data.HomeCountry).codes

    all_data_h20 = h2o.H2OFrame(all_data)
    train, test = all_data_h20.split_frame([0.8], seed=123)
    train = train[:, 1:11]
    test = test[:, 1:11]

    # Loading the H2O model
    print("Processing Step 2 --> Loading the H2O model into the solution")
    estimator = h2o.load_model("..\\..\\02_models\\KMeans_model_python_1537328280878_1")
    trained = estimator.predict(all_data_h20)
    all_data_h20['cluster'] = trained["predict"].asfactor()
    all_data_h20 = all_data_h20.as_data_frame()

    print("Processing Step 3 --> Scoring for a sample customer")
    sample_customer = test[2, :]
    print(sample_customer)
    predicted = estimator.predict(sample_customer)
    print("Predicted Cluster : ", predicted["predict"].asfactor())

    # Calculate average CLV of that cluster
    req_value = int(predicted["predict"].asfactor())
    req_data = all_data_h20[all_data_h20['cluster'] == req_value]
    avg_clv = req_data['CLV'].mean()
    print('CLV for new customer = ', str(avg_clv))
    print('Process Complete')
    return avg_clv
def get_newest_model():
    # Find newest model
    model_paths = [path for path in os.listdir('models/') if 'automl_model' in path]
    model_dates = [model_path.split('automl_model_')[1] for model_path in model_paths]

    # Check that there is a model in the path
    assert len(model_paths) > 0

    # Get most recent model date and the path to the most recent model
    newest_model_date = max(model_dates)
    newest_model_idx = model_dates.index(newest_model_date)
    model_path_to_load = 'models/' + model_paths[newest_model_idx]

    # Load newest model
    aml = h2o.load_model(model_path_to_load)
    return aml
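# --- Hedged sketch (assumption): get_newest_model above expects files named
# 'automl_model_<date>' under models/, with dates that sort lexicographically.
# A matching save step could look like this; the helper name is illustrative.
def save_newest_model_sketch(model):
    import os
    import datetime
    import h2o
    saved_path = h2o.save_model(model, path='models/', force=True)
    stamped_path = os.path.join('models', 'automl_model_' + datetime.date.today().isoformat())
    os.rename(saved_path, stamped_path)  # rename to the convention get_newest_model scans for
    return stamped_path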
def inference(model, myCSV, threshold=0.920060452296198):
    model_name = f"{config.MODEL_PATH}{model}"
    model = h2o.load_model(model_name)
    h2odf = h2o.H2OFrame(pd.read_csv(myCSV), destination_frame="testData.hex")
    df = h2odf.as_data_frame()
    predictions = model.predict(h2odf)
    # df['alert_h2o'] = predictions.as_data_frame().predict
    df['Probability_COVID19'] = predictions.as_data_frame().iloc[:, 2]
    df['COVID19_Status'] = df['Probability_COVID19'].map(lambda x: 1 if x <= threshold else 0)
    df['Probability_COVID19'] = 1 - df['Probability_COVID19']
    cols = df.columns.tolist()
    df = df[cols[-2:] + cols[:-2]]
    return df
def save_load_model():
    prostate = h2o.import_file(h2o.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_glm = h2o.glm(y=prostate["CAPSULE"], x=prostate[["AGE", "RACE", "PSA", "DCAPS"]],
                           family="binomial", alpha=[0.5])
    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results"))
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(prostate_glm, path=path, force=True)
    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    the_model = h2o.load_model(model_path)
    assert isinstance(the_model, H2OBinomialModel), "Expected an H2OBinomialModel, but got {0}".format(the_model)
def download_model_filename():
    fr = h2o.import_file(path=pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    model = H2OGradientBoostingEstimator(ntrees=10, seed=1234)
    model.train(x=list(range(2, fr.ncol)), y=1, training_frame=fr)

    # Default filename is model_id
    model_path = model.download_model()
    # It should be saved in server working directory
    assert model_path.endswith(model.model_id), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Default filename is model_id
    tmpdir = tempfile.mkdtemp()
    model_path = model.download_model(tmpdir)
    assert_equals(os.path.join(tmpdir, model.model_id), model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir, filename="gbm_prostate")
    assert_equals(os.path.join(tmpdir, "gbm_prostate"), model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir, filename="gbm_prostate.model")
    assert_equals(os.path.join(tmpdir, "gbm_prostate.model"), model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with custom path
    model_path = model.download_model(tmpdir, filename=os.path.join("not-existing-folder", "gbm_prostate.model"))
    assert_equals(os.path.join(tmpdir, "not-existing-folder", "gbm_prostate.model"),
                  model_path, "Not expected path")
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)

    # Custom filename with default path
    model_path = model.download_model(filename="gbm_prostate2.model")
    assert model_path.endswith("gbm_prostate2.model"), "Not expected path: {0}".format(model_path)
    loaded_model = h2o.load_model(model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator)
def index():
    if request.method == 'GET':
        return render_template('index.html')
    elif request.method == 'POST':
        loan_amnt = float(request.form.get('loan_amnt', 5000))
        term = request.form.get('term', '36 months')
        emp_length = request.form.get('emp_length', '1 year')
        home_ownership = request.form.get('home_ownership', 'RENT')
        purpose = request.form.get('purpose', 'credit card')
        addr_state = request.form.get('addr_state', 'AL')
        annual_inc = float(request.form.get('annual_inc', 0))
        inq_last_6mths = float(request.form.get('inq_last_6mths', 0))

        col_names = ['loan_amnt', 'term', 'emp_length', 'home_ownership',
                     'purpose', 'addr_state', 'annual_inc', 'inq_last_6mths']
        df = h2o.H2OFrame.from_python(
            [(loan_amnt, term, emp_length, home_ownership, purpose,
              addr_state, annual_inc, inq_last_6mths)],
            column_names=col_names)

        # Convert string variables into factors
        string_vars = [i[0] for i in df.types.items() if i[1] == 'string']
        for var in string_vars:
            df[var] = df[var].asfactor()

        saved_model = h2o.load_model('/assets/flask_deployment_demo/GBM_model_python_1515678740025_16')
        predicted = saved_model.predict(df)
        predicted_df = predicted.as_data_frame()
        default_prob = round(predicted_df['default'][0], 3)

        # Append "predicted scores" to original DF.
        df_predictions = df.cbind(predicted)

        if default_prob < 0.5:
            result = 'Likely to not default'
        else:
            result = 'Likely to default'

        return render_template('index.html', result=result, default_prob=default_prob,
                               loan_amnt=loan_amnt, term=term, emp_length=emp_length,
                               home_ownership=home_ownership, purpose=purpose,
                               addr_state=addr_state, annual_inc=annual_inc,
                               inq_last_6mths=inq_last_6mths)
def transform(self, X: dt.Frame):
    h2o.init(port=config.h2o_recipes_port)
    model_path = os.path.join(temporary_files_path, self.id)
    with open(model_path, "wb") as f:
        f.write(self.raw_model_bytes)
    model = h2o.load_model(os.path.abspath(model_path))
    remove(model_path)
    frame = h2o.H2OFrame(X.to_pandas())
    anomaly_frame = None
    try:
        anomaly_frame = model.anomaly(frame)
        anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
        return anomaly_frame_df
    finally:
        h2o.remove(self.id)
        h2o.remove(anomaly_frame)
def predict_churn(State, AccountLength, AreaCode, Phone, IntlPlan, VMailPlan, VMailMessage,
                  DayMins, DayCalls, DayCharge, EveMins, EveCalls, EveCharge, NightMins,
                  NightCalls, NightCharge, IntlMins, IntlCalls, IntlCharge, CustServCalls):
    # connect to the model scoring service
    h2o.connect()

    # open the downloaded model
    ChurnPredictor = h2o.load_model(path='AutoML-leader')

    # define a feature vector to evaluate with the model
    newData = pd.DataFrame({
        'State': State,
        'Account Length': AccountLength,
        'Area Code': AreaCode,
        'Phone': Phone,
        'Int\'l Plan': IntlPlan,
        'VMail Plan': VMailPlan,
        'VMail Message': VMailMessage,
        'Day Mins': DayMins,
        'Day Calls': DayCalls,
        'Day Charge': DayCharge,
        'Eve Mins': EveMins,
        'Eve Calls': EveCalls,
        'Eve Charge': EveCharge,
        'Night Mins': NightMins,
        'Night Calls': NightCalls,
        'Night Charge': NightCharge,
        'Intl Mins': IntlMins,
        'Intl Calls': IntlCalls,
        'Intl Charge': IntlCharge,
        'CustServ Calls': CustServCalls
    }, index=[0])

    # evaluate the feature vector using the model
    predictions = ChurnPredictor.predict(h2o.H2OFrame(newData))
    predictionsOut = h2o.as_list(predictions, use_pandas=False)
    prediction = predictionsOut[1][0]
    probabilityChurn = predictionsOut[1][1]
    probabilityRetain = predictionsOut[1][2]
    return "Prediction: " + str(prediction) + \
           " |Probability to Churn: " + str(probabilityChurn) + \
           " |Probability to Retain: " + str(probabilityRetain)
def isolation_forest_save_and_load():
    print("Isolation Forest Smoke Test")

    train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/ecg_discord_train.csv"))

    if_model = H2OIsolationForestEstimator(ntrees=7, seed=12, sample_size=5)
    if_model.train(training_frame=train)

    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(if_model, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)

    reloaded = h2o.load_model(model_path)
    assert isinstance(reloaded, H2OIsolationForestEstimator), \
        "Expected an H2OIsolationForestEstimator, but got {0}".format(reloaded)
def download_model():
    prostate = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    prostate_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=10,
                                                max_depth=8, min_rows=10, learn_rate=0.2)
    prostate_gbm.train(x=["AGE", "RACE", "PSA", "VOL", "GLEASON"], y="CAPSULE",
                       training_frame=prostate)

    path = pyunit_utils.locate("results")
    downloaded_model_path = prostate_gbm.download_model(path=path)
    assert os.path.isfile(downloaded_model_path), \
        "Expected load file {0} to exist, but it does not.".format(downloaded_model_path)

    loaded_model = h2o.load_model(downloaded_model_path)
    assert isinstance(loaded_model, H2OGradientBoostingEstimator), \
        "Expected an H2OGradientBoostingEstimator, but got {0}".format(downloaded_model_path)
def test_hadoop():
    '''
    Test H2O read and write to hdfs
    '''
    hdfs_name_node = os.getenv("NAME_NODE")
    print("Importing hdfs data")
    h2o_data = h2o.import_file("hdfs://" + hdfs_name_node + "/datasets/100k.csv")
    print("Splitting data")
    train, test = h2o_data.split_frame(ratios=[0.1])
    h2o_glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0.5, Lambda=0.01)
    print("Training")
    h2o_glm.train(x=range(1, 10), y=0, training_frame=train)  # don't need to train on all features
    hdfs_model_path = os.getenv("MODEL_PATH")
    print("Saving model")
    new_model_path = h2o.save_model(h2o_glm, "hdfs://" + hdfs_name_node + "/" + hdfs_model_path)
    print("Loading back model")
    new_model = h2o.load_model(new_model_path)
def __init__(self, model_name, model_base_path):
    """
    Initialize the service.

    Args:
      model_name: The name of the model.
      model_base_path: The file path of the model.

    Return:
      None
    """
    super(H2oInferenceService, self).__init__()

    # Start the h2o server
    if os.path.isfile("/tmp/h2o.jar"):
        logging.info("About to run command 'java -jar /tmp/h2o.jar'")
        subprocess.Popen(["java", "-jar", "/tmp/h2o.jar"])
        logging.info("Sleep 10s to wait for h2o server")
        time.sleep(10)

    local_model_base_path = filesystem_util.download_hdfs_moels(model_base_path)

    self.model_name = model_name
    self.model_base_path = local_model_base_path
    self.model_version_list = [1]
    self.model_graph_signature = ""
    self.platform = "H2o"
    self.preprocess_function, self.postprocess_function = \
        preprocess_util.get_preprocess_postprocess_function_from_model_path(self.model_base_path)

    import h2o
    logger.info("Try to initialize and connect the h2o server")
    h2o.init()
    logger.info("Try to load the h2o model")
    model = h2o.load_model(self.model_base_path)
    self.model = model
    # TODO: Update the signature with readable string
    self.model_graph_signature = "{}".format(self.model.full_parameters)
def milsong_checkpoint():
    milsong_train = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(tests.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                               max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    # save the model, then load the model
    path = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "results"))
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isdir(model_path), "Expected load directory {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(
            model2.mse(valid=True), model3.mse(valid=True))
def milsong_checkpoint():
    milsong_train = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(pyunit_utils.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))
    distribution = "gaussian"

    # build first model
    ntrees1 = random.sample(list(range(50, 100)), 1)[0]
    max_depth1 = random.sample(list(range(2, 6)), 1)[0]
    min_rows1 = random.sample(list(range(10, 16)), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                     max_depth=max_depth1, min_rows=min_rows1, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])

    # save the model, then load the model
    path = pyunit_utils.locate("results")
    assert os.path.isdir(path), "Expected save directory {0} to exist, but it does not.".format(path)
    model_path = h2o.save_model(model1, path=path, force=True)
    assert os.path.isfile(model_path), "Expected load file {0} to exist, but it does not.".format(model_path)
    restored_model = h2o.load_model(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                     max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                     checkpoint=restored_model.model_id)

    # build the equivalent of model 2 in one shot
    model3 = h2o.gbm(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                     max_depth=max_depth2, min_rows=min_rows2, distribution=distribution,
                     validation_x=milsong_valid[1:], validation_y=milsong_valid[0])
def test_saved_binary_model_produces_same_predictions_as_original():
    ds = prepare_data(blending)
    base_models = train_base_models(ds)
    se_model = train_stacked_ensemble(ds, base_models)

    # Predict in ensemble in Py client
    preds_py = se_model.predict(ds.test)

    tmp_dir = tempfile.mkdtemp()
    try:
        bin_file = h2o.save_model(se_model, tmp_dir)
        # Load binary model and predict
        bin_model = h2o.load_model(pu.locate(bin_file))
        preds_bin = bin_model.predict(ds.test)
    finally:
        shutil.rmtree(tmp_dir)

    # Predictions from model in Py and binary model should be the same
    pred_diff = preds_bin - preds_py
    assert pred_diff["p0"].max() < 1e-11
    assert pred_diff["p1"].max() < 1e-11
    assert pred_diff["p0"].min() > -1e-11
    assert pred_diff["p1"].min() > -1e-11
def milsong_checkpoint(ip, port):
    milsong_train = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-train.csv.gz"))
    milsong_valid = h2o.upload_file(h2o.locate("bigdata/laptop/milsongs/milsongs-test.csv.gz"))

    # build first model
    ntrees1 = random.sample(range(50, 100), 1)[0]
    max_depth1 = random.sample(range(2, 6), 1)[0]
    min_rows1 = random.sample(range(10, 16), 1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees1,
                               max_depth=max_depth1, min_rows=min_rows1,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    # save the model, then load the model
    model_path = h2o.save_model(model1, force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree(model_path)

    # continue building the model
    ntrees2 = ntrees1 + 50
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               checkpoint=restored_model._id, seed=1234)

    # build the equivalent of model 2 in one shot
    model3 = h2o.random_forest(x=milsong_train[1:], y=milsong_train[0], ntrees=ntrees2,
                               max_depth=max_depth2, min_rows=min_rows2,
                               validation_x=milsong_valid[1:], validation_y=milsong_valid[0],
                               seed=1234)

    assert isinstance(model2, type(model3))
    assert model2.mse(valid=True) == model3.mse(valid=True), \
        "Expected Model 2 MSE: {0} to be the same as Model 3 MSE: {1}".format(
            model2.mse(valid=True), model3.mse(valid=True))
def h2oapi():
    """
    Python API test: h2o.load_model(path)
    """
    training_data = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    Y = 3
    X = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]

    model = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    model.train(x=X, y=Y, training_frame=training_data)
    try:
        results_dir = pyunit_utils.locate("results")  # find directory path to results folder
        h2o.save_model(model, path=results_dir, force=True)  # save model
        full_path_filename = os.path.join(results_dir, model._id)
        assert os.path.isfile(full_path_filename), "h2o.save_model() command is not working."
        model_reloaded = h2o.load_model(full_path_filename)
        assert_is_type(model, H2OGeneralizedLinearEstimator)
        assert_is_type(model_reloaded, H2OGeneralizedLinearEstimator)
    except Exception as e:
        if 'File not found' in e.args[0]:
            print("Directory is not writable. h2o.load_model() command is not tested.")
        else:
            assert False, "h2o.load_model() command is not working."
features = list(range(0, 784))
target = 784

train[target] = train[target].asfactor()
valid[target] = valid[target].asfactor()

# Build model
model = H2ODeepWaterEstimator(epochs=20, activation="Rectifier", hidden=[200, 200],
                              ignore_const_cols=False, mini_batch_size=256,
                              input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5, 0.5],
                              stopping_rounds=3, stopping_tolerance=0.05,
                              stopping_metric="misclassification", score_interval=2,
                              score_duty_cycle=0.5, score_training_samples=1000,
                              score_validation_samples=1000, gpu=True, seed=1234)
model.train(x=features, y=target, training_frame=train, validation_frame=valid)

# Evaluate model
model.show()
print(model.scoring_history())

# Checkpoint model
model_path = h2o.save_model(model=model, force=True)

# Load model
model_ckpt = h2o.load_model(model_path)

# Start training from checkpoint
model_warm = H2ODeepWaterEstimator(checkpoint=model_ckpt.model_id, epochs=100,
                                   activation="Rectifier", hidden=[200, 200],
                                   ignore_const_cols=False, mini_batch_size=256,
                                   input_dropout_ratio=0.1, hidden_dropout_ratios=[0.5, 0.5],
                                   stopping_rounds=3, stopping_tolerance=0.05,
                                   stopping_metric="misclassification", score_interval=2,
                                   score_duty_cycle=0.5, score_training_samples=1000,
                                   score_validation_samples=1000, gpu=True, seed=1234)
model_warm.train(x=features, y=target, training_frame=train, validation_frame=valid)

# Evaluate checkpointed model
model_warm.show()
print(model_warm.scoring_history())
def cars_checkpoint(ip,port):
    cars = h2o.upload_file(h2o.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    # choose the type of model-building exercise (regression, binomial, or multinomial
    # classification). 0: regression, 1: binomial, 2: multinomial
    problem = random.sample(range(3),1)[0]

    # pick the predictors and response column
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1:
        response_col = "economy_20mpg"
        cars[response_col] = cars[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        cars[response_col] = cars[response_col].asfactor()
    else:
        response_col = "economy"
    print("Response column: {0}".format(response_col))

    # build first model
    ntrees1 = random.sample(range(5,21),1)[0]
    max_depth1 = random.sample(range(2,6),1)[0]
    min_rows1 = random.sample(range(10,16),1)[0]
    print("ntrees model 1: {0}".format(ntrees1))
    print("max_depth model 1: {0}".format(max_depth1))
    print("min_rows model 1: {0}".format(min_rows1))
    model1 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees1, max_depth=max_depth1,
                               min_rows=min_rows1, validation_x=valid[predictors],
                               validation_y=valid[response_col], seed=2345)

    # save the model, then load it back
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)

    # continue building the model with the same max_depth and min_rows
    ntrees2 = ntrees1 + random.sample(range(5,21),1)[0]
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("ntrees model 2: {0}".format(ntrees2))
    print("max_depth model 2: {0}".format(max_depth2))
    print("min_rows model 2: {0}".format(min_rows2))
    model2 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees2, max_depth=max_depth2,
                               min_rows=min_rows2, checkpoint=restored_model._id, validation_x=valid[predictors],
                               validation_y=valid[response_col], seed=2345)

    # continue building the model, but with different max_depth and min_rows (ensemble)
    ntrees3 = ntrees2
    max_depth3 = max_depth2 + random.sample(range(3,6),1)[0]
    min_rows3 = min_rows2 + random.sample(range(5,10),1)[0]
    print("ntrees model 3: {0}".format(ntrees3))
    print("max_depth model 3: {0}".format(max_depth3))
    print("min_rows model 3: {0}".format(min_rows3))
    model3 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees3, max_depth=max_depth3,
                               min_rows=min_rows3, checkpoint=restored_model._id, validation_x=valid[predictors],
                               validation_y=valid[response_col], seed=2345)

    # build the equivalent of model 2 in one shot
    model4 = h2o.random_forest(x=train[predictors], y=train[response_col], ntrees=ntrees2, max_depth=max_depth2,
                               min_rows=min_rows2, validation_x=valid[predictors],
                               validation_y=valid[response_col], seed=2345)

    if problem == 0:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(valid=True)
        assert model3.mse(valid=True) != model4.mse(valid=True)
    elif problem == 1:
        assert isinstance(model2, type(model4))
        assert model2.auc(valid=True) == model4.auc(valid=True)
        assert model3.auc(valid=True) != model4.auc(valid=True)
        assert model2.logloss(valid=True) == model4.logloss(valid=True)
        assert model3.logloss(valid=True) != model4.logloss(valid=True)
        assert model2.giniCoef(valid=True) == model4.giniCoef(valid=True)
        assert model3.giniCoef(valid=True) != model4.giniCoef(valid=True)
    else:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(valid=True)
        assert model3.mse(valid=True) != model4.mse(valid=True)
        assert model2.r2(valid=True) == model4.r2(valid=True)
        assert model3.r2(valid=True) != model4.r2(valid=True)
def evalmodel(df):
    glm_classifier = h2o.load_model('./model')
    result = h2o.as_list(glm_classifier.predict(df), use_pandas=False)
    result.pop(0)    # get rid of the column header
    result = [float(r[0]) for r in result]    # each result comes back as a 1-element list; unwrap it
    return result
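# Hedged usage sketch for evalmodel: './model' is whatever file h2o.save_model
# produced for the GLM, and the CSV path below is hypothetical.
import h2o

h2o.init()
df = h2o.import_file("path/to/scoring_data.csv")    # hypothetical input
scores = evalmodel(df)                              # one float per row
print(scores[:5])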
def load(location):
    """Loads a persisted state of an instance of H2OPipeline
    from disk. This method will handle loading H2OEstimator models
    separately and outside of the constraints of the pickle package.

    Note that this is a static method and should be called accordingly:

        >>> def load_pipe():
        ...     return H2OPipeline.load('path/to/h2o/pipeline.pkl')  # GOOD!
        >>>
        >>> pipe = load_pipe()  # doctest: +SKIP

    Also note that since H2OPipeline can contain an H2OEstimator, its
    ``load`` functionality differs from that of its superclass,
    BaseH2OFunctionWrapper, and will not function properly if called
    at the highest level of abstraction:

        >>> def load_pipe():
        ...     return BaseH2OFunctionWrapper.load('path/to/h2o/pipeline.pkl')  # BAD!
        >>>
        >>> pipe = load_pipe()  # doctest: +SKIP

    Furthermore, trying to load a different type of BaseH2OFunctionWrapper
    from this method will raise a TypeError:

        >>> def load_pipe():
        ...     return H2OPipeline.load('path/to/some/other/transformer.pkl')  # BAD!
        >>>
        >>> pipe = load_pipe()  # doctest: +SKIP

    Parameters
    ----------
    location : str
        The location where the persisted H2OPipeline model resides.

    Returns
    -------
    model : H2OPipeline
        The unpickled instance of the H2OPipeline model
    """
    with open(location, 'rb') as f:
        model = pickle.load(f)

    if not isinstance(model, H2OPipeline):
        raise TypeError('expected H2OPipeline, got %s' % type(model))

    # if the pipe didn't end in an h2o estimator, we don't need to
    # do the following IO segment...
    ends_in_h2o = hasattr(model, 'model_loc_')
    if ends_in_h2o:
        # read the model portion, delete the model path
        ex = None
        the_h2o_model = None
        for pth in [model.model_loc_, 'hdfs://%s' % model.model_loc_]:
            try:
                the_h2o_model = h2o.load_model(pth)
            except Exception as e:
                if ex is None:
                    ex = e
                else:
                    # only throws if fails twice
                    raise ex

            # break if successfully loaded
            if the_h2o_model is not None:
                break

        model.steps[-1] = (model.est_name_, the_h2o_model)

    return model
def load(location):
    """Loads a persisted state of an instance of BaseH2OSearchCV
    from disk. This method will handle loading H2OEstimator models
    separately and outside of the constraints of the pickle package.

    Note that this is a static method and should be called accordingly:

        >>> def load_search():
        ...     return BaseH2OSearchCV.load('path/to/h2o/search.pkl')  # GOOD!
        >>>
        >>> search = load_search()  # doctest: +SKIP

    Also note that since BaseH2OSearchCV will contain an H2OEstimator,
    its ``load`` functionality differs from that of its superclass,
    BaseH2OFunctionWrapper, and will not function properly if called
    at the highest level of abstraction:

        >>> def load_search():
        ...     return BaseH2OFunctionWrapper.load('path/to/h2o/search.pkl')  # BAD!
        >>>
        >>> search = load_search()  # doctest: +SKIP

    Furthermore, trying to load a different type of BaseH2OFunctionWrapper
    from this method will raise a TypeError:

        >>> def load_search():
        ...     return BaseH2OSearchCV.load('path/to/some/other/transformer.pkl')  # BAD!
        >>>
        >>> search = load_search()  # doctest: +SKIP

    Parameters
    ----------
    location : str
        The location where the persisted BaseH2OSearchCV model resides.

    Returns
    -------
    model : BaseH2OSearchCV
        The unpickled instance of the BaseH2OSearchCV model
    """
    with open(location, 'rb') as f:
        model = pickle.load(f)

    if not isinstance(model, BaseH2OSearchCV):
        raise TypeError('expected BaseH2OSearchCV, got %s' % type(model))

    # read the model portion, delete the model path
    ex = None
    the_h2o_est = None
    for pth in [model.model_loc_, 'hdfs://%s' % model.model_loc_]:
        try:
            the_h2o_est = h2o.load_model(pth)
        except Exception as e:
            if ex is None:
                ex = e
            else:
                # only throws if fails twice
                raise ex

        # break if successfully loaded
        if the_h2o_est is not None:
            break

    # if best_estimator_ is None, the search held a bare H2OEstimator;
    # otherwise it held an H2OPipeline
    if model.best_estimator_ is None:
        model.best_estimator_ = the_h2o_est
        model.estimator = _new_base_estimator(model.est_type_, model.base_estimator_parms_)
    else:
        model.best_estimator_.steps[-1] = (model.est_name_, the_h2o_est)
        model.estimator.steps[-1] = (
            model.est_name_,
            _new_base_estimator(model.est_type_, model.base_estimator_parms_))

    return model
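# Both load() methods above share the same local-then-HDFS fallback when
# re-attaching the H2O model. Factored out on its own, the pattern is roughly
# this sketch (not part of the library API; the helper name is hypothetical):
def _load_h2o_model_with_fallback(model_loc):
    """Try the plain path first, then the same path as an HDFS URI.

    The first failure is re-raised only if both attempts fail, mirroring
    the inline logic in the two load() methods above.
    """
    first_err = None
    for pth in [model_loc, 'hdfs://%s' % model_loc]:
        try:
            return h2o.load_model(pth)
        except Exception as e:
            if first_err is None:
                first_err = e
    raise first_err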
def cars_checkpoint():
    cars = h2o.upload_file(pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    s = cars.runif()
    train = cars[s > .2]
    valid = cars[s <= .2]

    print("\n*** Description (chunk distribution, etc) of training frame:")
    train.describe()
    print("\n*** Description (chunk distribution, etc) of validation frame:")
    valid.describe()

    # choose the type of model-building exercise (regression, binomial, or multinomial
    # classification). 0: regression, 1: binomial, 2: multinomial
    problem = random.sample(list(range(3)),1)[0]

    # pick the predictors and response column, along with the correct distribution
    predictors = ["displacement","power","weight","acceleration","year"]
    if problem == 1:
        response_col = "economy_20mpg"
        distribution = "bernoulli"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    elif problem == 2:
        response_col = "cylinders"
        distribution = "multinomial"
        train[response_col] = train[response_col].asfactor()
        valid[response_col] = valid[response_col].asfactor()
    else:
        response_col = "economy"
        distribution = "gaussian"

    print("\n*** Distribution: {0}".format(distribution))
    print("\n*** Response column: {0}".format(response_col))

    # build first model
    ntrees1 = 5
    max_depth1 = random.sample(list(range(2,6)),1)[0]
    min_rows1 = random.sample(list(range(10,16)),1)[0]
    print("\n*** Building model 1 with the following parameters:")
    print("*** ntrees model 1: {0}".format(ntrees1))
    print("*** max_depth model 1: {0}".format(max_depth1))
    print("*** min_rows model 1: {0}".format(min_rows1))

    from h2o.estimators.gbm import H2OGradientBoostingEstimator
    model1 = H2OGradientBoostingEstimator(ntrees=ntrees1,
                                          max_depth=max_depth1,
                                          min_rows=min_rows1,
                                          score_each_iteration=True,
                                          distribution=distribution)
    model1.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    # save the model, then load the model
    model_path = h2o.save_model(model1, name="delete_model", force=True)
    restored_model = h2o.load_model(model_path)
    shutil.rmtree("delete_model")

    # continue building the model
    ntrees2 = ntrees1 + 5
    max_depth2 = max_depth1
    min_rows2 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 2) with the following parameters:")
    print("*** ntrees model 2: {0}".format(ntrees2))
    print("*** max_depth model 2: {0}".format(max_depth2))
    print("*** min_rows model 2: {0}".format(min_rows2))
    model2 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model2.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    # continue building the model, but with a different number of trees
    ntrees3 = ntrees2 + 50
    max_depth3 = max_depth1
    min_rows3 = min_rows1
    print("\n*** Continuing to build model 1 (now called model 3) with the following parameters:")
    print("*** ntrees model 3: {0}".format(ntrees3))
    print("*** max_depth model 3: {0}".format(max_depth3))
    print("*** min_rows model 3: {0}".format(min_rows3))
    model3 = H2OGradientBoostingEstimator(ntrees=ntrees3,
                                          max_depth=max_depth3,
                                          min_rows=min_rows3,
                                          distribution=distribution,
                                          score_each_iteration=True,
                                          checkpoint=restored_model._id)
    model3.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    # build the equivalent of model 2 in one shot
    print("\n*** Building the equivalent of model 2 (called model 4) in one shot:")
    model4 = H2OGradientBoostingEstimator(ntrees=ntrees2,
                                          max_depth=max_depth2,
                                          min_rows=min_rows2,
                                          distribution=distribution,
                                          score_each_iteration=True)
    model4.train(x=predictors, y=response_col, training_frame=train, validation_frame=valid)

    print("\n*** Model Summary for model 2:")
    print(model2.summary())
    print("\n*** Model Summary for model 3:")
    print(model3.summary())
    print("\n*** Model Summary for model 4:")
    print(model4.summary())
    print("\n*** Score History for model 2:")
    print(model2.scoring_history())
    print("\n*** Score History for model 3:")
    print(model3.scoring_history())
    print("\n*** Score History for model 4:")
    print(model4.scoring_history())

    # checks
    if problem == 0:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(valid=True), \
            "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        # assert model3.mse(valid=True) != model4.mse(valid=True), \
        #     "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))
    elif problem == 1:
        assert isinstance(model2, type(model4))
        assert model2.auc(valid=True) == model4.auc(valid=True), \
            "Expected Model 2 AUC: {0} to be the same as Model 4 AUC: {1}".format(model2.auc(valid=True), model4.auc(valid=True))
        # assert model3.auc(valid=True) != model4.auc(valid=True), \
        #     "Expected Model 3 AUC: {0} to be different from Model 4 AUC: {1}".format(model3.auc(valid=True), model4.auc(valid=True))
        assert model2.logloss(valid=True) == model4.logloss(valid=True), \
            "Expected Model 2 Log Loss: {0} to be the same as Model 4 Log Loss: {1}".format(model2.logloss(valid=True), model4.logloss(valid=True))
        # assert model3.logloss(valid=True) != model4.logloss(valid=True), \
        #     "Expected Model 3 Log Loss: {0} to be different from Model 4 Log Loss: {1}".format(model3.logloss(valid=True), model4.logloss(valid=True))
        assert model2.giniCoef(valid=True) == model4.giniCoef(valid=True), \
            "Expected Model 2 Gini Coef: {0} to be the same as Model 4 Gini Coef: {1}".format(model2.giniCoef(valid=True), model4.giniCoef(valid=True))
        # assert model3.giniCoef(valid=True) != model4.giniCoef(valid=True), \
        #     "Expected Model 3 Gini Coef: {0} to be different from Model 4 Gini Coef: {1}".format(model3.giniCoef(valid=True), model4.giniCoef(valid=True))
    else:
        assert isinstance(model2, type(model4))
        assert model2.mse(valid=True) == model4.mse(valid=True), \
            "Expected Model 2 MSE: {0} to be the same as Model 4 MSE: {1}".format(model2.mse(valid=True), model4.mse(valid=True))
        # assert model3.mse(valid=True) != model4.mse(valid=True), \
        #     "Expected Model 3 MSE: {0} to be different from Model 4 MSE: {1}".format(model3.mse(valid=True), model4.mse(valid=True))
        assert model2.r2(valid=True) == model4.r2(valid=True), \
            "Expected Model 2 R2: {0} to be the same as Model 4 R2: {1}".format(model2.r2(valid=True), model4.r2(valid=True))
import h2o
h2o.init()

md = h2o.load_model("1-train+model/test_airline_GBM_100k")

dx_test = h2o.import_file("https://s3.amazonaws.com/benchm-ml--main/test.csv")
dx_test1 = dx_test[0, 0:8]
dx_test1.as_data_frame()

%time md.predict(dx_test1).as_data_frame()   ## h2o frame
## Wall time: 120 ms

dpy_test1 = dx_test1.as_data_frame()   ## py object (pandas)
%time md.predict(h2o.H2OFrame.from_python(dpy_test1, column_names=["Month","DayofMonth","DayOfWeek","DepTime","UniqueCarrier","Origin","Dest","Distance"])).as_data_frame()
## Wall time: 255 ms

# %time dx_test1_2 = h2o.H2OFrame.from_python(dpy_test1, column_names=["Month","DayofMonth","DayOfWeek","DepTime","UniqueCarrier","Origin","Dest","Distance"])
# ## Wall time: 134 ms
# dx_test1_2.as_data_frame()
# %time md.predict(dx_test1_2).as_data_frame()
# ## Wall time: 121 ms
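# The %time magics above only work in IPython/Jupyter. In a plain script, the same
# comparison can be timed with time.perf_counter(); a sketch reusing the md,
# dx_test1, and dpy_test1 objects defined above:
import time

t0 = time.perf_counter()
md.predict(dx_test1).as_data_frame()                  # score an existing H2OFrame
t1 = time.perf_counter()
md.predict(h2o.H2OFrame(dpy_test1)).as_data_frame()   # upload pandas first, then score
t2 = time.perf_counter()

print("H2OFrame path: %.0f ms" % ((t1 - t0) * 1000))
print("pandas path:   %.0f ms" % ((t2 - t1) * 1000))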