def pca_export():
    """Train a 3-component PCA model on USArrests and export it.

    The POJO is downloaded into RESULT_DIR. The MOJO download is not called
    directly; it is handed to ``expect_error`` (presumably asserting that the
    call fails - confirm against that helper).
    """
    print("###### PCA ######")
    data_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests = h2o.upload_file(data_path)
    pca = H2OPrincipalComponentAnalysisEstimator(k=3, impute_missing=True)
    first_four_columns = list(range(4))
    pca.train(x=first_four_columns, training_frame=arrests)
    h2o.download_pojo(pca, path=RESULT_DIR)
    expect_error(pca.download_mojo, model="PCA", format='MOJO')
def k_means_export():
    """Train a single-cluster K-Means model on benign.csv and export it.

    Every column of the frame is used as a predictor. Both the POJO and the
    MOJO are downloaded into RESULT_DIR.
    """
    print("###### K MEANS ######")
    data_path = pyunit_utils.locate("smalldata/logreg/benign.csv")
    benign = h2o.import_file(path=data_path)
    kmeans = H2OKMeansEstimator(k=1)
    every_column = list(range(benign.ncol))
    kmeans.train(x=every_column, training_frame=benign)
    h2o.download_pojo(kmeans, path=RESULT_DIR)
    kmeans.download_mojo(path=RESULT_DIR)
def naive_bayes_export():
    """Train a Naive Bayes model (laplace=0.25) on iris and export it.

    Columns 0-3 are the predictors and column 4 is the response. The POJO is
    downloaded into RESULT_DIR; the MOJO download is handed to
    ``expect_error`` (presumably asserting that the call fails - confirm
    against that helper).
    """
    print("###### NAIVE BAYES ######")
    data_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    iris = h2o.upload_file(data_path)
    bayes = H2ONaiveBayesEstimator(laplace=0.25)
    predictor_idx = list(range(4))
    bayes.train(x=predictor_idx, y=4, training_frame=iris)
    h2o.download_pojo(bayes, path=RESULT_DIR)
    expect_error(bayes.download_mojo, model="Naive Bayes", format='MOJO')
def download_pojo():
    """Fit a default GBM on the iris data and download its POJO.

    Columns 0-3 are the predictors and column 4 is the response. No target
    path is passed to ``h2o.download_pojo``.
    """
    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    iris = h2o.import_file(path=iris_path)

    print("iris:")
    iris.show()

    gbm = H2OGradientBoostingEstimator()
    predictor_idx = list(range(4))
    gbm.train(x=predictor_idx, y=4, training_frame=iris)
    h2o.download_pojo(gbm)
def h2odownload_pojo():
    """
    Python API test: h2o.download_pojo(model, path=u'', get_jar=True)

    Trains a binomial GLM on the prostate data and downloads its POJO. When
    the "results" directory can be located, the POJO is saved there and the
    presence of h2o-genmodel.jar is asserted; otherwise the POJO is downloaded
    without a target path.

    Copied from glm_download_pojo.py

    :raises AssertionError: if any step fails or the genmodel jar is missing.
    """
    try:
        h2o_df = h2o.import_file(
            pyunit_utils.locate("smalldata/prostate/prostate.csv"))
        h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
        binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
        binomial_fit.train(y="CAPSULE",
                           x=["AGE", "RACE", "PSA", "GLEASON"],
                           training_frame=h2o_df)

        # Only a failed directory lookup may trigger the fallback. The old
        # bare ``except:`` also swallowed the AssertionError raised by the jar
        # check below, so that check could never make the test fail.
        try:
            results_dir = pyunit_utils.locate("results")  # find directory path to results folder
        except Exception:
            results_dir = None

        if results_dir is None:
            # just print pojo to screen if directory does not exist
            h2o.download_pojo(binomial_fit)
        else:
            h2o.download_pojo(binomial_fit, path=results_dir)
            assert os.path.isfile(os.path.join(results_dir, "h2o-genmodel.jar")), \
                "h2o.download_pojo() command is not working."
    except Exception:
        assert False, "h2o.download_pojo() command is not working."
def train_model(train, test, feature_col, target_col, model_type, outdir):
    """
    Train a LR model or GBM model and download its POJO.

    :param train: training frame; its ``target_col`` column is converted to a
        factor in place.
    :param test: validation frame; its ``target_col`` column is converted to a
        factor in place.
    :param feature_col: predictor columns passed to ``model.train`` as ``x``.
    :param target_col: response column.
    :param model_type: ``"lr"`` (binomial GLM) or ``"gbm"``.
    :param outdir: passed to ``h2o.download_pojo`` as the target path.
    :return: tuple ``(model, train)``.
    :raises Exception: if ``model_type`` is neither ``"lr"`` nor ``"gbm"``.
    """
    # Reject an unknown model type before touching the caller's frames.
    if model_type not in ("lr", "gbm"):
        raise Exception('specify model type: lr or gbm')

    train[target_col] = train[target_col].asfactor()
    test[target_col] = test[target_col].asfactor()

    if model_type == "lr":
        model = H2OGeneralizedLinearEstimator(
            model_id='titanic_model', family='binomial', seed=1234)
    else:
        model = H2OGradientBoostingEstimator(model_id='titanic_model')

    model.train(x=feature_col,
                y=target_col,
                training_frame=train,
                validation_frame=test,
                model_id='titanic_model')

    # save pojo
    h2o.download_pojo(model, outdir)
    # Single-argument print() behaves the same on Python 2 and parses on Python 3.
    print(model)
    return model, train
def drf_export():
    """Train a one-tree, depth-1 random forest on bigcat_5000x2.csv and export it.

    The response column "y" is converted to a factor and "X" is the only
    predictor. Both the POJO and the MOJO are downloaded into RESULT_DIR.
    """
    print("###### DRF ######")
    data_path = pyunit_utils.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    bigcat = h2o.import_file(path=data_path)
    bigcat["y"] = bigcat["y"].asfactor()

    forest = H2ORandomForestEstimator(ntrees=1, max_depth=1, nbins=100, nbins_cats=10)
    forest.train(x="X", y="y", training_frame=bigcat)

    h2o.download_pojo(forest, path=RESULT_DIR)
    forest.download_mojo(path=RESULT_DIR)
def hexdev_422():
    """Build a random forest on z_repro.csv.gz and download its POJO.

    Column 0 is converted to a factor and used as the response; the remaining
    columns are the predictors. No target path is passed to
    ``h2o.download_pojo``.
    """
    repro_path = h2o.locate("bigdata/laptop/jira/z_repro.csv.gz")
    repro = h2o.import_file(repro_path)
    repro[0] = repro[0].asfactor()

    predictors = repro[1:repro.ncol]
    response = repro[0]
    forest = h2o.random_forest(x=predictors, y=response, min_rows=1, ntrees=25, max_depth=45)
    h2o.download_pojo(forest)
def download_pojo(self, path=""):
    """
    Download this model's scoring POJO by delegating to the package-level
    ``h2o.download_pojo``.

    :param path: An absolute path to the directory where POJO should be saved
        (no trailing slash!). If "", then dump to screen.
    :return: None
    """
    # call the "package" function
    h2o.download_pojo(self, path=path)
def download_pojo():
    """Fit a GBM on the iris data and download its POJO.

    The first four columns are the predictors and column 4 is the response.
    No target path is passed to ``h2o.download_pojo``.
    """
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # The print statement is a SyntaxError on Python 3; the single-argument
    # call form prints the same text on Python 2.
    print("iris:")
    iris.show()

    m = h2o.gbm(x=iris[:4], y=iris[4])
    h2o.download_pojo(m)
def download_pojo(self, path=""):
    """
    Hand this model to the package-level ``h2o.download_pojo`` so its scoring
    POJO is downloaded.

    :param path: An absolute path to the directory where POJO should be saved
        (no trailing slash!). If "", then dump to screen.
    :return: None
    """
    # call the "package" function
    h2o.download_pojo(self, path=path)
def glm_export():
    """Train a binomial GLM on benign.csv and export it.

    Column 3 is the response; columns 0-10 except 3 are the predictors. Both
    the POJO and the MOJO are downloaded into RESULT_DIR.
    """
    print("###### GLM ######")
    benign = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))

    response_idx = 3
    predictor_idx = [col for col in range(11) if col != response_idx]

    glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    glm.train(x=predictor_idx, y=response_idx, training_frame=benign)

    h2o.download_pojo(glm, path=RESULT_DIR)
    glm.download_mojo(path=RESULT_DIR)
def download_pojo(ip, port):
    """Connect to H2O, fit a GBM on the iris data and download its POJO.

    :param ip: passed to ``h2o.init`` as the first argument.
    :param port: passed to ``h2o.init`` as the second argument.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # The print statement is a SyntaxError on Python 3; the single-argument
    # call form prints the same text on Python 2.
    print("iris:")
    iris.show()

    m = h2o.gbm(x=iris[:4], y=iris[4])
    h2o.download_pojo(m)
def download_pojo(ip, port):
    """Connect to H2O, fit a GBM on the iris data and download its POJO.

    :param ip: passed to ``h2o.init`` as the first argument.
    :param port: passed to ``h2o.init`` as the second argument.
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))

    # Written as a call so the block parses on Python 3; with one argument the
    # output on Python 2 is unchanged.
    print("iris:")
    iris.show()

    m = h2o.gbm(x=iris[:4], y=iris[4])
    h2o.download_pojo(m)
def download_pojo():
    """Fit a GBM on the iris data and download its POJO.

    The first four columns are the predictors and column 4 is the response.
    No target path is passed to ``h2o.download_pojo``.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # Written as a call so the block parses on Python 3; with one argument the
    # output on Python 2 is unchanged.
    print("iris:")
    iris.show()

    m = h2o.gbm(x=iris[:4], y=iris[4])
    h2o.download_pojo(m)
def javapredict(algo, train, test, x, y, **kwargs):
    """Compare H2O predictions with predictions from the downloaded Java POJO.

    Builds a model, downloads its POJO into ``../results/<model id>`` relative
    to this file, scores ``test`` in H2O, compiles and runs the POJO through
    ``hex.genmodel.tools.PredictCsv``, and asserts both prediction sets agree.

    :param algo: ``"gbm"`` or ``"random_forest"``.
    :param train: training frame, indexed by ``x`` and ``y``.
    :param test: frame to score; ``test[x]`` is exported as the POJO input.
    :param x: predictor selector.
    :param y: response selector.
    :param kwargs: forwarded to the model builder.
    :raises ValueError: if ``algo`` is not supported.
    :raises AssertionError: if dimensions or predictions differ.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    else:
        # ``raise(ValueError, "...")`` raised a tuple: the message was lost on
        # Python 2 and it is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.realpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.makedirs(tmpdir)
    h2o.download_pojo(model, path=tmpdir)

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    h2o.download_csv(predictions, os.path.join(tmpdir, "out_h2o.csv"))

    print("Setting up for Java POJO")
    in_csv_path = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv_path)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    with open(in_csv_path, 'r+') as f:
        in_csv = f.read()
        in_csv = re.sub('\"', '', in_csv)
        f.seek(0)
        f.write(in_csv)
        f.truncate()

    genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    subprocess.call(["javac", "-cp", genmodel_jar,
                     "-J-Xmx4g", "-J-XX:MaxPermSize=256m",
                     os.path.join(tmpdir, model._id + ".java")],
                    stderr=subprocess.STDOUT)
    # os.pathsep is ":" on POSIX and ";" on Windows, so the classpath is valid on both.
    subprocess.call(["java", "-ea", "-cp", genmodel_jar + os.pathsep + tmpdir,
                     "-Xmx4g", "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
                     "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                     "--input", in_csv_path,
                     "--output", os.path.join(tmpdir, "out_pojo.csv")],
                    stderr=subprocess.STDOUT)
    predictions2 = h2o.import_file(os.path.join(tmpdir, "out_pojo.csv"))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if algo == "gbm":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif algo == "random_forest":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("algo {0} is not supported".format(algo))
def gbm_export():
    """Train a bernoulli GBM on prostate_train.csv and export it.

    "CAPSULE" is converted to a factor and used as the response; columns 1
    onward are passed as predictors. Both the POJO and the MOJO are downloaded
    into RESULT_DIR.
    """
    print("###### GBM ######")
    data_path = pyunit_utils.locate("smalldata/logreg/prostate_train.csv")
    prostate = h2o.import_file(path=data_path)
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()

    gbm = H2OGradientBoostingEstimator(ntrees=100, learn_rate=0.1, max_depth=5, min_rows=10,
                                       distribution="bernoulli")
    predictor_idx = list(range(1, prostate.ncol))
    gbm.train(x=predictor_idx, y="CAPSULE", training_frame=prostate)

    h2o.download_pojo(gbm, path=RESULT_DIR)
    gbm.download_mojo(path=RESULT_DIR)
def download_pojo(self, path=""):
    """
    Download this model's POJO via ``h2o.download_pojo``.

    Trailing "/" characters are stripped from ``path`` before it is passed on.

    :param path: An absolute path to the directory where POJO should be saved.
        If "", then dump to screen.
    :returns: None
    """
    target_dir = path.rstrip("/")
    h2o.download_pojo(self, target_dir)
def save_model(self, tmp_dir):
    """
    Saves the model by downloading its POJO together with the JAR file.

    Saving stays best-effort: a failure is logged with its traceback and is
    not propagated to the caller.

    :param tmp_dir: directory passed to ``h2o.download_pojo`` as ``path``.
    :return: None
    """
    # Imported here because this block cannot rely on a module-level logger.
    import logging

    # Downloading the pojo and the JAR file
    try:
        h2o.download_pojo(self._model, path=tmp_dir, get_jar=True)
    except Exception:
        # Previously ``pass``: a failed save left no trace at all.
        logging.getLogger(__name__).exception('Failed to save the model')
def write_model_pojo(model):
    """
    Download the model's POJO into the ``build`` directory.

    The directory is relative to the current working directory and is created
    when it does not exist yet.

    :param model: trained model
    :return: None
    """
    # Relative path from code dir
    build_dir = "build"
    already_present = os.path.exists(build_dir)
    if not already_present:
        os.makedirs(build_dir)
    h2o.download_pojo(model, path=build_dir)
def download_pojo(self, path="", get_jar=True):
    """Download the POJO of the fitted ``best_estimator_``.

    This method is injected at runtime if the ``best_estimator_`` is an
    instance of an ``H2OEstimator``. An ``H2OEstimator`` is passed to
    ``h2o.download_pojo``; anything else is asked to download its own POJO
    through its ``download_pojo`` method.

    Parameters
    ----------
    path : string, optional (default="")
        Path to folder in which to save the POJO.

    get_jar : bool, optional (default=True)
        Whether to get the jar from the POJO.

    Returns
    -------
    None or string
        Returns None if ``path`` is "" else, the filepath
        where the POJO was saved.
    """
    estimator = self.best_estimator_
    if not isinstance(estimator, H2OEstimator):
        return estimator.download_pojo(path=path, get_jar=get_jar)
    return h2o.download_pojo(estimator, path=path, get_jar=get_jar)
def prostate_pojo_import():
    """Download a GBM's POJO, import it back, and compare it with the original.

    A default GBM is trained on the prostate data (without the "ID" column,
    "CAPSULE" as a factor response). The downloaded POJO is imported through
    ``h2o.import_mojo``; predictions and the partial dependence on "AGE" must
    equal those of the original model.
    """
    data = h2o.import_file(path=pyunit_utils.locate("smalldata/logreg/prostate.csv")).drop("ID")
    data['CAPSULE'] = data['CAPSULE'].asfactor()

    gbm = H2OGradientBoostingEstimator()
    gbm.train(y="CAPSULE", training_frame=data)

    results_dir = pyunit_utils.locate("results")
    pojo_path = h2o.download_pojo(gbm, path=results_dir)

    reimported = h2o.import_mojo(pojo_path)
    print(reimported)

    # 1. check scoring
    expected_preds = gbm.predict(data)
    actual_preds = reimported.predict(data)
    assert_frame_equal(expected_preds.as_data_frame(), actual_preds.as_data_frame())

    # 2. check we can get PDPs
    pdp_options = dict(data=data, cols=['AGE'], server=True, plot=False)
    expected_pdp = gbm.partial_plot(**pdp_options)
    actual_pdp = reimported.partial_plot(**pdp_options)
    assert_frame_equal(expected_pdp[0].as_data_frame(), actual_pdp[0].as_data_frame())
def h2odownload_pojo():
    """
    Python API test: h2o.download_pojo(model, path=u'', get_jar=True)

    Trains a binomial GLM on the prostate data and downloads its POJO. When
    the "results" directory can be located, the POJO is saved there and the
    presence of h2o-genmodel.jar is asserted; otherwise the POJO is downloaded
    without a target path.

    Copied from glm_download_pojo.py

    :raises AssertionError: if the genmodel jar is missing after the download.
    """
    h2o_df = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
    binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
    binomial_fit.train(y="CAPSULE", x=["AGE", "RACE", "PSA", "GLEASON"], training_frame=h2o_df)

    # Only a failed directory lookup may trigger the fallback. The old bare
    # ``except:`` also swallowed the AssertionError raised by the jar check
    # below (and KeyboardInterrupt/SystemExit), so the check could never fail.
    try:
        results_dir = pyunit_utils.locate("results")  # find directory path to results folder
    except Exception:
        results_dir = None

    if results_dir is None:
        # just print pojo to screen if directory does not exist
        h2o.download_pojo(binomial_fit)
        return

    h2o.download_pojo(binomial_fit, path=results_dir)
    assert os.path.isfile(os.path.join(results_dir, "h2o-genmodel.jar")), \
        "h2o.download_pojo() command is not working."
def train(cfg):
    """Train a GBM on tf-idf features and save the vectorizer and the POJO.

    :param cfg: configuration object; this function reads ``cfg.datafile``,
        ``cfg.verbose`` and ``cfg.models_dir``.
    """
    # Load data
    messages = load_data(cfg.datafile)
    # Prepare tf-idf to feature vectorization and also transform input data
    (vectorizer, tfidf_matrix) = tf_idf(messages['message'])

    h2o.init()
    # Column 0 is the label; the remaining columns are the tf-idf features.
    feature_table = np.column_stack((messages['label'], tfidf_matrix.toarray()))
    column_names = ['label'] + vectorizer.get_feature_names()
    train_table = h2o.H2OFrame(feature_table).set_names(column_names)

    gbm_model = H2OGradientBoostingEstimator(ntrees=10,
                                             learn_rate=0.01,
                                             max_depth=6,
                                             min_rows=10,
                                             distribution="bernoulli")
    # list(...) keeps the predictor indices a real list on Python 3 as well.
    predictor_idx = list(range(1, train_table.shape[1]))
    gbm_model.train(x=predictor_idx, y=0, training_frame=train_table)
    if cfg.verbose:
        # The print statement is a SyntaxError on Python 3.
        print("GBM Model", gbm_model)

    # Save models
    if not os.path.exists(cfg.models_dir):
        os.makedirs(cfg.models_dir)
    # Save Tf-Idf model
    saveModel(vectorizer, '{}/vectorizer.pickle'.format(cfg.models_dir))
    h2o.download_pojo(gbm_model, "{}/".format(cfg.models_dir))
    h2o.shutdown()
def deeplearning_export():
    """Train a deep-learning model on cars_20mpg.csv and export it.

    One of three response columns is picked at random: "economy_20mpg" or
    "cylinders" (both converted to factors) or "economy" (left as is). The
    number of cross-validation folds is random as well. Both the POJO and the
    MOJO are downloaded into RESULT_DIR.
    """
    print("###### DEEPLEARNING ######")
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    problem = random.sample(list(range(3)), 1)[0]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]

    # Problems 1 and 2 use a categorical response; anything else uses "economy".
    categorical_responses = {1: "economy_20mpg", 2: "cylinders"}
    response_col = categorical_responses.get(problem, "economy")
    if problem in categorical_responses:
        cars[response_col] = cars[response_col].asfactor()
    print("Response column: {0}".format(response_col))

    fold_count = random.randint(3, 10)
    net = H2ODeepLearningEstimator(nfolds=fold_count,
                                   fold_assignment="Modulo",
                                   hidden=[20, 20],
                                   epochs=10)
    net.train(x=predictors, y=response_col, training_frame=cars)

    h2o.download_pojo(net, path=RESULT_DIR)
    net.download_mojo(path=RESULT_DIR)
def download_pojo():
    """Check that h2o.download_pojo writes the POJO with and without a trailing slash.

    A one-tree GBM is trained on iris for each variant of the export
    directory. After the download the directory must exist and contain
    ``<model_id>.java``.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # First: compensate slash at the end; second: slash present at the end.
    for dir_suffix in ("/downloadable_pojo", "/downloadable_pojo/"):
        gbm = H2OGradientBoostingEstimator(ntrees=1)
        gbm.train(x=list(range(4)), y=4, training_frame=iris)

        export_dir = pyunit_utils.locate("results") + dir_suffix
        h2o.download_pojo(model=gbm, path=export_dir)

        assert os.path.isdir(export_dir)
        pojo_source = os.path.join(export_dir, gbm.model_id + '.java')
        assert os.path.exists(pojo_source)
def download_pojo(self, path="", get_genmodel_jar=False, genmodel_name=""):
    """
    Download the POJO for the leader model in AutoML to the directory specified by path.

    Delegates to ``h2o.download_pojo`` with ``self.leader`` as the model.
    If path is an empty string, then dump the output to screen.

    :param path: An absolute path to the directory where POJO should be saved.
    :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
    :param genmodel_name: Custom name of genmodel jar
    :returns: name of the POJO file written.
    """
    leader_model = self.leader
    return h2o.download_pojo(leader_model, path,
                             get_jar=get_genmodel_jar,
                             jar_name=genmodel_name)
def download_pojo(self, path="", get_genmodel_jar=False, genmodel_name=""):
    """
    Download the POJO for the leader model in AutoML to the directory specified by path.

    The work is done by ``h2o.download_pojo``, called with ``self.leader``.
    If path is an empty string, then dump the output to screen.

    :param path: An absolute path to the directory where POJO should be saved.
    :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
    :param genmodel_name: Custom name of genmodel jar
    :returns: name of the POJO file written.
    """
    leader_model = self.leader
    pojo_file = h2o.download_pojo(leader_model, path,
                                  get_jar=get_genmodel_jar,
                                  jar_name=genmodel_name)
    return pojo_file
def download_pojo(self, path="", get_genmodel_jar=False):
    """
    Download the POJO for this model to the directory specified by path.

    Both arguments are type-checked with ``assert_is_type`` and trailing "/"
    characters are stripped from ``path`` before delegating to
    ``h2o.download_pojo``. If path is "", then dump to screen.

    :param path: An absolute path to the directory where POJO should be saved.
    :param get_genmodel_jar: forwarded to ``h2o.download_pojo`` as ``get_jar``.

    :returns: name of the POJO file written.
    """
    assert_is_type(path, str)
    assert_is_type(get_genmodel_jar, bool)
    target_dir = path.rstrip("/")
    return h2o.download_pojo(self, target_dir, get_jar=get_genmodel_jar)
def test_zipped_rf_model():
    """
    Test the correctness of the "zipped" model format.

    This test will create a random dataset, split into training/testing part, train a DRF model on it,
    download the model's data, score the model remotely and fetch the predictions, score the model locally by
    running the genmodel jar, and finally compare the prediction results.

    The same procedure is run for both DRF and GBM and for binomial, multinomial and regression problems.
    The POJO is additionally compiled and scored unless its source exceeds the size limit, in which case the
    locally scored predictions stand in for the POJO predictions in the final comparison.
    """
    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    if sys.platform == "win32":
        target_dir = tempfile.mkdtemp()
    else:
        target_dir = os.path.expanduser("~/Downloads/")

    report = []
    for estimator in [H2ORandomForestEstimator, H2OGradientBoostingEstimator]:
        print(colorama.Fore.LIGHTYELLOW_EX + "\n#================================================")
        print("# Estimator: " + estimator.__name__)
        print("#================================================\n" + colorama.Fore.RESET)
        estimator_name = "GBM" if estimator == H2OGradientBoostingEstimator else "DRF"
        for problem in ["binomial", "multinomial", "regression"]:
            print("========================")
            print("%s problem" % problem.capitalize())
            print("========================")
            df = random_dataset(problem, verbose=False)
            print("Created dataset with %d rows x %d columns" % (df.nrow, df.ncol))
            test = df[:NTESTROWS, :]
            train = df[NTESTROWS:, :]
            test2 = test.rbind(test)

            time0 = time.time()
            print("\n\nTraining Random Forest model...")
            model = estimator(ntrees=NTREES, max_depth=DEPTH)
            model.train(training_frame=train)
            print(model.summary())
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nSaving the model...")
            time0 = time.time()
            model_file = h2o.api("GET /3/Models/%s/data" % model.model_id, save_to=target_dir)
            print(" => %s (%d bytes)" % (model_file, os.stat(model_file).st_size))
            assert os.path.exists(model_file)
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading POJO...")
            time0 = time.time()
            pojo_file = h2o.download_pojo(model, target_dir, get_jar=False)
            pojo_size = os.stat(pojo_file).st_size
            pojo_name = os.path.splitext(os.path.basename(pojo_file))[0]
            print(" => %s (%d bytes)" % (pojo_file, pojo_size))
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading the test datasets for local use: ", end="")
            time0 = time.time()
            test_file = os.path.join(target_dir, "test_%s.csv" % test.frame_id)
            test2_file = os.path.join(target_dir, "test2_%s.csv" % test2.frame_id)
            print(test_file)
            h2o.download_csv(test, test_file)
            h2o.download_csv(test2, test2_file)
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nScoring the model remotely and downloading to file ", end="")
            times = [time.time()]
            h2o_pred_file = os.path.join(target_dir, "predR_%s.csv" % test.frame_id)
            h2o_pred_file2 = os.path.join(target_dir, "predR_%s.csv" % test2.frame_id)
            print(h2o_pred_file)
            for testframe, outfile in [(test, h2o_pred_file), (test2, h2o_pred_file2)]:
                predictions = model.predict(testframe)
                h2o.download_csv(predictions, outfile)
                times.append(time.time())
            print("Time taken = %.3fs (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Server", times[1] - times[0], times[2] - times[1]))

            print("\nScoring the model locally and saving to file ", end="")
            times = [time.time()]
            local_pred_file = os.path.join(target_dir, "predL_%s.csv" % test.frame_id)
            local_pred_file2 = os.path.join(target_dir, "predL_%s.csv" % test2.frame_id)
            print(local_pred_file)
            for inpfile, outfile in [(test_file, local_pred_file), (test2_file, local_pred_file2)]:
                load_csv(inpfile)
                ret = subprocess.call(["java", "-cp", genmodel_jar,
                                       "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m",
                                       "hex.genmodel.tools.PredictCsv",
                                       "--input", inpfile, "--output", outfile, "--model", model_file,
                                       "--decimal"])
                assert ret == 0, "GenModel finished with return code %d" % ret
                times.append(time.time())
            print("Time taken = %.3fs (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Zipped", times[1] - times[0], times[2] - times[1]))

            # Reset for every model: when the POJO is too large to compile, the
            # comparison below must fall back to the local predictions instead
            # of hitting an unbound name or reusing the previous model's file.
            pojo_pred_file = None
            if pojo_size <= 1000 << 20:  # 1000 Mb
                time0 = time.time()
                print("\nCompiling Java Pojo")
                javac_cmd = ["javac", "-cp", genmodel_jar, "-J-Xmx12g", pojo_file]
                subprocess.check_call(javac_cmd)
                print("Time taken = %.3fs" % (time.time() - time0))

                pojo_pred_file = os.path.join(target_dir, "predP_%s.csv" % test.frame_id)
                pojo_pred_file2 = os.path.join(target_dir, "predP_%s.csv" % test2.frame_id)
                print("Scoring POJO and saving to file %s" % pojo_pred_file)
                times = [time.time()]
                cp_sep = ";" if sys.platform == "win32" else ":"
                for inpfile, outfile in [(test_file, pojo_pred_file), (test2_file, pojo_pred_file2)]:
                    load_csv(inpfile)
                    java_cmd = ["java", "-cp", cp_sep.join([genmodel_jar, target_dir]),
                                "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=256m",
                                "hex.genmodel.tools.PredictCsv",
                                "--pojo", pojo_name, "--input", inpfile, "--output", outfile, "--decimal"]
                    ret = subprocess.call(java_cmd)
                    assert ret == 0, "GenModel finished with return code %d" % ret
                    times.append(time.time())
                print("Time taken = %.3fs (1st run: %.3f, 2nd run: %.3f)" %
                      (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
                report.append((estimator_name, problem, "POJO", times[1] - times[0], times[2] - times[1]))

            print("\nChecking whether the predictions coincide...")
            time0 = time.time()
            local_pred = load_csv(local_pred_file)
            server_pred = load_csv(h2o_pred_file)
            pojo_pred = load_csv(pojo_pred_file) if pojo_pred_file else local_pred
            assert len(local_pred) == len(server_pred) == len(pojo_pred) == test.nrow, \
                "Number of rows in prediction files do not match: %d vs %d vs %d vs %d" % \
                (len(local_pred), len(server_pred), len(pojo_pred), test.nrow)
            for i in range(test.nrow):
                lpred = local_pred[i]
                rpred = server_pred[i]
                ppred = pojo_pred[i]
                assert type(lpred) == type(rpred) == type(ppred), \
                    "Types of predictions do not match: %r / %r / %r" % (lpred, rpred, ppred)
                if isinstance(lpred, float):
                    same = abs(lpred - rpred) + abs(lpred - ppred) < 1e-8
                else:
                    same = lpred == rpred == ppred
                assert same, \
                    "Predictions are different for row %d: local=%r, pojo=%r, bomo=%r" % (i + 1, lpred, ppred, rpred)
            print("Time taken = %.3fs" % (time.time() - time0))
            print(colorama.Fore.LIGHTGREEN_EX + "\nPredictions match!\n" + colorama.Fore.RESET)

    print(colorama.Fore.LIGHTYELLOW_EX + "\n\n#================================================")
    print("# Timing report")
    print("#================================================\n" + colorama.Fore.RESET)
    print(tabulate.tabulate(report,
                            headers=["Model", "Problem type", "Scorer", "10000 rows", "20000 rows"],
                            floatfmt=".3f"), end="\n\n\n")
# Predictors: every column of p_filter except the four removed below.
training_columns = list(p_filter.columns)
for excluded_column in ('UnitNumber', 'Time', 'RUL', 'BIN'):
    training_columns.remove(excluded_column)

# 'BIN' is the response; convert it to a factor in both frames.
h_filter['BIN'] = h_filter['BIN'].asfactor()
h_test['BIN'] = h_test['BIN'].asfactor()

h2o.export_file(frame=h_test, path='test_sid.csv', force=True)
h2o.export_file(frame=h_filter, path='train_sid.csv', force=True)

# The keyword was misspelled "ntress"; the estimator's tree-count parameter is "ntrees".
model = H2ORandomForestEstimator(nbins=250, ntrees=100, max_depth=50, nfolds=10)
model.train(x=training_columns, y='BIN', training_frame=h_filter)

predict = model.predict(test_data=h_test)
predict = DataFrameParser.h2oToList(predict['predict'])
actual = DataFrameParser.h2oToList(h_test['BIN'])

Measures.confusion_matrix(actual, predict)
print(predict)
print(actual)

# NOTE(review): the target path is machine-specific - confirm before running elsewhere.
h2o.download_pojo(model=model, path="/home/wso2123/PycharmProjects/FeatureProcessor/", get_jar=True)
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Compare H2O predictions with predictions from the downloaded Java POJO.

    Builds a model, downloads its POJO and h2o-genmodel.jar into
    ``../results/<model id>`` relative to this file, scores ``test`` in H2O,
    compiles the POJO, runs ``hex.genmodel.tools.PredictCsv`` on the exported
    predictors, and asserts that both prediction sets agree.

    :param algo: ``"gbm"``, ``"random_forest"``, ``"deeplearning"`` or ``"glm"``.
    :param equality: ``"numeric"`` (compare within 1e-4) or ``"class"``
        (compare for exact equality).
    :param train: training frame, indexed by ``x`` and ``y``.
    :param test: frame to score; ``test[x]`` is exported as the POJO input.
    :param x: predictor selector.
    :param y: response selector.
    :param kwargs: forwarded to the model builder.
    :raises ValueError: if ``algo`` or ``equality`` is not supported.
    :raises AssertionError: if an expected file is missing or predictions differ.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        # ``raise(ValueError, "...")`` raised a tuple: the message was lost on
        # Python 2 and it is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    with open(in_csv, 'r+') as f:
        csv = f.read()
        csv = re.sub('\"', '', csv)
        f.seek(0)
        f.write(csv)
        f.truncate()
    assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir,
                "-Xmx4g", "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
                "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                "--input", in_csv, "--output", out_pojo_csv]
    # stderr is merged into stdout, so only the first element of communicate() carries output.
    p = subprocess.Popen(java_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    o, e = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("equality type {0} is not supported".format(equality))
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Check that a downloaded Java POJO predicts the same as the H2O model.

    Steps: build the model, download the POJO and h2o-genmodel.jar into
    ``../results/<model id>`` (relative to this file), score ``test`` in H2O,
    compile the POJO, run ``hex.genmodel.tools.PredictCsv`` on the exported
    predictors, then compare dimensions and first-column values.

    :param algo: ``"gbm"``, ``"random_forest"``, ``"deeplearning"`` or ``"glm"``.
    :param equality: ``"numeric"`` (compare within 1e-4) or ``"class"``
        (compare for exact equality).
    :param train: training frame, indexed by ``x`` and ``y``.
    :param test: frame to score; ``test[x]`` is exported as the POJO input.
    :param x: predictor selector.
    :param y: response selector.
    :param kwargs: forwarded to the model builder.
    :raises ValueError: if ``algo`` or ``equality`` is not supported.
    :raises AssertionError: if an expected file is missing or predictions differ.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        # The tuple form ``raise (ValueError, "...")`` dropped the message on
        # Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "..", "results",
            model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(
        h2o_genmodel_jar
    ), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(
        java_file), "Expected file {0} to exist, but it does not.".format(
            java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(
        out_h2o_csv), "Expected file {0} to exist, but it does not.".format(
            out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    # (the context manager closes the file even if the rewrite fails)
    with open(in_csv, 'r+') as f:
        csv = f.read()
        csv = re.sub('\"', '', csv)
        f.seek(0)
        f.write(csv)
        f.truncate()
    assert os.path.exists(
        in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g",
        "-J-XX:MaxPermSize=256m", java_file
    ]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
        "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    # stderr is merged into stdout, so only ``o`` carries output.
    p = subprocess.Popen(java_cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT)
    o, e = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(
        out_pojo_csv), "Expected file {0} to exist, but it does not.".format(
            out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(
        hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(
        hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(
                hp - pp
            ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        else:
            raise ValueError(
                "equality type {0} is not supported".format(equality))
# Scratch commands, apparently captured from an interactive (IPython-style)
# session: the trailing "?" help lookups and the bare "_" are interactive-shell
# syntax, so this fragment is not a runnable plain-Python script.
type(_)

h2o.import_file?
h2o.parse_setup?

# Fit a 2-cluster K-Means model on h2o_df (defined outside this fragment).
cluster_estimator = H2OKMeansEstimator(k=2)
cluster_estimator.train?
# NOTE(review): a response column (y) is passed to a K-Means estimator here - confirm it is intended.
cluster_estimator.train(x=['size','numberofrecords'], y='FilteredFilename', training_frame=h2o_df, verbose=False)

# No target path is passed to download_pojo.
h2o.download_pojo(cluster_estimator)
cluster_estimator

#h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
#model = h2o.glm(y = "CAPSULE",
#                x = ["AGE", "RACE", "PSA", "GLEASON"],
#                training_frame = h2o_df,
#                family = "binomial")
#h2o.download_pojo(model)

#==============================================================================
# binary logistic
#==============================================================================
def exportPojo():
    """Download the POJO of ``iot.dl`` and record the threshold next to it.

    The POJO goes to ``../predictions/src/main/java/`` and a single
    ``threshold=...`` line is written to
    ``../predictions/src/main/resources/dl.properties``.
    """
    h2o.download_pojo(iot.dl, path="../predictions/src/main/java/")

    # Write the threshold to a properties file.
    # print()'s ``file`` argument must be an object with a write() method;
    # passing the path string itself raised AttributeError.
    # NOTE(review): toString and threshold are resolved from outside this
    # function - confirm both are defined in the enclosing module.
    with open('../predictions/src/main/resources/dl.properties', 'w') as properties_file:
        print("threshold=" + toString(threshold), file=properties_file)
# Load a previously saved model and export it as a POJO (plus the genmodel
# jar) into the current directory.
import h2o

# Start or connect to an H2O instance before any model operation.
h2o.init()

model = h2o.load_model("GBM_ForLoanPredict.zip")
h2o.download_pojo(model=model, path='./', get_jar=True)
def javapredict(algo, equality, train, test, x, y, compile_only=False, **kwargs):
    """Train an H2O model, export its POJO, and compare Java vs. H2O predictions.

    The model is trained in H2O, its POJO and h2o-genmodel.jar are downloaded
    into ``../results/<pojoname>`` (relative to this file) and compiled with
    ``javac``. Unless ``compile_only`` is set, ``test`` is then scored both in
    H2O and through the ``hex.genmodel.tools.PredictCsv`` Java driver, and the
    first prediction column of the two results is compared row by row.

    Args:
        algo: One of "gbm", "random_forest", "deeplearning", "glm",
            "naive_bayes", "kmeans" or "pca".
        equality: "numeric" (values must agree within 1e-4; the Java value is
            parsed with ``float.fromhex``) or "class" (values must be equal).
        train: Training frame passed to ``model.train``.
        test: Frame to score; ``test[x]`` is written out as the Java input CSV.
        x: Predictor columns.
        y: Response column (not used for "kmeans" and "pca").
        compile_only: When true, stop after the POJO compiles.
        **kwargs: Forwarded to the estimator constructor.

    Raises:
        ValueError: If ``algo`` or ``equality`` is not supported.
        AssertionError: If an expected file is missing or predictions differ.
        subprocess.CalledProcessError: If ``javac`` exits with an error.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest":
        model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning":
        model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm":
        model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes":
        model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans":
        model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca":
        model = H2OPCA(**kwargs)
    else:
        # Name the offending value so a failing test is self-explanatory.
        raise ValueError("algo {0} is not supported".format(algo))

    # k-means and PCA are trained without a response column.
    if algo == "kmeans" or algo == "pca":
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), \
        "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(java_file), \
        "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    if compile_only:
        return

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), \
        "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # Rewrite the file in place; the context manager closes the handle even if
    # the rewrite fails part-way.
    with open(in_csv, 'r+') as csv_file:
        unquoted = csv_file.read().replace('"', '')
        csv_file.seek(0)
        csv_file.write(unquoted)
        csv_file.truncate()
    assert os.path.exists(in_csv), \
        "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    # The Java classpath separator differs between Windows and other platforms.
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g",
                "-XX:MaxPermSize=2g", "-XX:ReservedCodeCacheSize=256m",
                "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
                "--input", in_csv, "--output", out_pojo_csv]
    # stderr is merged into stdout, so only the first element carries output.
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, _ = p.communicate()
    if isinstance(o, bytes) and not isinstance(o, str):
        # Python 3 returns bytes; decode so the log is not printed as b'...'.
        o = o.decode("utf-8", "replace")
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), \
        "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.upload_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value: only the first prediction column is compared.
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, \
                "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("equality type {0} is not supported".format(equality))
def javapredict(algo, equality, train, test, x, y, compile_only=False, **kwargs):
    """Train an H2O model, export its POJO, and compare Java vs. H2O predictions.

    The model is trained in H2O, its POJO and h2o-genmodel.jar are downloaded
    into ``../results/<pojoname>`` (relative to this file) and compiled with
    ``javac``. Unless ``compile_only`` is set, ``test`` is then scored both in
    H2O and through the ``hex.genmodel.tools.PredictCsv`` Java driver, and the
    first prediction column of the two results is compared row by row.

    Args:
        algo: One of "gbm", "random_forest", "deeplearning", "glm",
            "naive_bayes", "kmeans" or "pca".
        equality: "numeric" (values must agree within 1e-4; the Java value is
            parsed with ``float.fromhex``) or "class" (values must be equal).
        train: Training frame passed to ``model.train``.
        test: Frame to score; ``test[x]`` is written out as the Java input CSV.
        x: Predictor columns.
        y: Response column (not used for "kmeans" and "pca").
        compile_only: When true, stop after the POJO compiles.
        **kwargs: Forwarded to the estimator constructor.

    Raises:
        ValueError: If ``algo`` or ``equality`` is not supported.
        AssertionError: If an expected file is missing or predictions differ.
        subprocess.CalledProcessError: If ``javac`` exits with an error.
    """
    # Every print below takes a single argument, so the call form behaves the
    # same whether print is a statement (Python 2) or a function (Python 3).
    print("Creating model in H2O")
    if algo == "gbm":
        model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest":
        model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning":
        model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm":
        model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes":
        model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans":
        model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca":
        model = H2OPCA(**kwargs)
    else:
        # The original raised the tuple (ValueError, "..."), which loses the
        # message on Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))

    # k-means and PCA are trained without a response column.
    if algo == "kmeans" or algo == "pca":
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), \
        "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(java_file), \
        "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m",
        java_file
    ]
    subprocess.check_call(javac_cmd)

    if compile_only:
        return

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), \
        "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # Rewrite the file in place; the context manager closes the handle even if
    # the rewrite fails part-way.
    with open(in_csv, 'r+') as csv_file:
        unquoted = csv_file.read().replace('"', '')
        csv_file.seek(0)
        csv_file.write(unquoted)
        csv_file.truncate()
    assert os.path.exists(in_csv), \
        "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    # The Java classpath separator differs between Windows and other platforms.
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g",
        "-XX:MaxPermSize=2g", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    # stderr is merged into stdout, so only the first element carries output.
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    o, _ = p.communicate()
    if isinstance(o, bytes) and not isinstance(o, str):
        # Python 3 returns bytes; decode so the log is not printed as b'...'.
        o = o.decode("utf-8", "replace")
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), \
        "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.upload_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value: only the first prediction column is compared.
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, \
                "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, \
                "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            # Same tuple-raise defect as above; raise a real exception instance.
            raise ValueError("equality type {0} is not supported".format(equality))
# Script: train a GBM on the Titanic data set and export the model as a POJO
# into the 'tmp' directory.
import h2o

h2o.init()
df = h2o.import_file('data/titanic.csv')
# Convert the passenger class and the response to factor (categorical) columns.
df['pclass'] = df['pclass'].asfactor()
df['survived'] = df['survived'].asfactor()
# NOTE(review): this uses the module-level h2o.gbm() helper, whereas other
# snippets in this file build H2OGradientBoostingEstimator directly -- confirm
# that h2o.gbm exists in the h2o version this script targets.
# model_id is set explicitly to 'MyModel'.
model = h2o.gbm(y = 'survived', x = ['pclass', 'sex', 'age', 'fare'], training_frame = df, model_id = 'MyModel')
h2o.download_pojo(model, path = 'tmp')
# calculate metrics binomial_fit.model_performance(test) # remove response column inorder to test # use threshold for max than f1 newdata = test newdata['CAPSULE'] = None newpred = binomial_fit.predict(newdata) newpred # manually define threshold for predictions to 0.3 import pandas as pd pred = binomial_fit.predict(h2o_df) pred['predict'] = pred['p1'] > 0.3 # POJO visualization object h2o_df = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv") h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor() binomial_fit = H2OGeneralizedLinearEstimator(family ="binomial") binomial_fit.train(y = "CAPSULE", x = ["AGE", "RACE","PSA", "GLEASON"], training_frame = h2o_df) h2o.download_pojo(binomial_fit) # Verifying model results h2o_df = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip") model = H2OGeneralizedLinearEstimator(family = "binomial", nfolds = 5) model.train(y = "IsDepDelayed", x = ["Year", "Origin"], training_frame=h2o_df) print("full model training auc:", model.auc()) print("full model validation auc:", model.auc(xval=True)) for model_ in model.get_xval_models(): print(model_.model_id, "training auc:", model_.auc(), "validation auc:", model_.auc(valid=True))
# Script: fit a binomial GLM on the public prostate data set and request its
# POJO.
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

h2o.init()
h2o_df = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv")
# Convert the response to a factor (categorical) column before fitting the
# binomial model.
h2o_df["CAPSULE"] = h2o_df["CAPSULE"].asfactor()
binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
binomial_fit.train(y="CAPSULE", x=["AGE", "RACE", "PSA", "GLEASON"], training_frame=h2o_df)
# No path argument is given to download_pojo here.
h2o.download_pojo(binomial_fit)