def pca_export():
    """Train a 3-component PCA on USArrests and exercise both export paths.

    The POJO is downloaded into RESULT_DIR. The MOJO download is handed to
    ``expect_error`` (defined elsewhere; presumably it asserts that the call
    fails -- confirm against its definition).
    """
    print("###### PCA ######")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests = h2o.upload_file(arrests_path)
    pca = H2OPrincipalComponentAnalysisEstimator(k=3, impute_missing=True)
    # Columns 0..3 are used as the inputs.
    pca.train(x=[0, 1, 2, 3], training_frame=arrests)
    h2o.download_pojo(pca, path=RESULT_DIR)
    expect_error(pca.download_mojo, model="PCA", format='MOJO')
def k_means_export():
    """Fit a one-cluster K-Means on benign.csv and export POJO and MOJO.

    Every column of the imported frame is used as an input; both artifacts
    are written under RESULT_DIR.
    """
    print("###### K MEANS ######")
    benign_path = pyunit_utils.locate("smalldata/logreg/benign.csv")
    benign = h2o.import_file(path=benign_path)
    kmeans = H2OKMeansEstimator(k=1)
    every_column = list(range(benign.ncol))
    kmeans.train(x=every_column, training_frame=benign)
    h2o.download_pojo(kmeans, path=RESULT_DIR)
    kmeans.download_mojo(path=RESULT_DIR)
def pca_export():
    """Train a 3-component PCA on USArrests and exercise both export paths.

    The POJO is downloaded into RESULT_DIR. The MOJO download is handed to
    ``expect_error`` (defined elsewhere; presumably it asserts that the call
    fails -- confirm against its definition).
    """
    print("###### PCA ######")
    arrests_path = pyunit_utils.locate("smalldata/pca_test/USArrests.csv")
    arrests = h2o.upload_file(arrests_path)
    pca = H2OPrincipalComponentAnalysisEstimator(k=3, impute_missing=True)
    # Columns 0..3 are used as the inputs.
    pca.train(x=[0, 1, 2, 3], training_frame=arrests)
    h2o.download_pojo(pca, path=RESULT_DIR)
    expect_error(pca.download_mojo, model="PCA", format='MOJO')
def naive_bayes_export():
    """Train Naive Bayes (laplace=0.25) on iris and exercise both export paths.

    Columns 0..3 are the predictors and column 4 the response. The POJO is
    downloaded into RESULT_DIR; the MOJO download is handed to
    ``expect_error`` (defined elsewhere; presumably it asserts that the call
    fails -- confirm against its definition).
    """
    print("###### NAIVE BAYES ######")
    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    iris = h2o.upload_file(iris_path)
    bayes = H2ONaiveBayesEstimator(laplace=0.25)
    bayes.train(x=[0, 1, 2, 3], y=4, training_frame=iris)
    h2o.download_pojo(bayes, path=RESULT_DIR)
    expect_error(bayes.download_mojo, model="Naive Bayes", format='MOJO')
def naive_bayes_export():
    """Train Naive Bayes (laplace=0.25) on iris and exercise both export paths.

    Columns 0..3 are the predictors and column 4 the response. The POJO is
    downloaded into RESULT_DIR; the MOJO download is handed to
    ``expect_error`` (defined elsewhere; presumably it asserts that the call
    fails -- confirm against its definition).
    """
    print("###### NAIVE BAYES ######")
    iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
    iris = h2o.upload_file(iris_path)
    bayes = H2ONaiveBayesEstimator(laplace=0.25)
    bayes.train(x=[0, 1, 2, 3], y=4, training_frame=iris)
    h2o.download_pojo(bayes, path=RESULT_DIR)
    expect_error(bayes.download_mojo, model="Naive Bayes", format='MOJO')
def k_means_export():
    """Fit a one-cluster K-Means on benign.csv and export POJO and MOJO.

    Every column of the imported frame is used as an input; both artifacts
    are written under RESULT_DIR.
    """
    print("###### K MEANS ######")
    benign_path = pyunit_utils.locate("smalldata/logreg/benign.csv")
    benign = h2o.import_file(path=benign_path)
    kmeans = H2OKMeansEstimator(k=1)
    every_column = list(range(benign.ncol))
    kmeans.train(x=every_column, training_frame=benign)
    h2o.download_pojo(kmeans, path=RESULT_DIR)
    kmeans.download_mojo(path=RESULT_DIR)
def download_pojo():
  """Train a default GBM on iris and call ``h2o.download_pojo`` without a path.

  Columns 0..3 are the predictors, column 4 is the response.
  """
  iris_path = pyunit_utils.locate("smalldata/iris/iris_wheader.csv")
  iris = h2o.import_file(path=iris_path)
  print("iris:")
  iris.show()
  gbm = H2OGradientBoostingEstimator()
  gbm.train(x=[0, 1, 2, 3], y=4, training_frame=iris)
  h2o.download_pojo(gbm)
Beispiel #8
0
def h2odownload_pojo():
    """
    Python API test: h2o.download_pojo(model, path=u'', get_jar=True)

    Copied from glm_download_pojo.py

    Trains a binomial GLM on the prostate data, downloads its POJO into the
    "results" directory and checks that ``h2o-genmodel.jar`` was written
    there. If locating the directory or the download itself raises, the POJO
    is printed to screen instead. Any other failure is reported as an
    assertion failure.
    """
    try:
        h2o_df = h2o.import_file(
            pyunit_utils.locate("smalldata/prostate/prostate.csv"))
        h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
        binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
        binomial_fit.train(y="CAPSULE",
                           x=["AGE", "RACE", "PSA", "GLEASON"],
                           training_frame=h2o_df)
        try:
            results_dir = pyunit_utils.locate(
                "results")  # find directory path to results folder
            h2o.download_pojo(binomial_fit, path=results_dir)
            assert os.path.isfile(os.path.join(results_dir, "h2o-genmodel.jar")), "h2o.download_pojo() " \
                                                                                  "command is not working."
        except AssertionError:
            # The jar check is the point of this test; a bare ``except``
            # used to swallow it, so the check could never fail.
            raise
        except Exception:
            h2o.download_pojo(
                binomial_fit
            )  # just print pojo to screen if directory does not exist
    except Exception:
        # AssertionError is an Exception, so a failed jar check also ends up
        # here and is reported with the same message.
        assert False, "h2o.download_pojo() command is not working."
def train_model(train, test, feature_col, target_col, model_type, outdir):
    """
    Train a LR model or GBM model and save it as a POJO.

    :param train: training frame; its target column is converted to a factor in place
    :param test: validation frame; its target column is converted to a factor in place
    :param feature_col: predictor column(s), passed as ``x`` to ``model.train``
    :param target_col: name of the response column
    :param model_type: "lr" for a binomial GLM, "gbm" for a gradient boosting model
    :param outdir: directory handed to ``h2o.download_pojo``
    :return: tuple ``(model, train)``
    :raises ValueError: if ``model_type`` is neither "lr" nor "gbm"
    """
    train[target_col] = train[target_col].asfactor()
    test[target_col] = test[target_col].asfactor()

    if model_type == "lr":
        model = H2OGeneralizedLinearEstimator(
            model_id='titanic_model',
            family='binomial',
            seed=1234)
    elif model_type == "gbm":
        model = H2OGradientBoostingEstimator(
            model_id='titanic_model')
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('specify model type: lr or gbm')

    model.train(x=feature_col, y=target_col, training_frame=train, validation_frame=test, model_id='titanic_model')

    # save pojo
    h2o.download_pojo(model, outdir)

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(model)
    return model, train
def drf_export():
    """Train a tiny random forest on bigcat_5000x2.csv and export POJO and MOJO.

    The response "y" is converted to a factor before training; "X" is the
    only predictor. Both artifacts are written under RESULT_DIR.
    """
    print("###### DRF ######")
    bigcat_path = pyunit_utils.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    bigcat = h2o.import_file(path=bigcat_path)
    bigcat["y"] = bigcat["y"].asfactor()
    forest_params = dict(ntrees=1, max_depth=1, nbins=100, nbins_cats=10)
    forest = H2ORandomForestEstimator(**forest_params)
    forest.train(x="X", y="y", training_frame=bigcat)
    h2o.download_pojo(forest, path=RESULT_DIR)
    forest.download_mojo(path=RESULT_DIR)
def hexdev_422():
    """Train a random forest on the z_repro data set and download its POJO.

    Column 0 is converted to a factor and used as the response; all remaining
    columns are the predictors.
    """
    repro_path = h2o.locate("bigdata/laptop/jira/z_repro.csv.gz")
    fr = h2o.import_file(repro_path)
    fr[0] = fr[0].asfactor()

    predictors = fr[1:fr.ncol]
    response = fr[0]
    forest = h2o.random_forest(x=predictors, y=response, min_rows=1, ntrees=25, max_depth=45)

    h2o.download_pojo(forest)
def drf_export():
    """Train a tiny random forest on bigcat_5000x2.csv and export POJO and MOJO.

    The response "y" is converted to a factor before training; "X" is the
    only predictor. Both artifacts are written under RESULT_DIR.
    """
    print("###### DRF ######")
    bigcat_path = pyunit_utils.locate("smalldata/gbm_test/bigcat_5000x2.csv")
    bigcat = h2o.import_file(path=bigcat_path)
    bigcat["y"] = bigcat["y"].asfactor()
    forest_params = dict(ntrees=1, max_depth=1, nbins=100, nbins_cats=10)
    forest = H2ORandomForestEstimator(**forest_params)
    forest.train(x="X", y="y", training_frame=bigcat)
    h2o.download_pojo(forest, path=RESULT_DIR)
    forest.download_mojo(path=RESULT_DIR)
 def download_pojo(self, path=""):
     """
     Download this model's scoring POJO by delegating to ``h2o.download_pojo``.

     :param path: An absolute path to the directory where the POJO should be
         saved (no trailing slash!). If "", the POJO is dumped to screen.
     :return: None
     """
     # Forward to the package-level function with this model as the subject.
     h2o.download_pojo(self, path=path)
def download_pojo():
    """Train a GBM on iris with ``h2o.gbm`` and call ``h2o.download_pojo`` on it.

    The first four columns are the predictors and column 4 is the response.
    No path is passed to ``h2o.download_pojo``.
    """
    iris = h2o.import_file(
        path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
    # Single-argument print() works on both Python 2 and Python 3; the
    # original print statement is a SyntaxError on Python 3.
    print("iris:")
    iris.show()

    m = h2o.gbm(x=iris[:4], y=iris[4])
    h2o.download_pojo(m)
Beispiel #15
0
 def download_pojo(self, path=""):
   """
   Download this model's scoring POJO by delegating to ``h2o.download_pojo``.

   :param path: An absolute path to the directory where the POJO should be
       saved (no trailing slash!). If "", the POJO is dumped to screen.
   :return: None
   """
   # Forward to the package-level function with this model as the subject.
   h2o.download_pojo(self, path=path)
def glm_export():
    """Train a binomial GLM on benign.csv and export POJO and MOJO.

    Column 3 is the response; columns 0..10 except 3 are the predictors.
    Both artifacts are written under RESULT_DIR.
    """
    print("###### GLM ######")
    benign = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    response_idx = 3
    predictor_idxs = [idx for idx in range(11) if idx != response_idx]
    glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    glm.train(x=predictor_idxs, y=response_idx, training_frame=benign)
    h2o.download_pojo(glm, path=RESULT_DIR)
    glm.download_mojo(path=RESULT_DIR)
def glm_export():
    """Train a binomial GLM on benign.csv and export POJO and MOJO.

    Column 3 is the response; columns 0..10 except 3 are the predictors.
    Both artifacts are written under RESULT_DIR.
    """
    print("###### GLM ######")
    benign = h2o.import_file(pyunit_utils.locate("smalldata/logreg/benign.csv"))
    response_idx = 3
    predictor_idxs = [idx for idx in range(11) if idx != response_idx]
    glm = H2OGeneralizedLinearEstimator(family="binomial", alpha=0, Lambda=1e-5)
    glm.train(x=predictor_idxs, y=response_idx, training_frame=benign)
    h2o.download_pojo(glm, path=RESULT_DIR)
    glm.download_mojo(path=RESULT_DIR)
Beispiel #18
0
def download_pojo(ip, port):
    """Connect to H2O, train a GBM on iris and call ``h2o.download_pojo`` on it.

    :param ip: address passed to ``h2o.init``
    :param port: port passed to ``h2o.init``
    """
    # Connect to h2o
    h2o.init(ip, port)

    iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
    # Single-argument print() works on both Python 2 and Python 3; the
    # original print statement is a SyntaxError on Python 3.
    print("iris:")
    iris.show()

    m = h2o.gbm(x=iris[:4], y=iris[4])
    h2o.download_pojo(m)
def download_pojo(ip, port):
  """Connect to H2O, train a GBM on iris and call ``h2o.download_pojo`` on it.

  :param ip: address passed to ``h2o.init``
  :param port: port passed to ``h2o.init``
  """
  # Connect to h2o
  h2o.init(ip, port)

  iris = h2o.import_frame(path=h2o.locate("smalldata/iris/iris_wheader.csv"))
  # Single-argument print() works on both Python 2 and Python 3; the
  # original print statement is a SyntaxError on Python 3.
  print("iris:")
  iris.show()

  m = h2o.gbm(x=iris[:4], y=iris[4])
  h2o.download_pojo(m)
def download_pojo():
  """Train a GBM on iris with ``h2o.gbm`` and call ``h2o.download_pojo`` on it.

  The first four columns are the predictors and column 4 is the response.
  No path is passed to ``h2o.download_pojo``.
  """
  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))
  # Single-argument print() works on both Python 2 and Python 3; the
  # original print statement is a SyntaxError on Python 3.
  print("iris:")
  iris.show()

  m = h2o.gbm(x=iris[:4], y=iris[4])
  h2o.download_pojo(m)
Beispiel #21
0
def javapredict(algo, train, test, x, y, **kwargs):
    """Compare in-cluster predictions with predictions from the compiled POJO.

    Builds a model in H2O, downloads its POJO into a fresh
    ``../results/<model id>`` directory next to this file, scores ``test``
    both in H2O and through ``hex.genmodel.tools.PredictCsv``, and asserts
    that the two prediction tables have the same dimensions and matching
    values in their first column.

    :param algo: "gbm" or "random_forest"
    :param train: training frame
    :param test: frame to score
    :param x: predictor column selector, applied to both ``train`` and ``test``
    :param y: response column selector, applied to ``train``
    :param kwargs: extra arguments forwarded to the model builder
    :raises ValueError: if ``algo`` is not supported
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    else:
        # ``raise(ValueError, "...")`` raised a tuple: Python 2 dropped the
        # message and Python 3 fails with a TypeError instead.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.realpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.makedirs(tmpdir)
    h2o.download_pojo(model, path=tmpdir)

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    h2o.download_csv(predictions, os.path.join(tmpdir, "out_h2o.csv"))

    print("Setting up for Java POJO")
    in_csv_path = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv_path)
    # hack: the PredictCsv driver can't handle quoted strings, so remove them
    # (rewritten in place; ``with`` closes the handle even if a step fails)
    with open(in_csv_path, 'r+') as f:
        in_csv = f.read()
        f.seek(0)
        f.write(in_csv.replace('"', ''))
        f.truncate()

    genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    subprocess.call(["javac", "-cp", genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m", os.path.join(tmpdir, model._id + ".java")], stderr=subprocess.STDOUT)
    # os.pathsep is ":" on POSIX (as before) and ";" on Windows.
    classpath = genmodel_jar + os.pathsep + tmpdir
    subprocess.call(["java", "-ea", "-cp", classpath, "-Xmx4g", "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id, "--input", in_csv_path, "--output", os.path.join(tmpdir, "out_pojo.csv")], stderr=subprocess.STDOUT)

    predictions2 = h2o.import_file(os.path.join(tmpdir, "out_pojo.csv"))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Exepcted the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Exepcted the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value
    for r in range(hr):
        hp = predictions[r, 0]
        if algo == "gbm":
            # The POJO output for gbm is parsed as a hex float literal.
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            # Only "random_forest" can reach here: algo was validated above.
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
def gbm_export():
    """Train a bernoulli GBM on prostate_train.csv and export POJO and MOJO.

    "CAPSULE" is converted to a factor and used as the response; columns
    1..ncol-1 are passed as predictors. Both artifacts go under RESULT_DIR.
    """
    print("###### GBM ######")
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate_train.csv")
    prostate = h2o.import_file(path=prostate_path)
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    gbm_params = dict(
        ntrees=100,
        learn_rate=0.1,
        max_depth=5,
        min_rows=10,
        distribution="bernoulli",
    )
    gbm = H2OGradientBoostingEstimator(**gbm_params)
    predictor_idxs = list(range(1, prostate.ncol))
    gbm.train(x=predictor_idxs, y="CAPSULE", training_frame=prostate)
    h2o.download_pojo(gbm, path=RESULT_DIR)
    gbm.download_mojo(path=RESULT_DIR)
def gbm_export():
    """Train a bernoulli GBM on prostate_train.csv and export POJO and MOJO.

    "CAPSULE" is converted to a factor and used as the response; columns
    1..ncol-1 are passed as predictors. Both artifacts go under RESULT_DIR.
    """
    print("###### GBM ######")
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate_train.csv")
    prostate = h2o.import_file(path=prostate_path)
    prostate["CAPSULE"] = prostate["CAPSULE"].asfactor()
    gbm_params = dict(
        ntrees=100,
        learn_rate=0.1,
        max_depth=5,
        min_rows=10,
        distribution="bernoulli",
    )
    gbm = H2OGradientBoostingEstimator(**gbm_params)
    predictor_idxs = list(range(1, prostate.ncol))
    gbm.train(x=predictor_idxs, y="CAPSULE", training_frame=prostate)
    h2o.download_pojo(gbm, path=RESULT_DIR)
    gbm.download_mojo(path=RESULT_DIR)
Beispiel #24
0
    def download_pojo(self, path=""):
        """
        Download this model's POJO via ``h2o.download_pojo``.

        Trailing "/" characters are stripped from ``path`` before it is
        forwarded. If path is "", then dump to screen.

        :param path:  An absolute path to the directory where POJO should be saved.

        :returns: None
        """
        target_dir = path.rstrip("/")
        h2o.download_pojo(self, target_dir)
Beispiel #25
0
 def save_model(self, tmp_dir):
     """
     Save the model's POJO and the genmodel JAR into ``tmp_dir``.

     This is best-effort: a failure never propagates to the caller, but it is
     now logged (with traceback) instead of being silently discarded.

     :param tmp_dir: directory passed as ``path`` to ``h2o.download_pojo``
     :return: None
     """
     # Local import so this method does not depend on a file-level logger.
     import logging

     # Downloading the pojo and the JAR file
     try:
         h2o.download_pojo(self._model, path=tmp_dir, get_jar=True)
     except Exception:
         logging.getLogger(__name__).exception('Failed to save the model')
Beispiel #26
0
def write_model_pojo(model):
    """
    Write the model as a POJO into the relative directory "build".

    The directory is created first when it does not exist yet.

    :param model: trained model
    :return: None
    """
    # Relative path from code dir
    build_dir = "build"

    already_there = os.path.exists(build_dir)
    if not already_there:
        os.makedirs(build_dir)

    h2o.download_pojo(model, path=build_dir)
Beispiel #27
0
    def download_pojo(self, path="", get_jar=True):
        """Download the POJO of the fitted ``best_estimator_``.

        When ``best_estimator_`` is an ``H2OEstimator`` the download goes
        through ``h2o.download_pojo``; otherwise the call is delegated to
        the estimator's own ``download_pojo`` method.

        Parameters
        ----------

        path : string, optional (default="")
            Path to folder in which to save the POJO.

        get_jar : bool, optional (default=True)
            Whether to get the jar from the POJO.

        Returns
        -------

        None or string
            Returns None if ``path`` is "" else, the filepath
            where the POJO was saved.
        """
        if not isinstance(self.best_estimator_, H2OEstimator):
            # Not a raw H2O estimator: let the wrapped object handle it.
            return self.best_estimator_.download_pojo(path=path, get_jar=get_jar)
        return h2o.download_pojo(self.best_estimator_, path=path, get_jar=get_jar)
Beispiel #28
0
    def download_pojo(self, path="", get_jar=True):
        """Download the POJO of the fitted ``best_estimator_``.

        When ``best_estimator_`` is an ``H2OEstimator`` the download goes
        through ``h2o.download_pojo``; otherwise the call is delegated to
        the estimator's own ``download_pojo`` method.

        Parameters
        ----------

        path : string, optional (default="")
            Path to folder in which to save the POJO.

        get_jar : bool, optional (default=True)
            Whether to get the jar from the POJO.

        Returns
        -------

        None or string
            Returns None if ``path`` is "" else, the filepath
            where the POJO was saved.
        """
        if not isinstance(self.best_estimator_, H2OEstimator):
            # Not a raw H2O estimator: let the wrapped object handle it.
            return self.best_estimator_.download_pojo(path=path, get_jar=get_jar)
        return h2o.download_pojo(self.best_estimator_, path=path, get_jar=get_jar)
def prostate_pojo_import():
    """Round-trip a GBM through its POJO and compare it with the original.

    Trains a default GBM on prostate.csv (without the "ID" column, with
    "CAPSULE" as a factor response), downloads the POJO into the "results"
    directory, loads it back with ``h2o.import_mojo`` and checks that
    predictions and the partial dependence for "AGE" are equal for both.
    """
    prostate_path = pyunit_utils.locate("smalldata/logreg/prostate.csv")
    prostate = h2o.import_file(path=prostate_path)
    prostate = prostate.drop("ID")
    prostate['CAPSULE'] = prostate['CAPSULE'].asfactor()

    gbm = H2OGradientBoostingEstimator()
    gbm.train(y="CAPSULE", training_frame=prostate)

    results_dir = pyunit_utils.locate("results")
    pojo_file = h2o.download_pojo(gbm, path=results_dir)

    reloaded = h2o.import_mojo(pojo_file)
    print(reloaded)

    # 1. check scoring
    expected_preds = gbm.predict(prostate)
    actual_preds = reloaded.predict(prostate)
    assert_frame_equal(expected_preds.as_data_frame(), actual_preds.as_data_frame())

    # 2. check we can get PDPs
    pdp_kwargs = dict(data=prostate, cols=['AGE'], server=True, plot=False)
    expected_pdp = gbm.partial_plot(**pdp_kwargs)
    actual_pdp = reloaded.partial_plot(**pdp_kwargs)
    assert_frame_equal(expected_pdp[0].as_data_frame(), actual_pdp[0].as_data_frame())
def h2odownload_pojo():
    """
    Python API test: h2o.download_pojo(model, path=u'', get_jar=True)

    Copied from glm_download_pojo.py

    Trains a binomial GLM on the prostate data, downloads its POJO into the
    "results" directory and checks that ``h2o-genmodel.jar`` was written
    there. If locating the directory or the download itself raises, the POJO
    is printed to screen instead.
    """
    h2o_df = h2o.import_file(pyunit_utils.locate("smalldata/prostate/prostate.csv"))
    h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
    binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
    binomial_fit.train(y="CAPSULE", x=["AGE", "RACE", "PSA", "GLEASON"], training_frame=h2o_df)
    try:
        results_dir = pyunit_utils.locate("results")    # find directory path to results folder
        h2o.download_pojo(binomial_fit, path=results_dir)
        assert os.path.isfile(os.path.join(results_dir, "h2o-genmodel.jar")), "h2o.download_pojo() " \
                                                                              "command is not working."
    except AssertionError:
        # The jar check is the point of this test; a bare ``except`` used to
        # swallow it, so the check could never fail.
        raise
    except Exception:
        h2o.download_pojo(binomial_fit)     # just print pojo to screen if directory does not exist
Beispiel #31
0
def train(cfg):
    """Train a GBM on TF-IDF features and save the vectorizer and the POJO.

    Reads ``cfg.datafile``, ``cfg.verbose`` and ``cfg.models_dir``. The
    vectorizer is saved as ``vectorizer.pickle`` and the GBM as a POJO, both
    under ``cfg.models_dir`` (created if missing). H2O is started at the
    beginning and shut down at the end.

    :param cfg: configuration object exposing the attributes listed above
    """
    # Load data
    messages = load_data(cfg.datafile)
    # Prepare tf-idf to feature vectorization and also transform input data
    # (named ``features`` so the local no longer shadows this function)
    (vectorizer, features) = tf_idf(messages['message'])
    # Save Tf-Idf model
    h2o.init()
    # Column 0 holds the label, the remaining columns the TF-IDF features.
    train_table = h2o.H2OFrame(np.column_stack((messages['label'], features.toarray()))).set_names(['label'] + vectorizer.get_feature_names())
    gbm_model = H2OGradientBoostingEstimator(ntrees=10, learn_rate=0.01, max_depth=6, min_rows=10, distribution="bernoulli")
    # list(...) so ``x`` is a concrete list on Python 3 as well.
    gbm_model.train(x=list(range(1, train_table.shape[1])), y=0, training_frame=train_table)
    if cfg.verbose:
        # One formatted argument keeps the output identical on Python 2 and 3.
        print("GBM Model {0}".format(gbm_model))
    # Save models
    if not os.path.exists(cfg.models_dir):
        os.makedirs(cfg.models_dir)
    saveModel(vectorizer, '{}/vectorizer.pickle'.format(cfg.models_dir))
    h2o.download_pojo(gbm_model, "{}/".format(cfg.models_dir))
    h2o.shutdown()
def deeplearning_export():
    """Train a deep learning model on cars_20mpg.csv and export POJO and MOJO.

    One of three responses is picked at random: "economy" (left as is),
    "economy_20mpg" or "cylinders" (both converted to a factor). The number
    of folds is also drawn at random between 3 and 10.
    """
    print("###### DEEPLEARNING ######")
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    problem = random.sample(list(range(3)), 1)[0]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    # problem id -> factor response; any other id keeps "economy" untouched
    factor_responses = {1: "economy_20mpg", 2: "cylinders"}
    response_col = factor_responses.get(problem, "economy")
    if problem in factor_responses:
        cars[response_col] = cars[response_col].asfactor()
    print("Response column: {0}".format(response_col))
    fold_count = random.randint(3, 10)
    dl = H2ODeepLearningEstimator(nfolds=fold_count, fold_assignment="Modulo", hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)
    h2o.download_pojo(dl, path=RESULT_DIR)
    dl.download_mojo(path=RESULT_DIR)
def deeplearning_export():
    """Train a deep learning model on cars_20mpg.csv and export POJO and MOJO.

    One of three responses is picked at random: "economy" (left as is),
    "economy_20mpg" or "cylinders" (both converted to a factor). The number
    of folds is also drawn at random between 3 and 10.
    """
    print("###### DEEPLEARNING ######")
    cars = h2o.import_file(path=pyunit_utils.locate("smalldata/junit/cars_20mpg.csv"))
    problem = random.sample(list(range(3)), 1)[0]
    predictors = ["displacement", "power", "weight", "acceleration", "year"]
    # problem id -> factor response; any other id keeps "economy" untouched
    factor_responses = {1: "economy_20mpg", 2: "cylinders"}
    response_col = factor_responses.get(problem, "economy")
    if problem in factor_responses:
        cars[response_col] = cars[response_col].asfactor()
    print("Response column: {0}".format(response_col))
    fold_count = random.randint(3, 10)
    dl = H2ODeepLearningEstimator(nfolds=fold_count, fold_assignment="Modulo", hidden=[20, 20], epochs=10)
    dl.train(x=predictors, y=response_col, training_frame=cars)
    h2o.download_pojo(dl, path=RESULT_DIR)
    dl.download_mojo(path=RESULT_DIR)
Beispiel #34
0
def download_pojo():
    """Check ``h2o.download_pojo`` with and without a trailing slash in ``path``.

    For each spelling of the export directory a fresh one-tree GBM is trained
    on iris, its POJO is downloaded, and the test asserts that the directory
    exists and contains ``<model_id>.java``.
    """
    iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

    # First without, then with a slash at the end of the export directory.
    for dir_suffix in ("/downloadable_pojo", "/downloadable_pojo/"):
        gbm = H2OGradientBoostingEstimator(ntrees=1)
        gbm.train(x=list(range(4)), y=4, training_frame=iris)

        export_dir = pyunit_utils.locate("results") + dir_suffix
        h2o.download_pojo(model=gbm, path=export_dir)
        assert os.path.isdir(export_dir)
        java_source = os.path.join(export_dir, gbm.model_id + '.java')
        assert os.path.exists(java_source)
Beispiel #35
0
    def download_pojo(self, path="", get_genmodel_jar=False, genmodel_name=""):
        """
        Download the POJO of the AutoML leader model (``self.leader``).

        The call is forwarded to ``h2o.download_pojo``. If path is an empty
        string, then dump the output to screen.

        :param path:  An absolute path to the directory where POJO should be saved.
        :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
        :param genmodel_name: Custom name of genmodel jar
        :returns: name of the POJO file written.
        """
        leader_model = self.leader
        return h2o.download_pojo(
            leader_model,
            path,
            get_jar=get_genmodel_jar,
            jar_name=genmodel_name,
        )
Beispiel #36
0
    def download_pojo(self, path="", get_genmodel_jar=False, genmodel_name=""):
        """
        Download the POJO of the AutoML leader model (``self.leader``).

        The call is forwarded to ``h2o.download_pojo``. If path is an empty
        string, then dump the output to screen.

        :param path:  An absolute path to the directory where POJO should be saved.
        :param get_genmodel_jar: if True, then also download h2o-genmodel.jar and store it in folder ``path``.
        :param genmodel_name: Custom name of genmodel jar
        :returns: name of the POJO file written.
        """
        leader_model = self.leader
        return h2o.download_pojo(
            leader_model,
            path,
            get_jar=get_genmodel_jar,
            jar_name=genmodel_name,
        )
Beispiel #37
0
    def download_pojo(self, path="", get_genmodel_jar=False):
        """
        Download this model's POJO via ``h2o.download_pojo``.

        Both arguments are type-checked first, and trailing "/" characters
        are stripped from ``path``. If path is "", then dump to screen.

        :param path:  An absolute path to the directory where POJO should be saved.
        :param get_genmodel_jar: forwarded to ``h2o.download_pojo`` as ``get_jar``.

        :returns: name of the POJO file written.
        """
        assert_is_type(path, str)
        assert_is_type(get_genmodel_jar, bool)
        target_dir = path.rstrip("/")
        return h2o.download_pojo(self, target_dir, get_jar=get_genmodel_jar)
Beispiel #38
0
def test_zipped_rf_model():
    """
    Test the correctness of the "zipped" model format.

    For DRF and GBM, and for each of the binomial / multinomial / regression
    problems, this test creates a random dataset, splits it into
    training/testing parts, trains a model on it, downloads the model's data
    and POJO, scores the model remotely and fetches the predictions, scores
    the model locally by running the genmodel jar (zipped model and, unless
    it is too large, the compiled POJO), and finally compares the prediction
    results. A timing report is printed at the end.
    """
    genmodel_jar = os.path.abspath("../../../h2o-genmodel/build/libs/h2o-genmodel-all.jar")
    assert os.path.exists(genmodel_jar), "Cannot find " + genmodel_jar

    if sys.platform == "win32":
        target_dir = tempfile.mkdtemp()
    else:
        target_dir = os.path.expanduser("~/Downloads/")

    report = []
    for estimator in [H2ORandomForestEstimator, H2OGradientBoostingEstimator]:
        print(colorama.Fore.LIGHTYELLOW_EX + "\n#================================================")
        print("#  Estimator: " + estimator.__name__)
        print("#================================================\n" + colorama.Fore.RESET)
        estimator_name = "GBM" if estimator == H2OGradientBoostingEstimator else "DRF"
        for problem in ["binomial", "multinomial", "regression"]:
            print("========================")
            print("%s problem" % problem.capitalize())
            print("========================")
            df = random_dataset(problem, verbose=False)
            print("Created dataset with %d rows x %d columns" % (df.nrow, df.ncol))
            test = df[:NTESTROWS, :]
            train = df[NTESTROWS:, :]
            # Same rows twice, used for the second (double-size) scoring run.
            test2 = test.rbind(test)

            time0 = time.time()
            # Name the estimator actually being trained (was hard-coded to
            # "Random Forest" even for GBM).
            print("\n\nTraining %s model..." % estimator_name)
            model = estimator(ntrees=NTREES, max_depth=DEPTH)
            model.train(training_frame=train)
            print(model.summary())
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nSaving the model...")
            time0 = time.time()
            model_file = h2o.api("GET /3/Models/%s/data" % model.model_id, save_to=target_dir)
            print("    => %s  (%d bytes)" % (model_file, os.stat(model_file).st_size))
            assert os.path.exists(model_file)
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading POJO...")
            time0 = time.time()
            pojo_file = h2o.download_pojo(model, target_dir, get_jar=False)
            pojo_size = os.stat(pojo_file).st_size
            pojo_name = os.path.splitext(os.path.basename(pojo_file))[0]
            print("    => %s  (%d bytes)" % (pojo_file, pojo_size))
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nDownloading the test datasets for local use: ", end="")
            time0 = time.time()
            test_file = os.path.join(target_dir, "test_%s.csv" % test.frame_id)
            test2_file = os.path.join(target_dir, "test2_%s.csv" % test2.frame_id)
            print(test_file)
            h2o.download_csv(test, test_file)
            h2o.download_csv(test2, test2_file)
            print("Time taken = %.3fs" % (time.time() - time0))

            print("\nScoring the model remotely and downloading to file ", end="")
            times = [time.time()]
            h2o_pred_file = os.path.join(target_dir, "predR_%s.csv" % test.frame_id)
            h2o_pred_file2 = os.path.join(target_dir, "predR_%s.csv" % test2.frame_id)
            print(h2o_pred_file)
            for testframe, outfile in [(test, h2o_pred_file), (test2, h2o_pred_file2)]:
                predictions = model.predict(testframe)
                h2o.download_csv(predictions, outfile)
                times.append(time.time())
            # NOTE(review): the first number is (2nd run - 1st run), not the
            # total elapsed time -- confirm this is what "Time taken" intends.
            print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Server", times[1] - times[0], times[2] - times[1]))

            print("\nScoring the model locally and saving to file ", end="")
            times = [time.time()]
            local_pred_file = os.path.join(target_dir, "predL_%s.csv" % test.frame_id)
            local_pred_file2 = os.path.join(target_dir, "predL_%s.csv" % test2.frame_id)
            print(local_pred_file)
            for inpfile, outfile in [(test_file, local_pred_file), (test2_file, local_pred_file2)]:
                load_csv(inpfile)
                ret = subprocess.call(["java", "-cp", genmodel_jar,
                                       "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m",
                                       "hex.genmodel.tools.PredictCsv",
                                       "--input", inpfile, "--output", outfile, "--model", model_file, "--decimal"])
                assert ret == 0, "GenModel finished with return code %d" % ret
                times.append(time.time())
            print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                  (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
            report.append((estimator_name, problem, "Zipped", times[1] - times[0], times[2] - times[1]))

            # Reset per problem: when the POJO is too large to compile, the
            # comparison below must fall back to the local predictions rather
            # than hit an unbound name or reuse a file from a previous
            # iteration.
            pojo_pred_file = None
            if pojo_size <= 1000 << 20:  # 1000 Mb
                time0 = time.time()
                print("\nCompiling Java Pojo")
                javac_cmd = ["javac", "-cp", genmodel_jar, "-J-Xmx12g", pojo_file]
                subprocess.check_call(javac_cmd)
                print("Time taken = %.3fs" % (time.time() - time0))

                pojo_pred_file = os.path.join(target_dir, "predP_%s.csv" % test.frame_id)
                pojo_pred_file2 = os.path.join(target_dir, "predP_%s.csv" % test2.frame_id)
                print("Scoring POJO and saving to file %s" % pojo_pred_file)
                times = [time.time()]
                cp_sep = ";" if sys.platform == "win32" else ":"
                for inpfile, outfile in [(test_file, pojo_pred_file), (test2_file, pojo_pred_file2)]:
                    load_csv(inpfile)
                    java_cmd = ["java", "-cp", cp_sep.join([genmodel_jar, target_dir]),
                                "-ea", "-Xmx12g", "-XX:ReservedCodeCacheSize=256m", "-XX:MaxPermSize=256m",
                                "hex.genmodel.tools.PredictCsv",
                                "--pojo", pojo_name, "--input", inpfile, "--output", outfile, "--decimal"]
                    ret = subprocess.call(java_cmd)
                    assert ret == 0, "GenModel finished with return code %d" % ret
                    times.append(time.time())
                print("Time taken = %.3fs   (1st run: %.3f, 2nd run: %.3f)" %
                      (times[2] + times[0] - 2 * times[1], times[1] - times[0], times[2] - times[1]))
                report.append((estimator_name, problem, "POJO", times[1] - times[0], times[2] - times[1]))


            print("\nChecking whether the predictions coincide...")
            time0 = time.time()
            local_pred = load_csv(local_pred_file)
            server_pred = load_csv(h2o_pred_file)
            pojo_pred = load_csv(pojo_pred_file) if pojo_pred_file else local_pred
            assert len(local_pred) == len(server_pred) == len(pojo_pred) == test.nrow, \
                "Number of rows in prediction files do not match: %d vs %d vs %d vs %d" % \
                (len(local_pred), len(server_pred), len(pojo_pred), test.nrow)
            for i in range(test.nrow):
                lpred = local_pred[i]
                rpred = server_pred[i]
                ppred = pojo_pred[i]
                assert type(lpred) == type(rpred) == type(ppred), \
                    "Types of predictions do not match: %r / %r / %r" % (lpred, rpred, ppred)
                if isinstance(lpred, float):
                    same = abs(lpred - rpred) + abs(lpred - ppred) < 1e-8
                else:
                    same = lpred == rpred == ppred
                assert same, \
                    "Predictions are different for row %d: local=%r, pojo=%r, bomo=%r" % (i + 1, lpred, ppred, rpred)
            print("Time taken = %.3fs" % (time.time() - time0))
            print(colorama.Fore.LIGHTGREEN_EX + "\nPredictions match!\n" + colorama.Fore.RESET)

    print(colorama.Fore.LIGHTYELLOW_EX + "\n\n#================================================")
    print("#  Timing report")
    print("#================================================\n" + colorama.Fore.RESET)
    print(tabulate.tabulate(report,
          headers=["Model", "Problem type", "Scorer", "10000 rows", "20000 rows"],
          floatfmt=".3f"), end="\n\n\n")
Beispiel #39
0
# Predictors: every column of p_filter except the identifiers and targets.
training_columns = list(p_filter.columns)
training_columns.remove('UnitNumber')
training_columns.remove('Time')
training_columns.remove('RUL')
training_columns.remove('BIN')

# 'BIN' is the response; converted to a factor in both frames.
h_filter['BIN'] = h_filter['BIN'].asfactor()
h_test['BIN'] = h_test['BIN'].asfactor()

h2o.export_file(frame=h_test, path='test_sid.csv', force=True)
h2o.export_file(frame=h_filter, path='train_sid.csv', force=True)

# ``ntrees`` was misspelled as ``ntress``, so the intended tree count was
# never applied under the real parameter name.
model = H2ORandomForestEstimator(nbins=250,
                                 ntrees=100,
                                 max_depth=50,
                                 nfolds=10)
model.train(x=training_columns, y='BIN', training_frame=h_filter)

predict = model.predict(test_data=h_test)
predict = DataFrameParser.h2oToList(predict['predict'])
actual = DataFrameParser.h2oToList(h_test['BIN'])

Measures.confusion_matrix(actual, predict)
print(predict)
print(actual)

# Save the POJO together with the genmodel jar.
h2o.download_pojo(model=model,
                  path="/home/wso2123/PycharmProjects/FeatureProcessor/",
                  get_jar=True)
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Train an H2O model, export its POJO, and check the POJO's predictions.

    The model is trained in H2O, then its Java source and ``h2o-genmodel.jar``
    are downloaded into ``../results/<model id>`` relative to this file. The
    source is compiled with ``javac`` and ``hex.genmodel.tools.PredictCsv`` is
    run on the test predictors. The first prediction column of the H2O output
    and of the POJO output are then compared row by row.

    :param algo: one of "gbm", "random_forest", "deeplearning" or "glm".
    :param equality: "numeric" (values must agree within 1e-4) or "class"
        (values must be equal).
    :param train: frame used for training; indexed with ``x`` and ``y``.
    :param test: frame used for scoring; indexed with ``x``.
    :param x: predictor column selector.
    :param y: response column selector.
    :param kwargs: extra arguments forwarded to the H2O model builder.
    :raises ValueError: if ``algo`` is not supported, or if ``equality`` is not
        supported (detected while comparing the first row).
    :raises AssertionError: if an expected file is missing, the frame
        dimensions differ, or a prediction differs.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        # Raise an exception instance: ``raise (cls, msg)`` raises a tuple,
        # which loses the message on Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The file is rewritten in place; ``with`` closes it even if I/O fails.
    with open(in_csv, 'r+') as f:
        contents = f.read().replace('"', '')
        f.seek(0)
        f.write(contents)
        f.truncate()
    assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g", "-XX:MaxPermSize=256m",
                "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
                "--input", in_csv, "--output", out_pojo_csv]
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    # stderr is merged into stdout, so the second element is always None.
    o, _ = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

    # Value: only the first column of each row is compared.
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
        else:
            raise ValueError("equality type {0} is not supported".format(equality))
Beispiel #41
0
def javapredict(algo, equality, train, test, x, y, **kwargs):
    """Train an H2O model, export its POJO, and check the POJO's predictions.

    Steps: train the model in H2O; download its Java source and
    ``h2o-genmodel.jar`` into ``../results/<model id>`` relative to this file;
    compile the source with ``javac``; run ``hex.genmodel.tools.PredictCsv``
    on the test predictors; compare the first prediction column of the H2O
    and POJO outputs row by row.

    :param algo: one of "gbm", "random_forest", "deeplearning" or "glm".
    :param equality: "numeric" (values must agree within 1e-4) or "class"
        (values must be equal).
    :param train: frame used for training; indexed with ``x`` and ``y``.
    :param test: frame used for scoring; indexed with ``x``.
    :param x: predictor column selector.
    :param y: response column selector.
    :param kwargs: extra arguments forwarded to the H2O model builder.
    :raises ValueError: if ``algo`` is not supported, or if ``equality`` is not
        supported (detected while comparing the first row).
    :raises AssertionError: if an expected file is missing, the frame
        dimensions differ, or a prediction differs.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = h2o.gbm(x=train[x], y=train[y], **kwargs)
    elif algo == "random_forest":
        model = h2o.random_forest(x=train[x], y=train[y], **kwargs)
    elif algo == "deeplearning":
        model = h2o.deeplearning(x=train[x], y=train[y], **kwargs)
    elif algo == "glm":
        model = h2o.glm(x=train[x], y=train[y], **kwargs)
    else:
        # Raise an exception instance: ``raise (cls, msg)`` raises a tuple,
        # which loses the message on Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    print(model)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", model._id))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(
        h2o_genmodel_jar
    ), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, model._id + ".java")
    assert os.path.exists(
        java_file), "Expected file {0} to exist, but it does not.".format(
            java_file)
    print("java code saved in {0}".format(java_file))

    print("Predicting in H2O")
    predictions = model.predict(test)
    predictions.summary()
    predictions.head()
    out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
    h2o.download_csv(predictions, out_h2o_csv)
    assert os.path.exists(
        out_h2o_csv), "Expected file {0} to exist, but it does not.".format(
            out_h2o_csv)
    print("H2O Predictions saved in {0}".format(out_h2o_csv))

    print("Setting up for Java POJO")
    in_csv = os.path.join(tmpdir, "in.csv")
    h2o.download_csv(test[x], in_csv)

    # hack: the PredictCsv driver can't handle quoted strings, so remove them.
    # The file is rewritten in place; ``with`` closes it even if I/O fails.
    with open(in_csv, 'r+') as f:
        contents = f.read().replace('"', '')
        f.seek(0)
        f.write(contents)
        f.truncate()
    assert os.path.exists(
        in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
    print("Input CSV to PredictCsv saved in {0}".format(in_csv))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx4g", "-J-XX:MaxPermSize=256m",
        java_file
    ]
    subprocess.check_call(javac_cmd)

    print("Running PredictCsv Java Program")
    out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
    cp_sep = ";" if sys.platform == "win32" else ":"
    java_cmd = [
        "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx4g",
        "-XX:MaxPermSize=256m", "-XX:ReservedCodeCacheSize=256m",
        "hex.genmodel.tools.PredictCsv", "--header", "--model", model._id,
        "--input", in_csv, "--output", out_pojo_csv
    ]
    p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
    # stderr is merged into stdout, so the second element is always None.
    o, _ = p.communicate()
    print("Java output: {0}".format(o))
    assert os.path.exists(
        out_pojo_csv), "Expected file {0} to exist, but it does not.".format(
            out_pojo_csv)
    predictions2 = h2o.import_file(path=out_pojo_csv)
    print("Pojo predictions saved in {0}".format(out_pojo_csv))

    print("Comparing predictions between H2O and Java POJO")
    # Dimensions
    hr, hc = predictions.dim
    pr, pc = predictions2.dim
    assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(
        hr, pr)
    assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(
        hc, pc)

    # Value: only the first column of each row is compared.
    for r in range(hr):
        hp = predictions[r, 0]
        if equality == "numeric":
            pp = float.fromhex(predictions2[r, 0])
            assert abs(
                hp - pp
            ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        elif equality == "class":
            pp = predictions2[r, 0]
            assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                r, hp, pp)
        else:
            raise ValueError(
                "equality type {0} is not supported".format(equality))
Beispiel #42
0
# Interactive (IPython-style) session fragment: the trailing `?` lines request
# help on an object and are not valid in a plain Python script.
# NOTE(review): `_` presumably refers to the previous output of the
# interactive session — confirm before reusing this outside a notebook.
type(_)



h2o.import_file?
h2o.parse_setup?

# K-means estimator configured with k=2.
cluster_estimator = H2OKMeansEstimator(k=2)

cluster_estimator.train?
# NOTE(review): a response column `y` is passed to a K-means estimator here;
# confirm whether it has any effect for this estimator.
cluster_estimator.train(x=['size','numberofrecords'], 
                        y='FilteredFilename',
                        training_frame=h2o_df,
                        verbose=False)

# Export the trained estimator as a POJO (no path argument is given).
h2o.download_pojo(cluster_estimator)

# Bare expression: displays the estimator in an interactive session.
cluster_estimator


# Disabled alternative: binomial GLM on CAPSULE, exported as a POJO.
#h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
#model = h2o.glm(y = "CAPSULE",
#                x = ["AGE", "RACE", "PSA", "GLEASON"],
#                training_frame = h2o_df,
#                family = "binomial")
#h2o.download_pojo(model)

#==============================================================================
# binary logistic
#==============================================================================
Beispiel #43
0
def exportPojo():
    """Export ``iot.dl`` as a POJO and save the threshold next to it.

    Downloads the POJO for ``iot.dl`` into ``../predictions/src/main/java/``
    and writes a single ``threshold=<value>`` line to
    ``../predictions/src/main/resources/dl.properties``. Both ``iot`` and
    ``threshold`` are read from the enclosing scope.
    """
    h2o.download_pojo(iot.dl, path="../predictions/src/main/java/")
    # Write the threshold to a properties file. print()'s ``file`` argument
    # must be a writable file object, not a path, so open the file explicitly.
    with open('../predictions/src/main/resources/dl.properties', 'w') as properties_file:
        properties_file.write("threshold=" + str(threshold) + "\n")
Beispiel #44
0
import h2o

# Initialise H2O before any model operation.
h2o.init()

# Load the saved model from disk, then export it as a POJO into the current
# directory, requesting the genmodel jar as well (get_jar=True).
saved_model_path = "GBM_ForLoanPredict.zip"
model = h2o.load_model(saved_model_path)
h2o.download_pojo(model=model, path='./', get_jar=True)
Beispiel #45
0
def javapredict(algo, equality, train, test, x, y, compile_only=False, **kwargs):
    """Train an H2O model, export and compile its POJO, and optionally verify it.

    The model is trained in H2O, then its Java source and ``h2o-genmodel.jar``
    are downloaded into ``../results/<pojo name>`` relative to this file and
    the source is compiled with ``javac``. Unless ``compile_only`` is true,
    ``hex.genmodel.tools.PredictCsv`` is then run on the test predictors and
    the first prediction column of the H2O and POJO outputs is compared row
    by row.

    :param algo: one of "gbm", "random_forest", "deeplearning", "glm",
        "naive_bayes", "kmeans" or "pca". "kmeans" and "pca" are trained
        without a response column.
    :param equality: "numeric" (values must agree within 1e-4) or "class"
        (values must be equal).
    :param train: frame used for training.
    :param test: frame used for scoring; indexed with ``x``.
    :param x: predictor columns.
    :param y: response column (ignored for "kmeans" and "pca").
    :param compile_only: stop after compiling the POJO, skipping prediction
        and comparison.
    :param kwargs: extra arguments forwarded to the estimator constructor.
    :raises ValueError: if ``algo`` is not supported, or if ``equality`` is not
        supported (detected while comparing the first row).
    :raises AssertionError: if an expected file is missing, the frame
        dimensions differ, or a prediction differs.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest":
        model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning":
        model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm":
        model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes":
        model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans":
        model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca":
        model = H2OPCA(**kwargs)
    else:
        # Name the offending value so the failure is diagnosable.
        raise ValueError("algo {0} is not supported".format(algo))
    if algo in ("kmeans", "pca"):
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(h2o_genmodel_jar), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(java_file), "Expected file {0} to exist, but it does not.".format(java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = ["javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g", "-J-XX:MaxPermSize=256m", java_file]
    subprocess.check_call(javac_cmd)

    if not compile_only:
        print("Predicting in H2O")
        predictions = model.predict(test)
        predictions.summary()
        predictions.head()
        out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
        h2o.download_csv(predictions, out_h2o_csv)
        assert os.path.exists(out_h2o_csv), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
        print("H2O Predictions saved in {0}".format(out_h2o_csv))

        print("Setting up for Java POJO")
        in_csv = os.path.join(tmpdir, "in.csv")
        h2o.download_csv(test[x], in_csv)

        # hack: the PredictCsv driver can't handle quoted strings, so remove them.
        # The file is rewritten in place; ``with`` closes it even if I/O fails.
        with open(in_csv, 'r+') as f:
            contents = f.read().replace('"', '')
            f.seek(0)
            f.write(contents)
            f.truncate()
        assert os.path.exists(in_csv), "Expected file {0} to exist, but it does not.".format(in_csv)
        print("Input CSV to PredictCsv saved in {0}".format(in_csv))

        print("Running PredictCsv Java Program")
        out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
        cp_sep = ";" if sys.platform == "win32" else ":"
        java_cmd = ["java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir, "-Xmx12g", "-XX:MaxPermSize=2g",
                    "-XX:ReservedCodeCacheSize=256m", "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
                    "--input", in_csv, "--output", out_pojo_csv]
        p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
        # stderr is merged into stdout, so the second element is always None.
        o, _ = p.communicate()
        print("Java output: {0}".format(o))
        assert os.path.exists(out_pojo_csv), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
        predictions2 = h2o.upload_file(path=out_pojo_csv)
        print("Pojo predictions saved in {0}".format(out_pojo_csv))

        print("Comparing predictions between H2O and Java POJO")
        # Dimensions
        hr, hc = predictions.dim
        pr, pc = predictions2.dim
        assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(hr, pr)
        assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(hc, pc)

        # Value: only the first column of each row is compared.
        for r in range(hr):
            hp = predictions[r, 0]
            if equality == "numeric":
                pp = float.fromhex(predictions2[r, 0])
                assert abs(hp - pp) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(r, hp, pp)
            elif equality == "class":
                pp = predictions2[r, 0]
                assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(r, hp, pp)
            else:
                raise ValueError("equality type {0} is not supported".format(equality))
Beispiel #46
0
def javapredict(algo,
                equality,
                train,
                test,
                x,
                y,
                compile_only=False,
                **kwargs):
    """Train an H2O model, export and compile its POJO, and optionally verify it.

    The model is trained in H2O, then its Java source and ``h2o-genmodel.jar``
    are downloaded into ``../results/<pojo name>`` relative to this file and
    the source is compiled with ``javac``. Unless ``compile_only`` is true,
    ``hex.genmodel.tools.PredictCsv`` is then run on the test predictors and
    the first prediction column of the H2O and POJO outputs is compared row
    by row.

    :param algo: one of "gbm", "random_forest", "deeplearning", "glm",
        "naive_bayes", "kmeans" or "pca". "kmeans" and "pca" are trained
        without a response column.
    :param equality: "numeric" (values must agree within 1e-4) or "class"
        (values must be equal).
    :param train: frame used for training.
    :param test: frame used for scoring; indexed with ``x``.
    :param x: predictor columns.
    :param y: response column (ignored for "kmeans" and "pca").
    :param compile_only: stop after compiling the POJO, skipping prediction
        and comparison.
    :param kwargs: extra arguments forwarded to the estimator constructor.
    :raises ValueError: if ``algo`` is not supported, or if ``equality`` is not
        supported (detected while comparing the first row).
    :raises AssertionError: if an expected file is missing, the frame
        dimensions differ, or a prediction differs.
    """
    print("Creating model in H2O")
    if algo == "gbm":
        model = H2OGradientBoostingEstimator(**kwargs)
    elif algo == "random_forest":
        model = H2ORandomForestEstimator(**kwargs)
    elif algo == "deeplearning":
        model = H2ODeepLearningEstimator(**kwargs)
    elif algo == "glm":
        model = H2OGeneralizedLinearEstimator(**kwargs)
    elif algo == "naive_bayes":
        model = H2ONaiveBayesEstimator(**kwargs)
    elif algo == "kmeans":
        model = H2OKMeansEstimator(**kwargs)
    elif algo == "pca":
        model = H2OPCA(**kwargs)
    else:
        # Raise an exception instance: ``raise (cls, msg)`` raises a tuple,
        # which loses the message on Python 2 and is a TypeError on Python 3.
        raise ValueError("algo {0} is not supported".format(algo))
    if algo in ("kmeans", "pca"):
        model.train(x=x, training_frame=train)
    else:
        model.train(x=x, y=y, training_frame=train)
    print(model)

    # HACK: munge model._id so that it conforms to Java class name. For example, change K-means to K_means.
    # TODO: clients should extract Java class name from header.
    regex = re.compile("[+\\-* !@#$%^&()={}\\[\\]|;:'\"<>,.?/]")
    pojoname = regex.sub("_", model._id)

    print("Downloading Java prediction model code from H2O")
    tmpdir = os.path.normpath(
        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                     "results", pojoname))
    os.mkdir(tmpdir)
    h2o.download_pojo(model, path=tmpdir)
    h2o_genmodel_jar = os.path.join(tmpdir, "h2o-genmodel.jar")
    assert os.path.exists(
        h2o_genmodel_jar
    ), "Expected file {0} to exist, but it does not.".format(h2o_genmodel_jar)
    print("h2o-genmodel.jar saved in {0}".format(h2o_genmodel_jar))
    java_file = os.path.join(tmpdir, pojoname + ".java")
    assert os.path.exists(
        java_file), "Expected file {0} to exist, but it does not.".format(
            java_file)
    print("java code saved in {0}".format(java_file))

    print("Compiling Java Pojo")
    javac_cmd = [
        "javac", "-cp", h2o_genmodel_jar, "-J-Xmx12g",
        "-J-XX:MaxPermSize=256m", java_file
    ]
    subprocess.check_call(javac_cmd)

    if not compile_only:
        print("Predicting in H2O")
        predictions = model.predict(test)
        predictions.summary()
        predictions.head()
        out_h2o_csv = os.path.join(tmpdir, "out_h2o.csv")
        h2o.download_csv(predictions, out_h2o_csv)
        assert os.path.exists(
            out_h2o_csv
        ), "Expected file {0} to exist, but it does not.".format(out_h2o_csv)
        print("H2O Predictions saved in {0}".format(out_h2o_csv))

        print("Setting up for Java POJO")
        in_csv = os.path.join(tmpdir, "in.csv")
        h2o.download_csv(test[x], in_csv)

        # hack: the PredictCsv driver can't handle quoted strings, so remove them.
        # The file is rewritten in place; ``with`` closes it even if I/O fails.
        with open(in_csv, 'r+') as f:
            contents = f.read().replace('"', '')
            f.seek(0)
            f.write(contents)
            f.truncate()
        assert os.path.exists(
            in_csv), "Expected file {0} to exist, but it does not.".format(
                in_csv)
        print("Input CSV to PredictCsv saved in {0}".format(in_csv))

        print("Running PredictCsv Java Program")
        out_pojo_csv = os.path.join(tmpdir, "out_pojo.csv")
        cp_sep = ";" if sys.platform == "win32" else ":"
        java_cmd = [
            "java", "-ea", "-cp", h2o_genmodel_jar + cp_sep + tmpdir,
            "-Xmx12g", "-XX:MaxPermSize=2g", "-XX:ReservedCodeCacheSize=256m",
            "hex.genmodel.tools.PredictCsv", "--header", "--model", pojoname,
            "--input", in_csv, "--output", out_pojo_csv
        ]
        p = subprocess.Popen(java_cmd, stdout=PIPE, stderr=STDOUT)
        # stderr is merged into stdout, so the second element is always None.
        o, _ = p.communicate()
        print("Java output: {0}".format(o))
        assert os.path.exists(
            out_pojo_csv
        ), "Expected file {0} to exist, but it does not.".format(out_pojo_csv)
        predictions2 = h2o.upload_file(path=out_pojo_csv)
        print("Pojo predictions saved in {0}".format(out_pojo_csv))

        print("Comparing predictions between H2O and Java POJO")
        # Dimensions
        hr, hc = predictions.dim
        pr, pc = predictions2.dim
        assert hr == pr, "Expected the same number of rows, but got {0} and {1}".format(
            hr, pr)
        assert hc == pc, "Expected the same number of cols, but got {0} and {1}".format(
            hc, pc)

        # Value: only the first column of each row is compared.
        for r in range(hr):
            hp = predictions[r, 0]
            if equality == "numeric":
                pp = float.fromhex(predictions2[r, 0])
                assert abs(
                    hp - pp
                ) < 1e-4, "Expected predictions to be the same (within 1e-4) for row {0}, but got {1} and {2}".format(
                    r, hp, pp)
            elif equality == "class":
                pp = predictions2[r, 0]
                assert hp == pp, "Expected predictions to be the same for row {0}, but got {1} and {2}".format(
                    r, hp, pp)
            else:
                raise ValueError(
                    "equality type {0} is not supported".format(equality))
Beispiel #47
0
import h2o

# Initialise H2O, load the Titanic data and train a GBM with a fixed model id,
# then export that model as a POJO into the 'tmp' directory.
h2o.init()

df = h2o.import_file('data/titanic.csv')

# Both the passenger class and the response are converted to factors.
for factor_column in ('pclass', 'survived'):
    df[factor_column] = df[factor_column].asfactor()

predictors = ['pclass', 'sex', 'age', 'fare']
model = h2o.gbm(
    y='survived',
    x=predictors,
    training_frame=df,
    model_id='MyModel',
)

h2o.download_pojo(model, path='tmp')
Beispiel #48
0
# NOTE(review): this section looks like several documentation snippets pasted
# together; `binomial_fit`, `test` and `h2o_df` are used before the lines that
# define them below — confirm the intended order before running it as one script.

# Calculate performance metrics for the fitted model on `test`
binomial_fit.model_performance(test)

# Clear the response column in order to score data without labels.
# (Original note: "use threshold for max than f1" — presumably the predicted
# label uses the max-F1 threshold; TODO confirm.)
# NOTE(review): `newdata = test` binds a second name to the same object, so
# the assignment below is applied through `test` as well — confirm intended.
newdata = test
newdata['CAPSULE'] = None
newpred = binomial_fit.predict(newdata)
newpred

# Manually set the prediction threshold to 0.3 on the p1 column
# NOTE(review): `pd` is not used anywhere in the visible snippet.
import pandas as pd
pred = binomial_fit.predict(h2o_df)
pred['predict'] = pred['p1'] > 0.3

# POJO export: train a binomial GLM on the prostate data and download its POJO
h2o_df = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv")
h2o_df['CAPSULE'] = h2o_df['CAPSULE'].asfactor()
binomial_fit = H2OGeneralizedLinearEstimator(family ="binomial")
binomial_fit.train(y = "CAPSULE", x = ["AGE", "RACE","PSA", "GLEASON"], training_frame = h2o_df)
h2o.download_pojo(binomial_fit)

# Verifying model results: train a binomial GLM with nfolds = 5 and print the
# AUC of the full model and of each cross-validation model
h2o_df = h2o.import_file("http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip")
model = H2OGeneralizedLinearEstimator(family = "binomial", nfolds = 5)
model.train(y = "IsDepDelayed", x = ["Year", "Origin"], training_frame=h2o_df)
print("full model training auc:", model.auc())
print("full model validation auc:", model.auc(xval=True))
for model_ in model.get_xval_models():
    print(model_.model_id, "training auc:", model_.auc(), "validation auc:", model_.auc(valid=True))
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Train a binomial GLM on the prostate dataset and export it as a POJO.
h2o.init()

prostate_url = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/prostate/prostate.csv"
h2o_df = h2o.import_file(prostate_url)

# The response is converted to a factor before fitting the binomial model.
h2o_df["CAPSULE"] = h2o_df["CAPSULE"].asfactor()

predictor_columns = ["AGE", "RACE", "PSA", "GLEASON"]
binomial_fit = H2OGeneralizedLinearEstimator(family="binomial")
binomial_fit.train(
    x=predictor_columns,
    y="CAPSULE",
    training_frame=h2o_df,
)
h2o.download_pojo(binomial_fit)