def insuranceFrame(hc, spark):
    """Load the insurance dataset, add a log(Holders) "Offset" column, and return it as an H2OFrame.

    The "Group" and "Age" columns are converted to factors so H2O treats them as categoricals.
    """
    csvPath = "file://" + unit_test_utils.locate("smalldata/insurance.csv")
    insurance = spark.read.csv(csvPath, header=True, inferSchema=True)
    insurance = insurance.withColumn("Offset", log(col("Holders")))
    frame = hc.asH2OFrame(insurance)
    # Mark the categorical predictors as factors for H2O algorithms.
    for factorColumn in ["Group", "Age"]:
        frame[factorColumn] = frame[factorColumn].asfactor()
    return frame
def testLoadAndTrainMojo(hc, spark):
    """Train a small GBM, export it as a MOJO, reload it, and verify its predictions
    are identical to those of a pre-built reference MOJO on the prostate dataset.
    """
    referencePath = os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")
    referenceMojo = H2OMOJOModel.createFromMojo("file://" + referencePath)

    prostatePath = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate = spark.read.csv(prostatePath, header=True, inferSchema=True)
    frame = hc.asH2OFrame(prostate)
    # Binary response must be a factor for the bernoulli distribution.
    frame["CAPSULE"] = frame["CAPSULE"].asfactor()

    gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=2, seed=42)
    gbm.train(y="CAPSULE", training_frame=frame)

    # Round-trip the freshly trained model through a MOJO file on disk.
    mojoFile = gbm.download_mojo(path=os.path.abspath("build/"), get_genmodel_jar=False)
    trainedMojo = H2OMOJOModel.createFromMojo("file://" + mojoFile)

    expect = referenceMojo.transform(prostate)
    result = trainedMojo.transform(prostate)
    unit_test_utils.assert_data_frames_are_identical(expect, result)
def testCustomMetric():
    """Train a GBM with an uploaded custom weighted-false-negative loss metric and
    verify both the metric's reported name and its computed value.
    """
    trainPath = "file://" + unit_test_utils.locate("smalldata/loan.csv")
    train = h2o.import_file(trainPath, destination_frame="loan_train")
    train["bad_loan"] = train["bad_loan"].asfactor()

    response = "bad_loan"
    predictors = train.col_names
    predictors.remove(response)
    # int_rate leaks information about the loan outcome, so exclude it.
    predictors.remove("int_rate")
    train["weight"] = train["loan_amnt"]

    customMetricFunc = h2o.upload_custom_metric(
        WeightedFalseNegativeLossMetric,
        func_name="WeightedFalseNegativeLoss",
        func_file="weighted_false_negative_loss.py")
    gbm = H2OGradientBoostingEstimator(model_id="gbm.hex", custom_metric_func=customMetricFunc)
    gbm.train(y=response, x=predictors, training_frame=train, weights_column="weight")

    perf = gbm.model_performance()
    assert perf.custom_metric_name() == "WeightedFalseNegativeLoss"
    # NOTE(review): exact float equality on a trained-model metric — presumably
    # deterministic with H2O's defaults on this data; confirm before bumping H2O.
    assert perf.custom_metric_value() == 0.24579011595430142
def dataset(spark):
    """Return the iris dataset (with header row) as a schema-inferred Spark DataFrame."""
    irisPath = "file://" + unit_test_utils.locate("smalldata/iris/iris_wheader.csv")
    return spark.read.csv(irisPath, header=True, inferSchema=True)
def dataset(spark):
    """Return the prostate dataset with the CAPSULE label column cast to string."""
    prostatePath = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate = spark.read.csv(prostatePath, header=True, inferSchema=True)
    return prostate.withColumn("CAPSULE", col("CAPSULE").cast("string"))
def prostateDataset(spark):
    """Return the raw prostate dataset as a schema-inferred Spark DataFrame."""
    csvLocation = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    return spark.read.csv(csvLocation, header=True, inferSchema=True)
def dataset(spark):
    """Return the insurance dataset with a log(Holders) "Offset" column appended."""
    insurancePath = "file://" + unit_test_utils.locate("smalldata/insurance.csv")
    insurance = spark.read.csv(insurancePath, header=True, inferSchema=True)
    return insurance.withColumn("Offset", log(col("Holders")))