def testMOJOModelReturnsSameResultAsBinaryModelWhenOffsetColumnsIsSet(hc, dataset):
    """Compare binary-model and MOJO predictions for a GBM trained with an offset column.

    The dataset is split 80/20 (seed 1), a tweedie GBM is trained on the
    training part with ``offset_column="Offset"``, its MOJO is downloaded into
    ``build/`` and loaded as an ``H2OMOJOModel``. The test passes when both
    models produce identical predictions on the testing part and the loaded
    MOJO model reports ``"Offset"`` as its offset column.
    """
    trainPart, testPart = dataset.randomSplit([0.8, 0.2], 1)
    h2oTrain = hc.as_h2o_frame(trainPart)
    h2oTest = hc.as_h2o_frame(testPart)

    estimatorParams = dict(
        distribution="tweedie",
        ntrees=600,
        max_depth=1,
        min_rows=1,
        learn_rate=0.1,
        min_split_improvement=0,
    )
    estimator = H2OGradientBoostingEstimator(**estimatorParams)
    estimator.train(
        x=["District", "Group", "Age"],
        y="Claims",
        training_frame=h2oTrain,
        offset_column="Offset",
    )

    # Export the trained model as a MOJO and load it back through Sparkling Water.
    mojoPath = estimator.download_mojo(path=os.path.abspath("build/"), get_genmodel_jar=False)
    print(mojoPath)
    loadedMojo = H2OMOJOModel.createFromMojo("file://" + mojoPath)

    fromBinaryModel = hc.as_spark_frame(estimator.predict(h2oTest))
    fromMojo = loadedMojo.transform(testPart).select("prediction")
    unit_test_utils.assert_data_frames_are_identical(fromBinaryModel, fromMojo)

    propagatedOffset = loadedMojo.getOffsetCol()
    assert propagatedOffset == "Offset", "Offset column must be propagated to the MOJO model."
def testTargetEncoderModelProduceSameResultsRegardlessSpecificationOfOutputCols(trainingDataset, testingDataset):
    """Check that default and explicitly named output columns carry the same encoded values.

    Two target encoders are configured identically (same input columns, label
    column, holdout strategy ``"None"`` and zero noise); only the second one
    sets explicit output column names. After the default ``*_te`` columns are
    renamed to the custom ``*_out`` names, both transformed frames must be
    identical.
    """

    def fitAndTransform(encoder):
        # Fit on the training data, then run the training-style transformation
        # on the testing data.
        fitted = encoder.fit(trainingDataset)
        return fitted.transformTrainingDataset(testingDataset)

    encoderWithDefaults = (
        H2OTargetEncoder()
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    withDefaultNames = fitAndTransform(encoderWithDefaults)
    # Align the default output names with the custom ones so the frames are comparable.
    renames = [
        ("RACE_te", "RACE_out"),
        ("DPROS_te", "DPROS_out"),
        ("DCAPS_te", "DCAPS_out"),
    ]
    for defaultName, customName in renames:
        withDefaultNames = withDefaultNames.withColumnRenamed(defaultName, customName)

    encoderWithCustomNames = (
        H2OTargetEncoder()
        .setInputCols(["RACE", "DPROS", "DCAPS"])
        .setOutputCols(["RACE_out", "DPROS_out", "DCAPS_out"])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    withCustomNames = fitAndTransform(encoderWithCustomNames)

    unit_test_utils.assert_data_frames_are_identical(withDefaultNames, withCustomNames)
def testPipelineSerialization(craiglistDataset):
    """Round-trip a tokenizer / stop-words / word2vec / GBM pipeline and its fitted model through disk.

    The unfitted pipeline is saved and reloaded, the reloaded pipeline is
    fitted on a 90% split (seed 42), and the fitted model is saved and
    reloaded as well. The reloaded model must transform the remaining 10%
    exactly like the in-memory model.
    """
    pipelinePath = "file://" + os.path.abspath("build/w2v_pipeline")
    modelPath = "file://" + os.path.abspath("build/w2v_pipeline_model")

    trainPart, testPart = craiglistDataset.randomSplit([0.9, 0.1], 42)

    # Each stage consumes the output column of the previous one.
    tokenizer = RegexTokenizer(inputCol="jobtitle", minTokenLength=2, outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(
        inputCol=tokenizer.getOutputCol(),
        outputCol="stopWordsRemoved",
    )
    w2v = H2OWord2Vec(
        sentSampleRate=0,
        epochs=10,
        inputCol=stopWordsRemover.getOutputCol(),
        outputCol="w2v",
    )
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])
    stages = [tokenizer, stopWordsRemover, w2v, gbm]

    Pipeline(stages=stages).write().overwrite().save(pipelinePath)
    restoredPipeline = Pipeline.load(pipelinePath)

    fittedModel = restoredPipeline.fit(trainPart)
    expected = fittedModel.transform(testPart)

    fittedModel.write().overwrite().save(modelPath)
    restoredModel = PipelineModel.load(modelPath)
    result = restoredModel.transform(testPart)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def testH2OAutoMLRegressorBehavesTheSameAsGenericH2OAutoMLOnNumericLabelColumn(prostateDataset):
    """Check that ``H2OAutoMLRegressor`` matches generic ``H2OAutoML`` on the unmodified dataset.

    Both estimators are configured through ``setParametersForTesting``, fitted
    on the same 90% split (seed 42) and applied to the remaining 10%; the two
    transformed frames must be identical.
    """
    trainPart, testPart = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference: the generic AutoML estimator.
    genericAutoML = setParametersForTesting(H2OAutoML())
    referenceDataset = genericAutoML.fit(trainPart).transform(testPart)

    # Candidate: the problem-specific regressor.
    regressor = setParametersForTesting(H2OAutoMLRegressor())
    result = regressor.fit(trainPart).transform(testPart)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
def testH2OAutoMLClassifierBehavesTheSameAsGenericH2OAutoMLOnStringLabelColumn(prostateDataset):
    """Check that ``H2OAutoMLClassifier`` matches generic ``H2OAutoML`` trained on a string label.

    The generic estimator is fitted on the training split with ``CAPSULE``
    cast to string, while the classifier is fitted on the training split as
    is. Both fitted models are applied to the same testing split and must
    produce identical frames.
    """
    trainPart, testPart = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference: generic AutoML, with the label column explicitly cast to string.
    genericAutoML = setParametersForTesting(H2OAutoML())
    trainWithStringLabel = trainPart.withColumn("CAPSULE", col("CAPSULE").cast("string"))
    referenceDataset = genericAutoML.fit(trainWithStringLabel).transform(testPart)

    # Candidate: the problem-specific classifier on the original training split.
    classifier = setParametersForTesting(H2OAutoMLClassifier())
    result = classifier.fit(trainPart).transform(testPart)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
def testTargetEncoderMOJOModelCouldBeSavedAndLoaded(trainingDataset, testingDataset):
    """Persist a fitted target-encoder model and verify the reloaded copy transforms identically."""
    savePath = "file://" + os.path.abspath("build/testTargetEncoderMOJOModelCouldBeSavedAndLoaded")

    encoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
    )
    fittedModel = encoder.fit(trainingDataset)

    # Write the model to disk and read it back as a target-encoder MOJO model.
    fittedModel.write().overwrite().save(savePath)
    restoredModel = H2OTargetEncoderMOJOModel.load(savePath)

    expected = fittedModel.transform(testingDataset)
    result = restoredModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def testTargetEncoderModelWithDisabledNoiseAndTargetEncoderMOJOModelTransformTheTrainingDatasetSameWay(trainingDataset):
    """Check that ``transformTrainingDataset`` and ``transform`` agree on the training data.

    The encoder uses grouped input columns, holdout strategy ``"None"`` and
    zero noise; under that configuration both transformation entry points of
    the fitted model must return identical frames for the training dataset.
    """
    encoder = (
        H2OTargetEncoder()
        .setInputCols([["RACE"], ["DPROS", "DCAPS"]])
        .setLabelCol("CAPSULE")
        .setHoldoutStrategy("None")
        .setNoise(0.0)
    )
    fittedModel = encoder.fit(trainingDataset)

    viaTrainingTransform = fittedModel.transformTrainingDataset(trainingDataset)
    viaRegularTransform = fittedModel.transform(trainingDataset)

    unit_test_utils.assert_data_frames_are_identical(viaTrainingTransform, viaRegularTransform)
def testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult(trainingDataset, testingDataset):
    """Fit a single-stage target-encoder pipeline, persist it, and compare it with its reloaded copy."""
    savePath = "file://" + os.path.abspath("build/testProducedMOJOModelAndLoadedMOJOModelReturnsSameResult")

    encoder = H2OTargetEncoder(
        labelCol="CAPSULE",
        inputCols=[["RACE"], ["DPROS", "DCAPS"]],
    )
    fittedPipeline = Pipeline(stages=[encoder]).fit(trainingDataset)

    # Round-trip the fitted pipeline model through the file system.
    fittedPipeline.write().overwrite().save(savePath)
    restoredPipeline = PipelineModel.load(savePath)

    fromProduced = fittedPipeline.transform(testingDataset)
    fromRestored = restoredPipeline.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(fromProduced, fromRestored)
def testH2OFrameOfSpecificTypeToDataframe(spark, hc, data, sparkType):
    """Check that an H2OFrame built from ``data`` converts to a Spark frame of the expected type.

    A single-column, non-nullable Spark DataFrame of ``sparkType`` is built
    directly from ``data`` as the reference. The same values are loaded into
    an ``H2OFrame`` and converted with ``hc.asSparkFrame``; both the contents
    and the ``dtypes`` of the two frames must match.
    """
    columnName = "A"

    # Reference frame: one single-element row per input value.
    singleColumnSchema = StructType([StructField(columnName, sparkType, False)])
    rows = [(value,) for value in data]
    originalDF = spark.createDataFrame(rows, singleColumnSchema)

    h2oFrame = h2o.H2OFrame(data, column_names=[columnName])
    transformedDF = hc.asSparkFrame(h2oFrame)

    unit_test_utils.assert_data_frames_are_identical(originalDF, transformedDF)
    assert originalDF.dtypes == transformedDF.dtypes
def testGridSearchWithDRFRegressorBehavesTheSameAsGridSearchWithGenericDRFOnNumericLabelColumn(prostateDataset):
    """Check that grid search over ``H2ODRFRegressor`` matches grid search over generic ``H2ODRF``.

    Both grids are built by ``createGridForProblemSpecificTesting``, fitted on
    the same 90% split (seed 42) and applied to the remaining 10%; the
    resulting frames must be identical.
    """
    trainPart, testPart = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference: grid search wrapping the generic DRF algorithm.
    genericGrid = createGridForProblemSpecificTesting(H2ODRF())
    referenceDataset = genericGrid.fit(trainPart).transform(testPart)

    # Candidate: grid search wrapping the regression-specific DRF.
    regressorGrid = createGridForProblemSpecificTesting(H2ODRFRegressor())
    result = regressorGrid.fit(trainPart).transform(testPart)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
def testLoadAndTrainMojo(hc, spark):
    """Compare a checked-in reference MOJO with a MOJO trained on the fly from the prostate data.

    A bernoulli GBM (2 trees, seed 42) is trained on the prostate CSV with
    ``CAPSULE`` converted to a factor, exported as a MOJO into ``build/`` and
    loaded back. Its predictions on the CSV data must be identical to those
    of the reference MOJO stored under the ``ml`` module's test resources.
    """
    referencePath = "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo")
    referenceMojo = H2OMOJOModel.createFromMojo(referencePath)

    csvPath = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostateDF = spark.read.csv(csvPath, header=True, inferSchema=True)

    h2oFrame = hc.asH2OFrame(prostateDF)
    # Convert the label column to a factor before training the bernoulli model.
    h2oFrame["CAPSULE"] = h2oFrame["CAPSULE"].asfactor()

    estimator = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=2, seed=42)
    estimator.train(y="CAPSULE", training_frame=h2oFrame)

    exportedPath = estimator.download_mojo(path=os.path.abspath("build/"), get_genmodel_jar=False)
    trainedMojo = H2OMOJOModel.createFromMojo("file://" + exportedPath)

    expect = referenceMojo.transform(prostateDF)
    result = trainedMojo.transform(prostateDF)

    unit_test_utils.assert_data_frames_are_identical(expect, result)
def testPipelineSerialization(prostateDataset):
    """Round-trip an isolation-forest pipeline and its fitted model through disk.

    The unfitted single-stage pipeline is saved and reloaded, the reloaded
    pipeline is fitted on the dataset, and the fitted model is saved and
    reloaded too. Both the in-memory and the reloaded model must transform
    the dataset identically.
    """
    pipelinePath = "file://" + os.path.abspath("build/isolation_forest_pipeline")
    modelPath = "file://" + os.path.abspath("build/isolation_forest_pipeline_model")

    isolationForest = H2OIsolationForest(seed=1)

    Pipeline(stages=[isolationForest]).write().overwrite().save(pipelinePath)
    restoredPipeline = Pipeline.load(pipelinePath)

    fittedModel = restoredPipeline.fit(prostateDataset)
    expected = fittedModel.transform(prostateDataset)

    fittedModel.write().overwrite().save(modelPath)
    restoredModel = PipelineModel.load(modelPath)
    result = restoredModel.transform(prostateDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def testGridSearchWithDRFClassifierBehavesTheSameAsGridSearchWithGenericDRFOnStringLabelColumn(prostateDataset):
    """Check that grid search over ``H2ODRFClassifier`` matches generic ``H2ODRF`` trained on a string label.

    The generic grid is fitted on the training split with ``CAPSULE`` cast to
    string, while the classifier grid is fitted on the training split as is.
    Both fitted models are applied to the same testing split and must produce
    identical frames.
    """
    trainPart, testPart = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference: generic DRF grid, with the label column explicitly cast to string.
    genericGrid = createGridForProblemSpecificTesting(H2ODRF())
    trainWithStringLabel = trainPart.withColumn("CAPSULE", col("CAPSULE").cast("string"))
    referenceDataset = genericGrid.fit(trainWithStringLabel).transform(testPart)

    # Candidate: classification-specific DRF grid on the original training split.
    classifierGrid = createGridForProblemSpecificTesting(H2ODRFClassifier())
    result = classifierGrid.fit(trainPart).transform(testPart)

    unit_test_utils.assert_data_frames_are_identical(referenceDataset, result)
def testPipelineSerialization(heartDataset):
    """Round-trip a CoxPH pipeline and its fitted model through disk.

    The unfitted single-stage pipeline is saved and reloaded, the reloaded
    pipeline is fitted on the dataset, and the fitted model is saved and
    reloaded too. Both the in-memory and the reloaded model must transform
    the dataset identically.

    The fitted model is written to its own ``cox_ph_pipeline_model``
    directory so it does not overwrite the saved unfitted pipeline.
    """
    pipelinePath = "file://" + os.path.abspath("build/cox_ph_pipeline")
    # Distinct location for the fitted model; reusing pipelinePath would
    # clobber the unfitted pipeline written above.
    modelPath = "file://" + os.path.abspath("build/cox_ph_pipeline_model")

    features = ['age', 'year', 'surgery', 'transplant', 'start', 'stop']
    algo = H2OCoxPH(labelCol="event", featuresCols=features, startCol='start', stopCol='stop')

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save(pipelinePath)
    loadedPipeline = Pipeline.load(pipelinePath)

    model = loadedPipeline.fit(heartDataset)
    expected = model.transform(heartDataset)

    model.write().overwrite().save(modelPath)
    loadedModel = PipelineModel.load(modelPath)
    result = loadedModel.transform(heartDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def _fit(self, dataset):
    """Verify the dataset handed to this stage and return a ``DummyTransformer``.

    ``dataset`` must be identical to ``expected`` and must differ in values
    from ``unexpected``.

    NOTE(review): ``expected`` and ``unexpected`` are not defined in this
    method; presumably they come from the enclosing test's scope -- confirm
    against the surrounding code.
    """
    assertIdentical = unit_test_utils.assert_data_frames_are_identical
    assertDifferent = unit_test_utils.assert_data_frames_have_different_values

    assertIdentical(expected, dataset)
    assertDifferent(unexpected, dataset)

    return DummyTransformer()