def test_glm_in_spark_pipeline(self):
    """Round-trip an H2OGLM pipeline and its fitted model through disk.

    The unfitted pipeline is saved and reloaded, the reloaded pipeline is
    fitted on the prostate CSV, and the fitted model is saved, reloaded and
    finally used to transform the same frame.
    """

    def build_uri(relative_path):
        # Resolved on every call, exactly like the inline expressions it replaces.
        return "file://" + os.path.abspath(relative_path)

    csv_uri = "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(csv_uri, header=True, inferSchema=True)

    predictors = ["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    # NOTE(review): other H2OGLM call sites pass `splitRatio`; confirm that
    # `ratio` is the parameter name accepted by the H2OGLM version under test.
    glm = H2OGLM(featuresCols=predictors, labelCol="AGE", seed=1, ratio=0.8)

    # Persist the unfitted pipeline and fit the copy read back from disk.
    Pipeline(stages=[glm]).write().overwrite().save(build_uri("build/glm_pipeline"))
    reloaded_pipeline = Pipeline.load(build_uri("build/glm_pipeline"))
    fitted_model = reloaded_pipeline.fit(prostate_frame)

    # Persist the fitted model and score with the copy read back from disk.
    fitted_model.write().overwrite().save(build_uri("build/glm_pipeline_model"))
    reloaded_model = PipelineModel.load(build_uri("build/glm_pipeline_model"))

    # No assertion: the test passes when the whole round trip raises nothing.
    reloaded_model.transform(prostate_frame).count()
def createInitialGlmDefinitionForRandomCols():
    """Return an unfitted H2OGLM configured with HGLM and random-effect settings.

    The estimator uses features x1, x3, x5 and x6, label column "y", a
    gaussian family, and a gaussian random family with an identity link.
    """
    definition = H2OGLM(
        featuresCols=["x1", "x3", "x5", "x6"],
        labelCol="y",
        family="gaussian",
        randomFamily=["gaussian"],
        randomLink=["identity"],
        HGLM=True,
        calcLike=True,
    )
    return definition
def createInitialGlmDefinition():
    """Return an unfitted H2OGLM that predicts "cylinders" from seven feature columns."""
    return H2OGLM(
        featuresCols=[
            "economy",
            "displacement",
            "power",
            "weight",
            "acceleration",
            "year",
            "economy_20mpg",
        ],
        labelCol="cylinders",
        seed=1,
        splitRatio=0.8,
    )
def createInitialGlmDefinition():
    """Return an unfitted binomial H2OGLM with lambda search for "IsDepDelayed"."""
    predictors = [
        "Year",
        "Month",
        "DayofMonth",
        "DayOfWeek",
        "CRSDepTime",
        "CRSArrTime",
        "UniqueCarrier",
        "CRSElapsedTime",
        "Origin",
        "Dest",
        "Distance",
    ]
    definition = H2OGLM(
        seed=42,
        family="binomial",
        lambdaSearch=True,
        featuresCols=predictors,
        labelCol="IsDepDelayed",
    )
    return definition
def testPropagationOfPredictionCol(prostateDataset):
    """Check that a custom predictionCol name shows up in the transformed output.

    An H2OGLM is fitted on ``prostateDataset`` with ``predictionCol`` set to a
    non-default name; the columns of the transformed dataset must contain it.
    """
    predictionCol = "my_prediction_col_name"
    algo = H2OGLM(
        featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="AGE",
        seed=1,
        splitRatio=0.8,
        predictionCol=predictionCol,
    )

    model = algo.fit(prostateDataset)
    columns = model.transform(prostateDataset).columns

    # Direct membership assert instead of `True == (...)`: same truth value,
    # but the failing expression now names both operands.
    assert predictionCol in columns
def testH2OGLMRegressorBehavesTheSameAsGenericH2OGLMOnNumericLabelColumn(
        prostateDataset):
    """Compare H2OGLMRegressor output against generic H2OGLM on the same split.

    Both estimators are configured through setParamtersForProblemSpecificTests,
    fitted on the 90% split and applied to the 10% split; the two transformed
    datasets must be identical.
    """
    # Fixed seed (42) so both estimators see the same train/test partition.
    trainingDataset, testingDataset = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference: the generic estimator.
    genericGlm = setParamtersForProblemSpecificTests(H2OGLM())
    expected = genericGlm.fit(trainingDataset).transform(testingDataset)

    # Candidate: the regression-specific estimator.
    regressor = setParamtersForProblemSpecificTests(H2OGLMRegressor())
    actual = regressor.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)
def testInteractionColumnNamesArePassedWithoutException(spark):
    """Fit a gaussian H2OGLM with PlugValues handling and expect no exception.

    The plug values include the keys "x_y.a" and "x_y.b" next to the plain
    column names "x" and "y" (presumably x-by-y interaction column names, as
    the test name suggests). There is no assertion; the test passes when
    ``fit`` completes.
    """
    rows = [
        (0.0, "a", 2.0),
        (float("nan"), "b", 8.0),  # NaN in "x" is the missing value to be plugged
        (0.0, "a", 4.0),
        (1.0, "b", 1.0),
    ]
    frame = spark.createDataFrame(rows, ["x", "y", "z"])

    replacements = {"x": 0, "x_y.a": 1, "x_y.b": 2, "y": "b"}
    estimator = H2OGLM(
        labelCol="z",
        seed=42,
        ignoreConstCols=False,
        standardize=False,
        family="gaussian",
        missingValuesHandling="PlugValues",
        plugValues=replacements,
    )
    estimator.fit(frame)
def testH2OGLMClassifierBehavesTheSameAsGenericH2OGLMOnStringLabelColumn(
        prostateDataset):
    """Compare H2OGLMClassifier output against generic H2OGLM on a string label.

    The generic estimator is fitted on training data whose CAPSULE column is
    cast to string, while the classifier is fitted on the uncast training
    data. Both models transform the same testing split and the results must
    be identical.
    """
    # Fixed seed (42) so both estimators see the same train/test partition.
    trainingDataset, testingDataset = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference: generic estimator, trained with CAPSULE as a string column.
    stringLabelTraining = trainingDataset.withColumn(
        "CAPSULE", col("CAPSULE").cast("string"))
    genericGlm = setParamtersForProblemSpecificTests(H2OGLM())
    expected = genericGlm.fit(stringLabelTraining).transform(testingDataset)

    # Candidate: classifier trained on the original (uncast) column.
    classifier = setParamtersForProblemSpecificTests(H2OGLMClassifier())
    actual = classifier.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)
def test_propagation_of_prediction_col(self):
    """Check that a custom predictionCol name shows up in the transformed output.

    The prostate CSV is read through ``self._spark``, an H2OGLM is fitted with
    ``predictionCol`` set to a non-default name, and the columns of the
    transformed frame must contain that name.
    """
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True,
        inferSchema=True)

    predictionCol = "my_prediction_col_name"
    algo = H2OGLM(
        featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
        labelCol="AGE",
        seed=1,
        splitRatio=0.8,
        predictionCol=predictionCol)

    model = algo.fit(prostate_frame)
    columns = model.transform(prostate_frame).columns

    # assertIn replaces assertEquals(True, ...): the assertEquals alias is
    # deprecated and was removed in Python 3.12, and assertIn reports the
    # actual column list when the check fails.
    self.assertIn(predictionCol, columns)
def testPipelineSerialization(prostateDataset):
    """Round-trip an H2OGLM pipeline and its fitted model through disk.

    The unfitted pipeline is saved and reloaded, the reloaded pipeline is
    fitted on ``prostateDataset``, and the fitted model is saved, reloaded and
    used to transform the same dataset.
    """

    def buildUri(relativePath):
        # Resolved on every call, exactly like the inline expressions it replaces.
        return "file://" + os.path.abspath(relativePath)

    predictors = ["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"]
    glm = H2OGLM(featuresCols=predictors, labelCol="AGE", seed=1, splitRatio=0.8)

    # Persist the unfitted pipeline and fit the copy read back from disk.
    Pipeline(stages=[glm]).write().overwrite().save(buildUri("build/glm_pipeline"))
    reloadedPipeline = Pipeline.load(buildUri("build/glm_pipeline"))
    fittedModel = reloadedPipeline.fit(prostateDataset)

    # Persist the fitted model and score with the copy read back from disk.
    fittedModel.write().overwrite().save(buildUri("build/glm_pipeline_model"))
    reloadedModel = PipelineModel.load(buildUri("build/glm_pipeline_model"))

    # No assertion: the test passes when the whole round trip raises nothing.
    reloadedModel.transform(prostateDataset).count()
def testPipelineSerializationGLM(prostateDataset):
    """Run gridSearchTester with an H2OGLM whose label column is set to "AGE".

    All checks happen inside gridSearchTester, which is defined outside this
    block; its result is not inspected here.
    """
    glm = H2OGLM().setLabelCol("AGE")
    gridSearchTester(glm, prostateDataset)
def createInitialGlmDefinition():
    """Return an unfitted H2OGLM that predicts "CAPSULE".

    ``featuresCols`` is not defined in this function; it is resolved from the
    enclosing scope at call time.
    """
    definition = H2OGLM(
        featuresCols=featuresCols,
        labelCol="CAPSULE",
        seed=1,
        splitRatio=0.8,
    )
    return definition