def test_glm_in_spark_pipeline(self):
    """Round-trip an H2OGLM pipeline and its fitted model through disk.

    Steps, in order:

    1. Read ``smalldata/prostate/prostate.csv`` through ``self._spark``.
    2. Save an unfitted single-stage ``Pipeline`` to ``build/glm_pipeline``
       and load it back.
    3. Fit the reloaded pipeline on the prostate frame.
    4. Save the fitted model to ``build/glm_pipeline_model`` and load it
       back as a ``PipelineModel``.
    5. Run ``transform(...).count()`` on the reloaded model.

    The test contains no explicit assertions; it passes when none of the
    steps raises.
    """
    csv_uri = "file://" + unit_test_utils.locate(
        "smalldata/prostate/prostate.csv")
    prostate_frame = self._spark.read.csv(csv_uri,
                                          header=True,
                                          inferSchema=True)

    feature_names = [
        "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
    ]
    algo = H2OGLM(featuresCols=feature_names,
                  labelCol="AGE",
                  seed=1,
                  ratio=0.8)

    # Unfitted pipeline: save, then load from the same location.
    pipeline_uri = "file://" + os.path.abspath("build/glm_pipeline")
    Pipeline(stages=[algo]).write().overwrite().save(pipeline_uri)
    fitted = Pipeline.load(pipeline_uri).fit(prostate_frame)

    # Fitted model: save, then load from the same location.
    model_uri = "file://" + os.path.abspath("build/glm_pipeline_model")
    fitted.write().overwrite().save(model_uri)
    reloaded = PipelineModel.load(model_uri)

    # The count itself is discarded; presumably it is only here to force
    # the transformed frame to be evaluated.
    reloaded.transform(prostate_frame).count()
Exemple #2
0
def createInitialGlmDefinitionForRandomCols():
    """Build an H2OGLM estimator configured with ``HGLM=True``.

    Returns:
        An ``H2OGLM`` using features ``x1``, ``x3``, ``x5``, ``x6`` and label
        ``y``, with a gaussian family, a single gaussian random family with
        an identity random link, and ``calcLike`` enabled.
    """
    settings = {
        "featuresCols": ["x1", "x3", "x5", "x6"],
        "labelCol": "y",
        "family": "gaussian",
        "randomFamily": ["gaussian"],
        "randomLink": ["identity"],
        "HGLM": True,
        "calcLike": True,
    }
    return H2OGLM(**settings)
Exemple #3
0
 def createInitialGlmDefinition():
     """Build an H2OGLM estimator that uses ``cylinders`` as the label.

     Returns:
         An ``H2OGLM`` with seven feature columns, ``seed=1`` and
         ``splitRatio=0.8``.
     """
     glm = H2OGLM(
         featuresCols=[
             "economy",
             "displacement",
             "power",
             "weight",
             "acceleration",
             "year",
             "economy_20mpg",
         ],
         labelCol="cylinders",
         seed=1,
         splitRatio=0.8,
     )
     return glm
Exemple #4
0
 def createInitialGlmDefinition():
     """Build a binomial H2OGLM estimator with ``IsDepDelayed`` as the label.

     Returns:
         An ``H2OGLM`` with ``seed=42``, ``family="binomial"``, lambda search
         enabled and eleven feature columns.
     """
     settings = {
         "seed": 42,
         "family": "binomial",
         "lambdaSearch": True,
         "featuresCols": [
             "Year", "Month", "DayofMonth", "DayOfWeek",
             "CRSDepTime", "CRSArrTime", "UniqueCarrier",
             "CRSElapsedTime", "Origin", "Dest", "Distance",
         ],
         "labelCol": "IsDepDelayed",
     }
     return H2OGLM(**settings)
Exemple #5
0
def testPropagationOfPredictionCol(prostateDataset):
    """Check that a custom ``predictionCol`` name appears in the output.

    An ``H2OGLM`` is configured with a non-default prediction column name,
    fitted on ``prostateDataset`` and used to transform the same dataset.
    The test passes when the configured name is among the columns of the
    transformed frame.

    Args:
        prostateDataset: Frame used for both fitting and transforming. It
            must provide the feature columns listed below and ``AGE``.
    """
    predictionCol = "my_prediction_col_name"
    algo = H2OGLM(
        featuresCols=[
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ],
        labelCol="AGE",
        seed=1,
        splitRatio=0.8,
        predictionCol=predictionCol,
    )

    model = algo.fit(prostateDataset)
    columns = model.transform(prostateDataset).columns
    # Assert membership directly instead of comparing the result to True.
    assert predictionCol in columns
Exemple #6
0
def testH2OGLMRegressorBehavesTheSameAsGenericH2OGLMOnNumericLabelColumn(
        prostateDataset):
    """Compare ``H2OGLMRegressor`` output with generic ``H2OGLM`` output.

    ``prostateDataset`` is split 90/10 with seed 42. Both estimators are
    configured through ``setParamtersForProblemSpecificTests``, fitted on the
    training part and applied to the testing part. The two transformed frames
    must be identical.

    Args:
        prostateDataset: Frame to split into training and testing parts.
    """
    trainingPart, testingPart = prostateDataset.randomSplit([0.9, 0.1], 42)

    # Reference result from the generic estimator.
    genericGlm = setParamtersForProblemSpecificTests(H2OGLM())
    expected = genericGlm.fit(trainingPart).transform(testingPart)

    # Result from the problem-specific regressor.
    regressor = setParamtersForProblemSpecificTests(H2OGLMRegressor())
    actual = regressor.fit(trainingPart).transform(testingPart)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)
Exemple #7
0
def testInteractionColumnNamesArePassedWithoutException(spark):
    """Fit an H2OGLM whose ``plugValues`` contain extra, non-column keys.

    The input frame has columns ``x``, ``y`` and ``z`` only. The plug values
    also carry the keys ``x_y.a`` and ``x_y.b``, which are not columns of
    that frame. The test has no assertions; it passes when ``fit`` completes
    without raising.

    Args:
        spark: Session object used to create the input DataFrame.
    """
    rows = [
        (0.0, "a", 2.0),
        (float("nan"), "b", 8.0),
        (0.0, "a", 4.0),
        (1.0, "b", 1.0),
    ]
    frame = spark.createDataFrame(rows, ["x", "y", "z"])

    estimator = H2OGLM(
        labelCol="z",
        seed=42,
        ignoreConstCols=False,
        standardize=False,
        family="gaussian",
        missingValuesHandling="PlugValues",
        plugValues={"x": 0, "x_y.a": 1, "x_y.b": 2, "y": "b"},
    )

    estimator.fit(frame)
Exemple #8
0
def testH2OGLMClassifierBehavesTheSameAsGenericH2OGLMOnStringLabelColumn(
        prostateDataset):
    """Compare ``H2OGLMClassifier`` output with generic ``H2OGLM`` output.

    ``prostateDataset`` is split 90/10 with seed 42. The generic ``H2OGLM``
    is fitted on the training part with ``CAPSULE`` cast to string, while
    ``H2OGLMClassifier`` is fitted on the unmodified training part. Both
    models transform the same testing part and the resulting frames must be
    identical.

    Args:
        prostateDataset: Frame to split into training and testing parts.
    """
    trainingPart, testingPart = prostateDataset.randomSplit([0.9, 0.1], 42)

    genericGlm = setParamtersForProblemSpecificTests(H2OGLM())
    # NOTE(review): the cast presumably makes the generic estimator treat
    # CAPSULE as a categorical label - confirm against H2OGLM behaviour.
    stringLabelTraining = trainingPart.withColumn(
        "CAPSULE", col("CAPSULE").cast("string"))
    expected = genericGlm.fit(stringLabelTraining).transform(testingPart)

    classifier = setParamtersForProblemSpecificTests(H2OGLMClassifier())
    actual = classifier.fit(trainingPart).transform(testingPart)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)
    def test_propagation_of_prediction_col(self):
        """Check that a custom ``predictionCol`` name appears in the output.

        Reads ``smalldata/prostate/prostate.csv`` through ``self._spark``,
        fits an ``H2OGLM`` configured with a non-default prediction column
        name and verifies that the name is among the columns of the
        transformed frame.
        """
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        predictionCol = "my_prediction_col_name"
        algo = H2OGLM(featuresCols=[
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ],
                      labelCol="AGE",
                      seed=1,
                      splitRatio=0.8,
                      predictionCol=predictionCol)

        model = algo.fit(prostate_frame)
        columns = model.transform(prostate_frame).columns
        # assertIn replaces the deprecated assertEquals alias (removed in
        # Python 3.12) and reports the actual column list on failure.
        self.assertIn(predictionCol, columns)
Exemple #10
0
def testPipelineSerialization(prostateDataset):
    """Round-trip an H2OGLM pipeline and its fitted model through disk.

    An unfitted single-stage ``Pipeline`` is saved to ``build/glm_pipeline``
    and loaded back, then fitted on ``prostateDataset``. The fitted model is
    saved to ``build/glm_pipeline_model``, loaded back as a ``PipelineModel``
    and used for ``transform(...).count()``. There are no explicit
    assertions; the test passes when none of the steps raises.

    Args:
        prostateDataset: Frame used for both fitting and transforming.
    """
    featureNames = [
        "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
    ]
    algo = H2OGLM(featuresCols=featureNames,
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8)

    # Unfitted pipeline: save, then load from the same location.
    pipelineUri = "file://" + os.path.abspath("build/glm_pipeline")
    Pipeline(stages=[algo]).write().overwrite().save(pipelineUri)
    fitted = Pipeline.load(pipelineUri).fit(prostateDataset)

    # Fitted model: save, then load from the same location.
    modelUri = "file://" + os.path.abspath("build/glm_pipeline_model")
    fitted.write().overwrite().save(modelUri)
    reloaded = PipelineModel.load(modelUri)

    # The count itself is discarded; presumably it is only here to force
    # the transformed frame to be evaluated.
    reloaded.transform(prostateDataset).count()
def testPipelineSerializationGLM(prostateDataset):
    """Run ``gridSearchTester`` for an H2OGLM with label column ``AGE``.

    NOTE(review): the name mentions pipeline serialization, but only the
    call to ``gridSearchTester`` is visible here - confirm what that helper
    covers.

    Args:
        prostateDataset: Frame handed to ``gridSearchTester``.
    """
    algo = H2OGLM().setLabelCol("AGE")
    gridSearchTester(algo, prostateDataset)
Exemple #12
0
 def createInitialGlmDefinition():
     """Build an H2OGLM estimator with ``CAPSULE`` as the label.

     ``featuresCols`` is not defined inside this function; it is looked up
     in the enclosing scope when the function is called.

     Returns:
         An ``H2OGLM`` with ``seed=1`` and ``splitRatio=0.8``.
     """
     glm = H2OGLM(
         featuresCols=featuresCols,
         labelCol="CAPSULE",
         seed=1,
         splitRatio=0.8,
     )
     return glm