Python H2OGLM Exemples, pysparkling.ml.algos.H2OGLM Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : tests_unit_conversions.py Projet : evergage/sparkling-water

    def test_glm_in_spark_pipeline(self):
        """Save and reload an H2OGLM pipeline, then do the same for its model.

        The unfitted pipeline is written to ``build/glm_pipeline``, loaded
        back and fitted on the prostate CSV; the resulting pipeline model is
        written to ``build/glm_pipeline_model``, loaded back and used to
        transform the same frame. The test has no explicit assertion: it
        passes when none of these steps raises.
        """
        csv_uri = "file://" + unit_test_utils.locate(
            "smalldata/prostate/prostate.csv")
        prostate_frame = self._spark.read.csv(csv_uri,
                                              header=True,
                                              inferSchema=True)

        feature_names = [
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ]
        # NOTE(review): other examples pass `splitRatio` instead of `ratio`;
        # confirm which keyword the targeted H2OGLM version accepts.
        algo = H2OGLM(featuresCols=feature_names,
                      labelCol="AGE",
                      seed=1,
                      ratio=0.8)

        # Round-trip the unfitted pipeline through the local file system.
        pipeline_uri = "file://" + os.path.abspath("build/glm_pipeline")
        Pipeline(stages=[algo]).write().overwrite().save(pipeline_uri)
        loaded_pipeline = Pipeline.load(pipeline_uri)
        model = loaded_pipeline.fit(prostate_frame)

        # Round-trip the fitted pipeline model as well.
        model_uri = "file://" + os.path.abspath("build/glm_pipeline_model")
        model.write().overwrite().save(model_uri)
        loaded_model = PipelineModel.load(model_uri)

        # count() forces the transformation to be evaluated.
        loaded_model.transform(prostate_frame).count()

Exemple #2

0

Afficher le fichier

def createInitialGlmDefinitionForRandomCols():
    """Return an H2OGLM configured with HGLM and random-column settings.

    The estimator uses features ``x1``, ``x3``, ``x5`` and ``x6``, label
    ``y``, a gaussian family, a gaussian random family with identity random
    link, and has both ``HGLM`` and ``calcLike`` switched on.
    """
    glmSettings = dict(
        featuresCols=["x1", "x3", "x5", "x6"],
        labelCol="y",
        family="gaussian",
        randomFamily=["gaussian"],
        randomLink=["identity"],
        HGLM=True,
        calcLike=True,
    )
    return H2OGLM(**glmSettings)

Exemple #3

0

Afficher le fichier

 def createInitialGlmDefinition():
     """Return an H2OGLM with ``cylinders`` as label and seven feature columns.

     The estimator is created with ``seed=1`` and ``splitRatio=0.8``.
     """
     return H2OGLM(
         featuresCols=[
             "economy",
             "displacement",
             "power",
             "weight",
             "acceleration",
             "year",
             "economy_20mpg",
         ],
         labelCol="cylinders",
         seed=1,
         splitRatio=0.8,
     )

Exemple #4

0

Afficher le fichier

 def createInitialGlmDefinition():
     """Return a binomial H2OGLM with lambda search, labelled ``IsDepDelayed``.

     The estimator is created with ``seed=42`` and eleven feature columns
     listed below.
     """
     inputColumns = [
         "Year", "Month", "DayofMonth", "DayOfWeek",
         "CRSDepTime", "CRSArrTime", "UniqueCarrier",
         "CRSElapsedTime", "Origin", "Dest", "Distance"
     ]
     return H2OGLM(seed=42,
                   family="binomial",
                   lambdaSearch=True,
                   featuresCols=inputColumns,
                   labelCol="IsDepDelayed")

Exemple #5

0

Afficher le fichier

def testPropagationOfPredictionCol(prostateDataset):
    """Check that a custom ``predictionCol`` name appears in the output.

    An H2OGLM is fitted on ``prostateDataset`` with ``predictionCol`` set to
    a custom name; the test asserts that the transformed frame has a column
    with exactly that name.

    Args:
        prostateDataset: frame passed to both ``fit`` and ``transform``.
    """
    predictionCol = "my_prediction_col_name"
    algo = H2OGLM(featuresCols=[
        "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
    ],
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8,
                  predictionCol=predictionCol)

    model = algo.fit(prostateDataset)
    columns = model.transform(prostateDataset).columns
    # Assert membership directly rather than comparing against True, so a
    # failure reports the actual operands instead of "True == False".
    assert predictionCol in columns

Exemple #6

0

Afficher le fichier

def testH2OGLMRegressorBehavesTheSameAsGenericH2OGLMOnNumericLabelColumn(
        prostateDataset):
    """Compare predictions of H2OGLMRegressor with those of generic H2OGLM.

    ``prostateDataset`` is split 90/10 with seed 42. Both estimators are
    configured by ``setParamtersForProblemSpecificTests``, fitted on the
    training part and applied to the testing part; the two resulting data
    frames must be identical.
    """
    trainingDataset, testingDataset = prostateDataset.randomSplit(
        [0.9, 0.1], 42)

    # Reference result from the generic estimator.
    genericGlm = setParamtersForProblemSpecificTests(H2OGLM())
    expected = genericGlm.fit(trainingDataset).transform(testingDataset)

    # Result from the problem-specific estimator.
    regressor = setParamtersForProblemSpecificTests(H2OGLMRegressor())
    actual = regressor.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)

Exemple #7

0

Afficher le fichier

def testInteractionColumnNamesArePassedWithoutException(spark):
    """Fit an H2OGLM whose ``plugValues`` keys include ``x_y.a`` / ``x_y.b``.

    A four-row frame with columns ``x``, ``y``, ``z`` (one ``x`` value is
    NaN) is created and a gaussian H2OGLM with
    ``missingValuesHandling="PlugValues"`` is fitted on it. The test has no
    assertion: it passes when ``fit`` does not raise.

    Args:
        spark: session object used to create the data frame.
    """
    rows = [
        (0.0, "a", 2.0),
        (float("nan"), "b", 8.0),
        (0.0, "a", 4.0),
        (1.0, "b", 1.0),
    ]
    frame = spark.createDataFrame(rows, ["x", "y", "z"])

    estimator = H2OGLM(
        labelCol="z",
        seed=42,
        ignoreConstCols=False,
        standardize=False,
        family="gaussian",
        missingValuesHandling="PlugValues",
        plugValues={"x": 0, "x_y.a": 1, "x_y.b": 2, "y": "b"},
    )
    estimator.fit(frame)

Exemple #8

0

Afficher le fichier

def testH2OGLMClassifierBehavesTheSameAsGenericH2OGLMOnStringLabelColumn(
        prostateDataset):
    """Compare predictions of H2OGLMClassifier with those of generic H2OGLM.

    ``prostateDataset`` is split 90/10 with seed 42. The generic H2OGLM is
    fitted on the training part with ``CAPSULE`` cast to string, while
    H2OGLMClassifier is fitted on the training part as-is. Both models
    transform the testing part and the resulting frames must be identical.
    """
    trainingDataset, testingDataset = prostateDataset.randomSplit(
        [0.9, 0.1], 42)

    # Reference: generic estimator trained on a string-typed CAPSULE column.
    genericGlm = setParamtersForProblemSpecificTests(H2OGLM())
    stringLabelTraining = trainingDataset.withColumn(
        "CAPSULE", col("CAPSULE").cast("string"))
    expected = genericGlm.fit(stringLabelTraining).transform(testingDataset)

    # Classifier trained on the unmodified training data.
    glmClassifier = setParamtersForProblemSpecificTests(H2OGLMClassifier())
    actual = glmClassifier.fit(trainingDataset).transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, actual)

Exemple #9

0

Afficher le fichier

Fichier : tests_unit_conversions.py Projet : vikasinfrrd/sparkling-water

    def test_propagation_of_prediction_col(self):
        """Check that a custom ``predictionCol`` name appears in the output.

        An H2OGLM is fitted on the prostate CSV with ``predictionCol`` set
        to a custom name; the transformed frame must contain a column with
        exactly that name.
        """
        prostate_frame = self._spark.read.csv(
            "file://" +
            unit_test_utils.locate("smalldata/prostate/prostate.csv"),
            header=True,
            inferSchema=True)

        predictionCol = "my_prediction_col_name"
        algo = H2OGLM(featuresCols=[
            "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
        ],
                      labelCol="AGE",
                      seed=1,
                      splitRatio=0.8,
                      predictionCol=predictionCol)

        model = algo.fit(prostate_frame)
        columns = model.transform(prostate_frame).columns
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # assertIn also reports the missing name and the column list on
        # failure.
        self.assertIn(predictionCol, columns)

Exemple #10

0

Afficher le fichier

def testPipelineSerialization(prostateDataset):
    """Save and reload an H2OGLM pipeline, then do the same for its model.

    The unfitted pipeline is written to ``build/glm_pipeline``, loaded back
    and fitted on ``prostateDataset``; the fitted pipeline model is written
    to ``build/glm_pipeline_model``, loaded back and used to transform the
    same dataset. The test has no explicit assertion: it passes when none
    of these steps raises.

    Args:
        prostateDataset: frame passed to both ``fit`` and ``transform``.
    """
    featureNames = [
        "CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"
    ]
    algo = H2OGLM(featuresCols=featureNames,
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8)

    # Round-trip the unfitted pipeline through the local file system.
    pipelineUri = "file://" + os.path.abspath("build/glm_pipeline")
    Pipeline(stages=[algo]).write().overwrite().save(pipelineUri)
    loadedPipeline = Pipeline.load(pipelineUri)
    model = loadedPipeline.fit(prostateDataset)

    # Round-trip the fitted pipeline model as well.
    modelUri = "file://" + os.path.abspath("build/glm_pipeline_model")
    model.write().overwrite().save(modelUri)
    loadedModel = PipelineModel.load(modelUri)

    # count() forces the transformation to be evaluated.
    loadedModel.transform(prostateDataset).count()

Exemple #11

0

Afficher le fichier

Fichier : test_gridsearch.py Projet : openhadoop/sparkling-water

def testPipelineSerializationGLM(prostateDataset):
    """Run ``gridSearchTester`` on an H2OGLM whose label column is ``AGE``.

    Args:
        prostateDataset: dataset handed through to ``gridSearchTester``.
    """
    glm = H2OGLM().setLabelCol("AGE")
    gridSearchTester(glm, prostateDataset)

Exemple #12

0

Afficher le fichier

 def createInitialGlmDefinition():
     """Return an H2OGLM labelled ``CAPSULE`` with seed 1 and 0.8 split ratio.

     ``featuresCols`` is not defined in this function; it is read from an
     enclosing scope that is not shown here.
     """
     glmSettings = dict(
         featuresCols=featuresCols,
         labelCol="CAPSULE",
         seed=1,
         splitRatio=0.8,
     )
     return H2OGLM(**glmSettings)