Example #1
0
 def __init__(self, keep=False, columns=[]):
     """Create the backing JVM object, register defaults and apply inputs.

     The named parameters are not read directly in this body; the values
     that get applied are whatever ``get_input_kwargs(self)`` returns
     (presumably the keyword arguments captured by a decorator that is
     outside this view -- TODO confirm).
     """
     super(ColumnPruner, self).__init__()

     jvm_class_name = "py_sparkling.ml.features.ColumnPruner"
     self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

     # Register the defaults first, then let the caller's values override.
     default_values = {"keep": False, "columns": []}
     self._setDefault(**default_values)
     self.setParams(**get_input_kwargs(self))
Example #2
0
    def __init__(self,
                 foldCol=None,
                 labelCol="label",
                 inputCols=[],
                 holdoutStrategy="None",
                 blendedAvgEnabled=False,
                 blendedAvgInflectionPoint=10.0,
                 blendedAvgSmoothing=20.0,
                 noise=0.01,
                 noiseSeed=-1):
        """Set up the target encoder wrapper and apply the supplied params.

        Steps, in order: obtain (or create) the H2OContext, instantiate the
        backing object for ``ai.h2o.sparkling.ml.features.H2OTargetEncoder``,
        register parameter defaults, then forward the caller's keyword
        arguments to ``setParams``.

        The named parameters are not read directly here; the forwarded
        values come from ``get_input_kwargs(self)``.
        """
        super(H2OTargetEncoder, self).__init__()

        spark_session = SparkSession.builder.getOrCreate()
        self._hc = H2OContext.getOrCreate(spark_session, verbose=False)

        jvm_class_name = "ai.h2o.sparkling.ml.features.H2OTargetEncoder"
        self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

        # Same values as the signature defaults above.
        default_values = {
            "foldCol": None,
            "labelCol": "label",
            "inputCols": [],
            "holdoutStrategy": "None",
            "blendedAvgEnabled": False,
            "blendedAvgInflectionPoint": 10.0,
            "blendedAvgSmoothing": 20.0,
            "noise": 0.01,
            "noiseSeed": -1,
        }
        self._setDefault(**default_values)

        self.setParams(**get_input_kwargs(self))
Example #3
0
    def __init__(self,
                 labelCol="label",
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 ratio=1.0,
                 foldCol=None,
                 weightCol=None,
                 ignoredCols=[],
                 includeAlgos=None,
                 excludeAlgos=None,
                 projectName=None,
                 maxRuntimeSecs=3600.0,
                 stoppingRounds=3,
                 stoppingTolerance=0.001,
                 stoppingMetric="AUTO",
                 nfolds=5,
                 convertUnknownCategoricalLevelsToNa=False,
                 seed=-1,
                 sortMetric="AUTO",
                 balanceClasses=False,
                 classSamplingFactors=None,
                 maxAfterBalanceSize=5.0,
                 keepCrossValidationPredictions=True,
                 keepCrossValidationModels=True,
                 maxModels=0,
                 **deprecatedArgs):
        """Set up the AutoML wrapper, register defaults and apply the inputs.

        Obtains (or creates) the H2OContext, instantiates the backing object
        for ``py_sparkling.ml.algos.H2OAutoML``, registers parameter defaults
        and forwards the caller's keyword arguments to ``setParams``.

        Neither the named parameters nor ``deprecatedArgs`` are read directly
        here; the forwarded values come from ``get_input_kwargs(self)``.
        """
        super(H2OAutoML, self).__init__()

        spark_session = SparkSession.builder.getOrCreate()
        self._hc = H2OContext.getOrCreate(spark_session, verbose=False)

        jvm_class_name = "py_sparkling.ml.algos.H2OAutoML"
        self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

        # The stopping metric default is stored as a JVM enum constant rather
        # than as the plain string used in the signature.
        stopping_metric_enum = self._hc._jvm.hex.ScoreKeeper.StoppingMetric
        default_values = {
            "labelCol": "label",
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "ratio": 1.0,
            "foldCol": None,
            "weightCol": None,
            "ignoredCols": [],
            "includeAlgos": None,
            "excludeAlgos": None,
            "projectName": None,
            "maxRuntimeSecs": 3600.0,
            "stoppingRounds": 3,
            "stoppingTolerance": 0.001,
            "stoppingMetric": stopping_metric_enum.valueOf("AUTO"),
            "nfolds": 5,
            "convertUnknownCategoricalLevelsToNa": False,
            "seed": -1,
            # NOTE(review): the signature advertises "AUTO" here, but the
            # registered default is None.
            "sortMetric": None,
            "balanceClasses": False,
            "classSamplingFactors": None,
            "maxAfterBalanceSize": 5.0,
            "keepCrossValidationPredictions": True,
            "keepCrossValidationModels": True,
            "maxModels": 0,
        }
        self._setDefault(**default_values)

        self.setParams(**get_input_kwargs(self))
Example #4
0
    def setParams(self,
                  modelId=None,
                  splitRatio=1.0,
                  labelCol="label",
                  weightCol=None,
                  featuresCols=[],
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  nfolds=0,
                  keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False,
                  parallelizeCrossValidation=True,
                  seed=-1,
                  distribution="AUTO",
                  epochs=10.0,
                  l1=0.0,
                  l2=0.0,
                  hidden=[200, 200],
                  reproducible=False,
                  convertUnknownCategoricalLevelsToNa=False,
                  foldCol=None,
                  **deprecatedArgs):
        """Apply the supplied parameters and return the result of ``_set``.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters and ``deprecatedArgs`` are not read directly.
        """
        params = get_input_kwargs(self)

        # Swap the textual distribution for the matching JVM enum constant.
        if "distribution" in params:
            family_enum = self._hc._jvm.hex.genmodel.utils.DistributionFamily
            params["distribution"] = family_enum.valueOf(
                params["distribution"])

        # Double-typed parameters are coerced to float explicitly: when an
        # integer is handed over for a double, py4j treats the value as an
        # int and a class cast exception follows.
        set_double_values(params, ["splitRatio", "epochs", "l1", "l2"])

        return self._set(**params)
Example #5
0
    def setParams(self,
                  featuresCols=[],
                  labelCol="label",
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  splitRatio=1.0,
                  foldCol=None,
                  weightCol=None,
                  ignoredCols=[],
                  includeAlgos=None,
                  excludeAlgos=None,
                  projectName=None,
                  maxRuntimeSecs=3600.0,
                  stoppingRounds=3,
                  stoppingTolerance=0.001,
                  stoppingMetric="AUTO",
                  nfolds=5,
                  convertUnknownCategoricalLevelsToNa=True,
                  seed=-1,
                  sortMetric="AUTO",
                  balanceClasses=False,
                  classSamplingFactors=None,
                  maxAfterBalanceSize=5.0,
                  keepCrossValidationPredictions=True,
                  keepCrossValidationModels=True,
                  maxModels=0,
                  **deprecatedArgs):
        """Apply the supplied AutoML parameters and return the ``_set`` result.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters and ``deprecatedArgs`` are not read directly.
        """
        params = get_input_kwargs(self)

        # Swap the textual stopping metric for the JVM enum constant.
        if "stoppingMetric" in params:
            metric_enum = self._hc._jvm.hex.ScoreKeeper.StoppingMetric
            params["stoppingMetric"] = metric_enum.valueOf(
                params["stoppingMetric"])

        # An explicitly passed ``None`` project name is replaced by
        # 30 random ASCII letters.
        if "projectName" in params and params["projectName"] is None:
            letters = [random.choice(string.ascii_letters) for _ in range(30)]
            params["projectName"] = ''.join(letters)

        # Both algorithm lists go through the same string -> enum array
        # helper; excludeAlgos is handled first, then includeAlgos.
        for algo_key in ("excludeAlgos", "includeAlgos"):
            if algo_key in params:
                spark_session = SparkSession.builder.getOrCreate()
                jvm = H2OContext.getOrCreate(spark_session,
                                             verbose=False)._jvm
                params[algo_key] = get_enum_array_from_str_array(
                    params[algo_key], jvm.ai.h2o.automl.Algo)

        # Double-typed parameters are coerced to float explicitly: when an
        # integer is handed over for a double, py4j treats the value as an
        # int and a class cast exception follows.
        set_double_values(params, [
            "maxRuntimeSecs", "stoppingTolerance", "splitRatio",
            "maxAfterBalanceSize"
        ])
        return self._set(**params)
Example #6
0
    def setParams(self,
                  featuresCols=[],
                  labelCol="label",
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  splitRatio=1.0,
                  foldCol=None,
                  weightCol=None,
                  ignoredCols=[],
                  includeAlgos=None,
                  excludeAlgos=None,
                  projectName=None,
                  maxRuntimeSecs=3600.0,
                  stoppingRounds=3,
                  stoppingTolerance=0.001,
                  stoppingMetric="AUTO",
                  nfolds=5,
                  convertUnknownCategoricalLevelsToNa=True,
                  seed=-1,
                  sortMetric="AUTO",
                  balanceClasses=False,
                  classSamplingFactors=None,
                  maxAfterBalanceSize=5.0,
                  keepCrossValidationPredictions=True,
                  keepCrossValidationModels=True,
                  maxModels=0,
                  predictionCol="prediction",
                  detailedPredictionCol="detailed_prediction",
                  withDetailedPredictionCol=False,
                  convertInvalidNumbersToNa=False,
                  **deprecatedArgs):
        """Validate and apply the supplied AutoML parameters.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters and ``deprecatedArgs`` are not read directly.
        Returns whatever ``self._set`` returns.
        """
        params = get_input_kwargs(self)

        # The algorithm lists are passed with nullEnabled=True.
        for algo_key in ("includeAlgos", "excludeAlgos"):
            validateEnumValues(self._H2OAutoMLParams__getAutomlAlgoEnum(),
                               params,
                               algo_key,
                               nullEnabled=True)
        validateEnumValue(self._H2OAutoMLParams__getStoppingMetricEnum(),
                          params, "stoppingMetric")

        # An explicitly passed ``None`` project name is replaced by
        # 30 random ASCII letters.
        if "projectName" in params and params["projectName"] is None:
            letters = [random.choice(string.ascii_letters) for _ in range(30)]
            params["projectName"] = ''.join(letters)

        # Double-typed parameters are coerced to float explicitly: when an
        # integer is handed over for a double, py4j treats the value as an
        # int and a class cast exception follows.
        set_double_values(params, [
            "maxRuntimeSecs", "stoppingTolerance", "splitRatio",
            "maxAfterBalanceSize"
        ])
        return self._set(**params)
Example #7
0
    def __init__(self,
                 modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="AUTO",
                 epochs=10.0,
                 l1=0.0,
                 l2=0.0,
                 hidden=[200, 200],
                 reproducible=False,
                 convertUnknownCategoricalLevelsToNa=False,
                 foldCol=None,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False,
                 **deprecatedArgs):
        """Set up the deep learning wrapper, register defaults, apply inputs.

        Instantiates the backing object for
        ``py_sparkling.ml.algos.H2ODeepLearning``, registers parameter
        defaults and forwards the caller's keyword arguments to ``setParams``.

        Neither the named parameters nor ``deprecatedArgs`` are read directly
        here; the forwarded values come from ``get_input_kwargs(self)``.
        """
        super(H2ODeepLearning, self).__init__()

        jvm_class_name = "py_sparkling.ml.algos.H2ODeepLearning"
        self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

        # Same values as the signature defaults above.
        default_values = {
            "modelId": None,
            "splitRatio": 1.0,
            "labelCol": "label",
            "weightCol": None,
            "featuresCols": [],
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "nfolds": 0,
            "keepCrossValidationPredictions": False,
            "keepCrossValidationFoldAssignment": False,
            "parallelizeCrossValidation": True,
            "seed": -1,
            "distribution": "AUTO",
            "epochs": 10.0,
            "l1": 0.0,
            "l2": 0.0,
            "hidden": [200, 200],
            "reproducible": False,
            "convertUnknownCategoricalLevelsToNa": False,
            "foldCol": None,
            "predictionCol": "prediction",
            "detailedPredictionCol": "detailed_prediction",
            "withDetailedPredictionCol": False,
            "convertInvalidNumbersToNa": False,
        }
        self._setDefault(**default_values)

        self.setParams(**get_input_kwargs(self))
Example #8
0
    def __init__(self,
                 featuresCols=[],
                 algo=None,
                 splitRatio=1.0,
                 hyperParameters={},
                 labelCol="label",
                 weightCol=None,
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 strategy="Cartesian",
                 maxRuntimeSecs=0.0,
                 maxModels=0,
                 seed=-1,
                 stoppingRounds=0,
                 stoppingTolerance=0.001,
                 stoppingMetric="AUTO",
                 nfolds=0,
                 selectBestModelBy="AUTO",
                 selectBestModelDecreasing=True,
                 foldCol=None,
                 convertUnknownCategoricalLevelsToNa=True,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False,
                 **deprecatedArgs):
        """Set up the grid search wrapper, register defaults, apply inputs.

        Instantiates the backing object for
        ``py_sparkling.ml.algos.H2OGridSearch``, registers parameter defaults
        and forwards the caller's keyword arguments to ``setParams``.

        Neither the named parameters nor ``deprecatedArgs`` are read directly
        here; the forwarded values come from ``get_input_kwargs(self)``.
        """
        super(H2OGridSearch, self).__init__()

        jvm_class_name = "py_sparkling.ml.algos.H2OGridSearch"
        self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

        # Same values as the signature defaults above.
        default_values = {
            "featuresCols": [],
            "algo": None,
            "splitRatio": 1.0,
            "hyperParameters": {},
            "labelCol": "label",
            "weightCol": None,
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "strategy": "Cartesian",
            "maxRuntimeSecs": 0.0,
            "maxModels": 0,
            "seed": -1,
            "stoppingRounds": 0,
            "stoppingTolerance": 0.001,
            "stoppingMetric": "AUTO",
            "nfolds": 0,
            "selectBestModelBy": "AUTO",
            "selectBestModelDecreasing": True,
            "foldCol": None,
            "convertUnknownCategoricalLevelsToNa": True,
            "predictionCol": "prediction",
            "detailedPredictionCol": "detailed_prediction",
            "withDetailedPredictionCol": False,
            "convertInvalidNumbersToNa": False,
        }
        self._setDefault(**default_values)

        self.setParams(**get_input_kwargs(self))
Example #9
0
    def __init__(self,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 featuresCols=[],
                 foldCol=None,
                 weightCol=None,
                 splitRatio=1.0,
                 seed=-1,
                 nfolds=0,
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 convertUnknownCategoricalLevelsToNa=False,
                 convertInvalidNumbersToNa=False,
                 modelId=None,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 distribution="AUTO",
                 maxIterations=10,
                 standardize=True,
                 init="Furthest",
                 userPoints=None,
                 estimateK=False,
                 k=2,
                 **deprecatedArgs):
        """Set up the k-means wrapper, register defaults and apply the inputs.

        Instantiates the backing object for
        ``ai.h2o.sparkling.ml.algos.H2OKMeans``, registers parameter defaults
        and forwards the caller's keyword arguments to ``setParams``.

        Neither the named parameters nor ``deprecatedArgs`` are read directly
        here; the forwarded values come from ``get_input_kwargs(self)``.
        """
        super(H2OKMeans, self).__init__()

        jvm_class_name = "ai.h2o.sparkling.ml.algos.H2OKMeans"
        self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

        # Same values as the signature defaults above.
        default_values = {
            "predictionCol": "prediction",
            "detailedPredictionCol": "detailed_prediction",
            "withDetailedPredictionCol": False,
            "featuresCols": [],
            "foldCol": None,
            "weightCol": None,
            "splitRatio": 1.0,
            "seed": -1,
            "nfolds": 0,
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "convertUnknownCategoricalLevelsToNa": False,
            "convertInvalidNumbersToNa": False,
            "modelId": None,
            "keepCrossValidationPredictions": False,
            "keepCrossValidationFoldAssignment": False,
            "parallelizeCrossValidation": True,
            "distribution": "AUTO",
            "maxIterations": 10,
            "standardize": True,
            "init": "Furthest",
            "userPoints": None,
            "estimateK": False,
            "k": 2,
        }
        self._setDefault(**default_values)

        self.setParams(**get_input_kwargs(self))
Example #10
0
    def setParams(self,
                  featuresCols=[],
                  algo=None,
                  splitRatio=1.0,
                  hyperParameters={},
                  labelCol="label",
                  weightCol=None,
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  strategy="Cartesian",
                  maxRuntimeSecs=0.0,
                  maxModels=0,
                  seed=-1,
                  stoppingRounds=0,
                  stoppingTolerance=0.001,
                  stoppingMetric="AUTO",
                  nfolds=0,
                  selectBestModelBy=None,
                  selectBestModelDecreasing=True,
                  foldCol=None,
                  convertUnknownCategoricalLevelsToNa=True,
                  **deprecatedArgs):
        """Apply the supplied grid search parameters and return ``_set``'s result.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters and ``deprecatedArgs`` are not read directly.

        Processing order: textual enum values are replaced by JVM enum
        constants, deprecated names are propagated to their replacements,
        double-typed values are coerced to float, and a non-``None`` ``algo``
        is handed to the backing object via ``setAlgo`` instead of ``_set``.
        """
        kwargs = get_input_kwargs(self)

        if "stoppingMetric" in kwargs:
            kwargs[
                "stoppingMetric"] = self._hc._jvm.hex.ScoreKeeper.StoppingMetric.valueOf(
                    kwargs["stoppingMetric"])

        if "strategy" in kwargs:
            kwargs[
                "strategy"] = self._hc._jvm.hex.grid.HyperSpaceSearchCriteria.Strategy.valueOf(
                    kwargs["strategy"])

        # A ``None`` selectBestModelBy is passed through untouched.
        if "selectBestModelBy" in kwargs and kwargs[
                "selectBestModelBy"] is not None:
            kwargs[
                "selectBestModelBy"] = self._hc._jvm.org.apache.spark.ml.h2o.algos.H2OGridSearchMetric.valueOf(
                    kwargs["selectBestModelBy"])

        propagate_value_from_deprecated_property(kwargs, "predictionCol",
                                                 "labelCol")
        propagate_value_from_deprecated_property(kwargs, "ratio", "splitRatio")

        # we need to convert double arguments manually to floats as if we assign integer to double, py4j thinks that
        # the whole type is actually int and we get class cast exception
        # "splitRatio" is the name the value lives under after the propagation
        # above, so it must be coerced as well; "ratio" is kept in case the
        # propagation helper leaves the deprecated key in place.
        double_types = [
            "ratio", "splitRatio", "stoppingTolerance", "maxRuntimeSecs"
        ]
        set_double_values(kwargs, double_types)

        # The algorithm is a wrapper object: its backing object is set
        # directly and the key is removed so ``_set`` does not see it.
        if "algo" in kwargs and kwargs["algo"] is not None:
            tmp = kwargs["algo"]
            del kwargs['algo']
            self._java_obj.setAlgo(tmp._java_obj)

        return self._set(**kwargs)
Example #11
0
    def __init__(self,
                 featuresCols=[],
                 algo=None,
                 splitRatio=1.0,
                 hyperParameters={},
                 labelCol="label",
                 weightCol=None,
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 strategy="Cartesian",
                 maxRuntimeSecs=0.0,
                 maxModels=0,
                 seed=-1,
                 stoppingRounds=0,
                 stoppingTolerance=0.001,
                 stoppingMetric="AUTO",
                 nfolds=0,
                 selectBestModelBy=None,
                 selectBestModelDecreasing=True,
                 foldCol=None,
                 convertUnknownCategoricalLevelsToNa=True,
                 **deprecatedArgs):
        """Set up the grid search wrapper, register defaults, apply inputs.

        Obtains (or creates) the H2OContext, instantiates the backing object
        for ``py_sparkling.ml.algos.H2OGridSearch``, registers parameter
        defaults and forwards the caller's keyword arguments to ``setParams``.

        Neither the named parameters nor ``deprecatedArgs`` are read directly
        here; the forwarded values come from ``get_input_kwargs(self)``.
        """
        super(H2OGridSearch, self).__init__()

        spark_session = SparkSession.builder.getOrCreate()
        self._hc = H2OContext.getOrCreate(spark_session, verbose=False)

        jvm_class_name = "py_sparkling.ml.algos.H2OGridSearch"
        self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

        # The strategy and stopping metric defaults are stored as JVM enum
        # constants rather than as the plain strings used in the signature.
        hex_pkg = self._hc._jvm.hex
        cartesian_strategy = hex_pkg.grid.HyperSpaceSearchCriteria.Strategy.valueOf(
            "Cartesian")
        auto_stopping_metric = self._hc._jvm.hex.ScoreKeeper.StoppingMetric.valueOf(
            "AUTO")

        default_values = {
            "featuresCols": [],
            "algo": None,
            "splitRatio": 1.0,
            "hyperParameters": {},
            "labelCol": "label",
            "weightCol": None,
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "strategy": cartesian_strategy,
            "maxRuntimeSecs": 0.0,
            "maxModels": 0,
            "seed": -1,
            "stoppingRounds": 0,
            "stoppingTolerance": 0.001,
            "stoppingMetric": auto_stopping_metric,
            "nfolds": 0,
            "selectBestModelBy": None,
            "selectBestModelDecreasing": True,
            "foldCol": None,
            "convertUnknownCategoricalLevelsToNa": True,
        }
        self._setDefault(**default_values)

        self.setParams(**get_input_kwargs(self))
Example #12
0
    def __init__(self,
                 modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="AUTO",
                 epochs=10.0,
                 l1=0.0,
                 l2=0.0,
                 hidden=[200, 200],
                 reproducible=False,
                 convertUnknownCategoricalLevelsToNa=False,
                 foldCol=None,
                 **deprecatedArgs):
        """Set up the deep learning wrapper, register defaults, apply inputs.

        Obtains (or creates) the H2OContext, instantiates the backing object
        for ``py_sparkling.ml.algos.H2ODeepLearning``, registers parameter
        defaults and forwards the caller's keyword arguments to ``setParams``.

        Neither the named parameters nor ``deprecatedArgs`` are read directly
        here; the forwarded values come from ``get_input_kwargs(self)``.
        """
        super(H2ODeepLearning, self).__init__()

        spark_session = SparkSession.builder.getOrCreate()
        self._hc = H2OContext.getOrCreate(spark_session, verbose=False)

        jvm_class_name = "py_sparkling.ml.algos.H2ODeepLearning"
        self._java_obj = self._new_java_obj(jvm_class_name, self.uid)

        # The distribution default is stored as a JVM enum constant rather
        # than as the plain string used in the signature.
        family_enum = self._hc._jvm.hex.genmodel.utils.DistributionFamily
        default_values = {
            "modelId": None,
            "splitRatio": 1.0,
            "labelCol": "label",
            "weightCol": None,
            "featuresCols": [],
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "nfolds": 0,
            "keepCrossValidationPredictions": False,
            "keepCrossValidationFoldAssignment": False,
            "parallelizeCrossValidation": True,
            "seed": -1,
            "distribution": family_enum.valueOf("AUTO"),
            "epochs": 10.0,
            "l1": 0.0,
            "l2": 0.0,
            "hidden": [200, 200],
            "reproducible": False,
            "convertUnknownCategoricalLevelsToNa": False,
            "foldCol": None,
        }
        self._setDefault(**default_values)

        self.setParams(**get_input_kwargs(self))
Example #13
0
    def setParams(self,
                  featuresCols=[],
                  algo=None,
                  splitRatio=1.0,
                  hyperParameters={},
                  labelCol="label",
                  weightCol=None,
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  strategy="Cartesian",
                  maxRuntimeSecs=0.0,
                  maxModels=0,
                  seed=-1,
                  stoppingRounds=0,
                  stoppingTolerance=0.001,
                  stoppingMetric="AUTO",
                  nfolds=0,
                  selectBestModelBy="AUTO",
                  selectBestModelDecreasing=True,
                  foldCol=None,
                  convertUnknownCategoricalLevelsToNa=True,
                  predictionCol="prediction",
                  detailedPredictionCol="detailed_prediction",
                  withDetailedPredictionCol=False,
                  convertInvalidNumbersToNa=False,
                  **deprecatedArgs):
        """Validate and apply the supplied grid search parameters.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters and ``deprecatedArgs`` are not read directly.
        A non-``None`` ``algo`` is handed to the backing object via
        ``setAlgo`` rather than through ``_set``. Returns whatever
        ``self._set`` returns.
        """
        params = get_input_kwargs(self)

        # (enum getter, parameter name) pairs, validated in this order.
        enum_checks = (
            (self._H2OGridSearchParams__getStrategyEnum, "strategy"),
            (self._H2OGridSearchParams__getStoppingMetricEnum,
             "stoppingMetric"),
            (self._H2OGridSearchParams__getSelectBestModelByEnum,
             "selectBestModelBy"),
        )
        for enum_getter, param_name in enum_checks:
            validateEnumValue(enum_getter(), params, param_name)

        # Double-typed parameters are coerced to float explicitly: when an
        # integer is handed over for a double, py4j treats the value as an
        # int and a class cast exception follows.
        set_double_values(
            params, ["splitRatio", "stoppingTolerance", "maxRuntimeSecs"])

        # The key is removed before ``_set`` runs; a ``None`` algo is left in
        # place and goes through ``_set`` like any other entry.
        if params.get("algo") is not None:
            algo_wrapper = params.pop("algo")
            self._java_obj.setAlgo(algo_wrapper._java_obj)

        return self._set(**params)
Example #14
0
    def setParams(self,
                  predictionCol="prediction",
                  detailedPredictionCol="detailed_prediction",
                  withDetailedPredictionCol=False,
                  featuresCols=[],
                  foldCol=None,
                  weightCol=None,
                  splitRatio=1.0,
                  seed=-1,
                  nfolds=0,
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  convertUnknownCategoricalLevelsToNa=False,
                  convertInvalidNumbersToNa=False,
                  modelId=None,
                  keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False,
                  parallelizeCrossValidation=True,
                  distribution="AUTO",
                  maxIterations=10,
                  standardize=True,
                  init="Furthest",
                  userPoints=None,
                  estimateK=False,
                  k=2,
                  **deprecatedArgs):
        """Validate and apply the supplied k-means parameters.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters and ``deprecatedArgs`` are not read directly.
        Returns whatever ``self._set`` returns.
        """
        kwargs = get_input_kwargs(self)

        validateEnumValue(self._H2OAlgoCommonParams__getDistributionEnum(),
                          kwargs, "distribution")
        validateEnumValue(self._H2OKMeansParams__getInitEnum(), kwargs, "init")

        # We need to convert double arguments manually to floats as if we assign integer to double, py4j thinks that
        # the whole type is actually int and we get class cast exception
        double_types = ["splitRatio"]
        set_double_values(kwargs, double_types)

        # ``init`` is an enum name (validated above) and must stay a string;
        # the nested numeric array that needs int -> double conversion is
        # ``userPoints``. A missing or ``None`` value is left untouched.
        user_points = kwargs.get("userPoints")
        if user_points is not None:
            kwargs["userPoints"] = getDoubleArrayArrayFromIntArrayArray(
                user_points)

        return self._set(**kwargs)
Example #15
0
    def setParams(self,
                  foldCol=None,
                  labelCol="label",
                  inputCols=[],
                  holdoutStrategy="None",
                  blendedAvgEnabled=False,
                  blendedAvgInflectionPoint=10.0,
                  blendedAvgSmoothing=20.0,
                  noise=0.01,
                  noiseSeed=-1):
        """Validate and apply the supplied target encoder parameters.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters are not read directly. Returns whatever
        ``self._set`` returns.
        """
        params = get_input_kwargs(self)

        holdout_enum_name = self.__getHoldoutStrategyEnumName()
        validateEnumValue(holdout_enum_name, params, "holdoutStrategy")

        # Double-typed parameters are coerced to float explicitly: when an
        # integer is handed over for a double, py4j treats the value as an
        # int and a class cast exception follows.
        set_double_values(
            params,
            ["blendedAvgInflectionPoint", "blendedAvgSmoothing", "noise"])

        return self._set(**params)
Example #16
0
    def setParams(self,
                  modelId=None,
                  splitRatio=1.0,
                  labelCol="label",
                  weightCol=None,
                  featuresCols=[],
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  nfolds=0,
                  keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False,
                  parallelizeCrossValidation=True,
                  seed=-1,
                  distribution="AUTO",
                  epochs=10.0,
                  l1=0.0,
                  l2=0.0,
                  hidden=[200, 200],
                  reproducible=False,
                  convertUnknownCategoricalLevelsToNa=False,
                  foldCol=None,
                  predictionCol="prediction",
                  detailedPredictionCol="detailed_prediction",
                  withDetailedPredictionCol=False,
                  convertInvalidNumbersToNa=False,
                  **deprecatedArgs):
        """Validate and apply the supplied deep learning parameters.

        Only the entries present in ``get_input_kwargs(self)`` are processed;
        the named parameters and ``deprecatedArgs`` are not read directly.
        Returns whatever ``self._set`` returns.
        """
        params = get_input_kwargs(self)

        distribution_enum = self._H2OAlgoCommonParams__getDistributionEnum()
        validateEnumValue(distribution_enum, params, "distribution")

        # Double-typed parameters are coerced to float explicitly: when an
        # integer is handed over for a double, py4j treats the value as an
        # int and a class cast exception follows.
        set_double_values(params, ["splitRatio", "epochs", "l1", "l2"])

        return self._set(**params)
Example #17
0
    def __init__(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                 featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
                 nfolds=0, keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                 seed=-1, distribution="AUTO", convertUnknownCategoricalLevelsToNa=False,
                 quietMode=True, missingValuesHandling=None, ntrees=50, nEstimators=0,
                 maxDepth=6, minRows=1.0, minChildWeight=1.0, learnRate=0.3, eta=0.3,
                 learnRateAnnealing=1.0, sampleRate=1.0, subsample=1.0, colSampleRate=1.0,
                 colSampleByLevel=1.0, colSampleRatePerTree=1.0, colsampleBytree=1.0,
                 maxAbsLeafnodePred=0.0, maxDeltaStep=0.0, scoreTreeInterval=0,
                 initialScoreInterval=4000, scoreInterval=4000, minSplitImprovement=0.0,
                 gamma=0.0, nthread=-1, maxBins=256, maxLeaves=0, minSumHessianInLeaf=100.0,
                 minDataInLeaf=0.0, treeMethod="auto", growPolicy="depthwise",
                 booster="gbtree", dmatrixType="auto", regLambda=0.0, regAlpha=0.0,
                 sampleType="uniform", normalizeType="tree", rateDrop=0.0, oneDrop=False,
                 skipDrop=0.0, gpuId=0, backend="auto", foldCol=None, **deprecatedArgs):
        """Build the H2OXGBoost wrapper and register its default parameter values.

        Steps, in order: obtain (or create) the H2OContext for the current
        SparkSession, create the backing ``py_sparkling.ml.algos.H2OXGBoost``
        Java object, register defaults through ``_setDefault`` (enum-typed
        defaults are resolved to JVM enum constants via ``valueOf``), and
        finally forward the arguments reported by ``get_input_kwargs`` to
        ``setParams``.
        """
        super(H2OXGBoost, self).__init__()
        spark = SparkSession.builder.getOrCreate()
        self._hc = H2OContext.getOrCreate(spark, verbose=False)
        self._java_obj = self._new_java_obj("py_sparkling.ml.algos.H2OXGBoost", self.uid)

        # Short handles for the long JVM paths used by the enum defaults below.
        jvm = self._hc._jvm
        xgb_params = jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters

        defaults = dict(
            modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
            featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
            nfolds=0, keepCrossValidationPredictions=False,
            keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
            seed=-1,
            distribution=jvm.hex.genmodel.utils.DistributionFamily.valueOf("AUTO"),
            convertUnknownCategoricalLevelsToNa=False, quietMode=True,
            missingValuesHandling=None, ntrees=50, nEstimators=0, maxDepth=6,
            minRows=1.0, minChildWeight=1.0, learnRate=0.3, eta=0.3,
            learnRateAnnealing=1.0, sampleRate=1.0, subsample=1.0, colSampleRate=1.0,
            colSampleByLevel=1.0, colSampleRatePerTree=1.0, colsampleBytree=1.0,
            maxAbsLeafnodePred=0.0, maxDeltaStep=0.0, scoreTreeInterval=0,
            initialScoreInterval=4000, scoreInterval=4000, minSplitImprovement=0.0,
            gamma=0.0, nthread=-1, maxBins=256, maxLeaves=0, minSumHessianInLeaf=100.0,
            minDataInLeaf=0.0,
            treeMethod=xgb_params.TreeMethod.valueOf("auto"),
            growPolicy=xgb_params.GrowPolicy.valueOf("depthwise"),
            booster=xgb_params.Booster.valueOf("gbtree"),
            dmatrixType=xgb_params.DMatrixType.valueOf("auto"),
            regLambda=0.0, regAlpha=0.0,
            sampleType=xgb_params.DartSampleType.valueOf("uniform"),
            normalizeType=xgb_params.DartNormalizeType.valueOf("tree"),
            rateDrop=0.0, oneDrop=False, skipDrop=0.0, gpuId=0,
            backend=xgb_params.Backend.valueOf("auto"),
            foldCol=None,
        )
        self._setDefault(**defaults)

        # Apply whatever get_input_kwargs reports for this call
        # (presumably only the explicitly passed arguments -- confirm against helper).
        self.setParams(**get_input_kwargs(self))
Example #18
0
    def __init__(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                 featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
                 nfolds=0, keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                 seed=-1, distribution="AUTO", ntrees=50, maxDepth=5, minRows=10.0,
                 nbins=20, nbinsCats=1024, minSplitImprovement=1e-5, histogramType="AUTO",
                 r2Stopping=java_max_double_value, nbinsTopLevel=1 << 10,
                 buildTreeOneNode=False, scoreTreeInterval=0, sampleRate=1.0,
                 sampleRatePerClass=None, colSampleRateChangePerLevel=1.0,
                 colSampleRatePerTree=1.0, learnRate=0.1, learnRateAnnealing=1.0,
                 colSampleRate=1.0, maxAbsLeafnodePred=java_max_double_value,
                 predNoiseBandwidth=0.0, convertUnknownCategoricalLevelsToNa=False,
                 foldCol=None, **deprecatedArgs):
        """Build the H2OGBM wrapper and register its default parameter values.

        Obtains (or creates) the H2OContext for the current SparkSession,
        creates the backing ``py_sparkling.ml.algos.H2OGBM`` Java object,
        registers defaults through ``_setDefault`` (enum defaults and the
        double maximum are read from the JVM), then forwards the arguments
        reported by ``get_input_kwargs`` to ``setParams``.
        """
        super(H2OGBM, self).__init__()
        spark = SparkSession.builder.getOrCreate()
        self._hc = H2OContext.getOrCreate(spark, verbose=False)
        self._java_obj = self._new_java_obj("py_sparkling.ml.algos.H2OGBM", self.uid)

        jvm = self._hc._jvm
        defaults = dict(
            modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
            featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
            nfolds=0, keepCrossValidationPredictions=False,
            keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
            seed=-1,
            distribution=jvm.hex.genmodel.utils.DistributionFamily.valueOf("AUTO"),
            ntrees=50, maxDepth=5, minRows=10.0, nbins=20, nbinsCats=1024,
            minSplitImprovement=1e-5,
            histogramType=jvm.hex.tree.SharedTreeModel.SharedTreeParameters
            .HistogramType.valueOf("AUTO"),
            r2Stopping=jvm.Double.MAX_VALUE,
            nbinsTopLevel=1 << 10, buildTreeOneNode=False, scoreTreeInterval=0,
            sampleRate=1.0, sampleRatePerClass=None, colSampleRateChangePerLevel=1.0,
            colSampleRatePerTree=1.0, learnRate=0.1, learnRateAnnealing=1.0,
            colSampleRate=1.0,
            maxAbsLeafnodePred=jvm.Double.MAX_VALUE,
            predNoiseBandwidth=0.0, convertUnknownCategoricalLevelsToNa=False,
            foldCol=None,
        )
        self._setDefault(**defaults)

        # Apply whatever get_input_kwargs reports for this call.
        self.setParams(**get_input_kwargs(self))
Example #19
0
 def setParams(self, keep=False, columns=[]):
     """Forward the arguments reported by ``get_input_kwargs`` to ``self._set``.

     Returns whatever ``self._set`` returns.
     """
     return self._set(**get_input_kwargs(self))
Example #20
0
    def __init__(self, featuresCols=[], labelCol="label", allStringColumnsToCategorical=True,
                 columnsToCategorical=[], splitRatio=1.0, foldCol=None, weightCol=None,
                 ignoredCols=[], includeAlgos=None, excludeAlgos=None, projectName=None,
                 maxRuntimeSecs=3600.0, stoppingRounds=3, stoppingTolerance=0.001,
                 stoppingMetric="AUTO", nfolds=5, convertUnknownCategoricalLevelsToNa=True,
                 seed=-1, sortMetric="AUTO", balanceClasses=False, classSamplingFactors=None,
                 maxAfterBalanceSize=5.0, keepCrossValidationPredictions=True,
                 keepCrossValidationModels=True, maxModels=0, predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction", withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False, **deprecatedArgs):
        """Build the H2OAutoML wrapper and register its default parameter values.

        Creates the backing ``py_sparkling.ml.algos.H2OAutoML`` Java object,
        registers defaults through ``_setDefault``, then forwards the
        arguments reported by ``get_input_kwargs`` to ``setParams``.
        """
        super(H2OAutoML, self).__init__()
        self._java_obj = self._new_java_obj("py_sparkling.ml.algos.H2OAutoML", self.uid)

        defaults = dict(
            featuresCols=[], labelCol="label", allStringColumnsToCategorical=True,
            columnsToCategorical=[], splitRatio=1.0, foldCol=None, weightCol=None,
            ignoredCols=[], includeAlgos=None, excludeAlgos=None, projectName=None,
            maxRuntimeSecs=3600.0, stoppingRounds=3, stoppingTolerance=0.001,
            stoppingMetric="AUTO", nfolds=5, convertUnknownCategoricalLevelsToNa=True,
            seed=-1,
            # NOTE(review): the signature default for sortMetric is "AUTO" while the
            # registered default is None -- confirm which one is intended.
            sortMetric=None,
            balanceClasses=False, classSamplingFactors=None, maxAfterBalanceSize=5.0,
            keepCrossValidationPredictions=True, keepCrossValidationModels=True,
            maxModels=0, predictionCol="prediction",
            detailedPredictionCol="detailed_prediction",
            withDetailedPredictionCol=False, convertInvalidNumbersToNa=False,
        )
        self._setDefault(**defaults)

        # Apply whatever get_input_kwargs reports for this call.
        self.setParams(**get_input_kwargs(self))
Example #21
0
    def setParams(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                  featuresCols=[], allStringColumnsToCategorical=True,
                  columnsToCategorical=[], nfolds=0, keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                  seed=-1, distribution="AUTO", convertUnknownCategoricalLevelsToNa=False,
                  standardize=True, family="gaussian", link="family_default", solver="AUTO",
                  tweedieVariancePower=0.0, tweedieLinkPower=0.0, alpha=None, lambda_=None,
                  missingValuesHandling="MeanImputation", prior=-1.0, lambdaSearch=False,
                  nlambdas=-1, nonNegative=False, exactLambdas=False, lambdaMinRatio=-1.0,
                  maxIterations=-1, intercept=True, betaEpsilon=1e-4, objectiveEpsilon=-1.0,
                  gradientEpsilon=-1.0, objReg=-1.0, computePValues=False,
                  removeCollinearCols=False, interactions=None, interactionPairs=None,
                  earlyStopping=True, foldCol=None, **deprecatedArgs):
        """Set GLM parameters on this estimator.

        The arguments reported by ``get_input_kwargs`` are post-processed
        before being handed to ``self._set``:

        * string values of enum-typed parameters are replaced by the matching
          JVM enum constant (``valueOf``);
        * scalar double parameters are coerced with ``set_double_values``;
        * ``alpha`` and ``lambda_`` sequences are converted to lists of floats.

        Returns whatever ``self._set`` returns.
        """
        kwargs = get_input_kwargs(self)
        jvm = self._hc._jvm

        if "distribution" in kwargs:
            kwargs["distribution"] = jvm.hex.genmodel.utils.DistributionFamily.valueOf(
                kwargs["distribution"])

        if "family" in kwargs:
            kwargs["family"] = jvm.hex.glm.GLMModel.GLMParameters.Family.valueOf(
                kwargs["family"])

        if "link" in kwargs:
            kwargs["link"] = jvm.hex.glm.GLMModel.GLMParameters.Link.valueOf(
                kwargs["link"])

        if "solver" in kwargs:
            kwargs["solver"] = jvm.hex.glm.GLMModel.GLMParameters.Solver.valueOf(
                kwargs["solver"])

        if "missingValuesHandling" in kwargs:
            # NOTE(review): this resolves against the DeepLearning
            # MissingValuesHandling enum for a GLM parameter -- confirm intended.
            kwargs["missingValuesHandling"] = (
                jvm.hex.deeplearning.DeepLearningModel.DeepLearningParameters
                .MissingValuesHandling.valueOf(kwargs["missingValuesHandling"]))

        # we need to convert double arguments manually to floats as if we assign integer to double, py4j thinks that
        # the whole type is actually int and we get class cast exception
        double_types = [
            "splitRatio", "tweedieVariancePower", "tweedieLinkPower", "prior",
            "lambdaMinRatio", "betaEpsilon", "objectiveEpsilon",
            "gradientEpsilon", "objReg"
        ]
        set_double_values(kwargs, double_types)

        # The array-valued parameters need the same int -> float treatment.
        # Build a concrete list (on Python 3 ``map`` would only yield a lazy
        # iterator) and leave an explicit None untouched instead of crashing.
        for array_param in ("alpha", "lambda_"):
            values = kwargs.get(array_param)
            if values is not None:
                kwargs[array_param] = [float(v) for v in values]

        return self._set(**kwargs)
Example #22
0
    def setParams(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                  featuresCols=[], allStringColumnsToCategorical=True,
                  columnsToCategorical=[], nfolds=0, keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                  seed=-1, distribution="AUTO", ntrees=50, maxDepth=5, minRows=10.0,
                  nbins=20, nbinsCats=1024, minSplitImprovement=1e-5, histogramType="AUTO",
                  r2Stopping=java_max_double_value, nbinsTopLevel=1 << 10,
                  buildTreeOneNode=False, scoreTreeInterval=0, sampleRate=1.0,
                  sampleRatePerClass=None, colSampleRateChangePerLevel=1.0,
                  colSampleRatePerTree=1.0, learnRate=0.1, learnRateAnnealing=1.0,
                  colSampleRate=1.0, maxAbsLeafnodePred=java_max_double_value,
                  predNoiseBandwidth=0.0, convertUnknownCategoricalLevelsToNa=False,
                  foldCol=None, predictionCol="prediction",
                  detailedPredictionCol="detailed_prediction",
                  withDetailedPredictionCol=False, convertInvalidNumbersToNa=False,
                  **deprecatedArgs):
        """Set GBM parameters on this estimator.

        The arguments reported by ``get_input_kwargs`` are validated
        (``distribution`` and ``histogramType`` via ``validateEnumValue``),
        scalar double parameters are coerced with ``set_double_values``,
        ``sampleRatePerClass`` is passed through ``arrayToDoubleArray``, and
        the result is handed to ``self._set``.

        Returns whatever ``self._set`` returns.
        """
        kwargs = get_input_kwargs(self)

        validateEnumValue(self._H2OAlgoCommonParams__getDistributionEnum(),
                          kwargs, "distribution")
        validateEnumValue(self._H2OSharedTreeParams__getHistogramTypeEnum(),
                          kwargs, "histogramType")

        # we need to convert double arguments manually to floats as if we assign integer to double, py4j thinks that
        # the whole type is actually int and we get class cast exception
        #
        # Every entry must be followed by a comma: adjacent string literals are
        # silently concatenated, which previously fused "maxAbsLeafnodePred" and
        # "minSplitImprovement" into one bogus name so neither was converted.
        double_types = [
            "minRows", "predNoiseBandwidth", "splitRatio", "learnRate",
            "colSampleRate", "learnRateAnnealing", "maxAbsLeafnodePred",
            "minSplitImprovement", "r2Stopping", "sampleRate",
            "colSampleRateChangePerLevel", "colSampleRatePerTree",
        ]
        set_double_values(kwargs, double_types)

        # We need to also map all doubles in the arrays
        arrayToDoubleArray("sampleRatePerClass", kwargs)

        return self._set(**kwargs)
Example #23
0
    def setParams(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                  featuresCols=[], allStringColumnsToCategorical=True,
                  columnsToCategorical=[], nfolds=0, keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                  seed=-1, distribution="AUTO", convertUnknownCategoricalLevelsToNa=False,
                  standardize=True, family="gaussian", link="family_default", solver="AUTO",
                  tweedieVariancePower=0.0, tweedieLinkPower=0.0, alpha=None, lambda_=None,
                  missingValuesHandling="MeanImputation", prior=-1.0, lambdaSearch=False,
                  nlambdas=-1, nonNegative=False, exactLambdas=False, lambdaMinRatio=-1.0,
                  maxIterations=-1, intercept=True, betaEpsilon=1e-4, objectiveEpsilon=-1.0,
                  gradientEpsilon=-1.0, objReg=-1.0, computePValues=False,
                  removeCollinearCols=False, interactions=None, interactionPairs=None,
                  earlyStopping=True, foldCol=None, predictionCol="prediction",
                  detailedPredictionCol="detailed_prediction",
                  withDetailedPredictionCol=False, convertInvalidNumbersToNa=False,
                  **deprecatedArgs):
        """Set GLM parameters on this estimator.

        Takes the arguments reported by ``get_input_kwargs``, runs the
        enum-typed ones through ``validateEnumValue``, coerces scalar double
        parameters with ``set_double_values``, passes ``alpha`` and
        ``lambda_`` through ``arrayToDoubleArray``, and hands everything to
        ``self._set``, whose result is returned.
        """
        kwargs = get_input_kwargs(self)

        # (enum getter, parameter name) pairs, checked in this order.
        enum_checks = (
            (self._H2OAlgoCommonParams__getDistributionEnum, "distribution"),
            (self._H2OGLMParams__getFamilyEnum, "family"),
            (self._H2OGLMParams__getLinkEnum, "link"),
            (self._H2OGLMParams__getSolverEnum, "solver"),
            (self._H2OGLMParams__getMissingValuesHandlingEnum, "missingValuesHandling"),
        )
        for enum_getter, param_name in enum_checks:
            validateEnumValue(enum_getter(), kwargs, param_name)

        # Integers assigned to double params confuse py4j (class cast exception),
        # so the scalar doubles are converted to floats explicitly.
        set_double_values(kwargs, [
            "splitRatio", "tweedieVariancePower", "tweedieLinkPower", "prior",
            "lambdaMinRatio", "betaEpsilon", "objectiveEpsilon",
            "gradientEpsilon", "objReg",
        ])

        # Same treatment for the array-valued double params.
        for array_param in ("alpha", "lambda_"):
            arrayToDoubleArray(array_param, kwargs)

        return self._set(**kwargs)
Example #24
0
    def setParams(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                  featuresCols=[], allStringColumnsToCategorical=True,
                  columnsToCategorical=[], nfolds=0, keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                  seed=-1, distribution="AUTO", convertUnknownCategoricalLevelsToNa=False,
                  quietMode=True, ntrees=50, nEstimators=0, maxDepth=6, minRows=1.0,
                  minChildWeight=1.0, learnRate=0.3, eta=0.3, learnRateAnnealing=1.0,
                  sampleRate=1.0, subsample=1.0, colSampleRate=1.0, colSampleByLevel=1.0,
                  colSampleRatePerTree=1.0, colsampleBytree=1.0, maxAbsLeafnodePred=0.0,
                  maxDeltaStep=0.0, scoreTreeInterval=0, initialScoreInterval=4000,
                  scoreInterval=4000, minSplitImprovement=0.0, gamma=0.0, nthread=-1,
                  maxBins=256, maxLeaves=0, minSumHessianInLeaf=100.0, minDataInLeaf=0.0,
                  treeMethod="auto", growPolicy="depthwise", booster="gbtree",
                  dmatrixType="auto", regLambda=0.0, regAlpha=0.0, sampleType="uniform",
                  normalizeType="tree", rateDrop=0.0, oneDrop=False, skipDrop=0.0, gpuId=0,
                  backend="auto", foldCol=None, predictionCol="prediction",
                  detailedPredictionCol="detailed_prediction",
                  withDetailedPredictionCol=False, convertInvalidNumbersToNa=False,
                  **deprecatedArgs):
        """Set XGBoost parameters on this estimator.

        Takes the arguments reported by ``get_input_kwargs``, runs the
        enum-typed ones through ``validateEnumValue``, coerces scalar double
        parameters with ``set_double_values`` and hands everything to
        ``self._set``, whose result is returned.
        """
        kwargs = get_input_kwargs(self)

        # (enum getter, parameter name) pairs, checked in this order.
        enum_checks = (
            (self._H2OAlgoCommonParams__getDistributionEnum, "distribution"),
            (self._H2OXGBoostParams__getTreeMethodEnum, "treeMethod"),
            (self._H2OXGBoostParams__getGrowPolicyEnum, "growPolicy"),
            (self._H2OXGBoostParams__getBoosterEnum, "booster"),
            (self._H2OXGBoostParams__getDmatrixTypeEnum, "dmatrixType"),
            (self._H2OXGBoostParams__getSampleTypeEnum, "sampleType"),
            (self._H2OXGBoostParams__getNormalizeTypeEnum, "normalizeType"),
            (self._H2OXGBoostParams__getBackendEnum, "backend"),
        )
        for enum_getter, param_name in enum_checks:
            validateEnumValue(enum_getter(), kwargs, param_name)

        # we need to convert double arguments manually to floats as if we assign integer to double, py4j thinks that
        # the whole type is actually int and we get class cast exception
        #
        # Every entry must be followed by a comma: adjacent string literals are
        # silently concatenated, which previously fused "learnRateAnnealing" and
        # "sampleRate" into one bogus name so neither was converted.
        double_types = [
            "splitRatio", "minRows", "minChildWeight", "learnRate", "eta",
            "learnRateAnnealing",
            "sampleRate", "subsample", "colSampleRate", "colSampleByLevel",
            "colSampleRatePerTree", "colsampleBytree", "maxAbsLeafnodePred",
            "maxDeltaStep", "minSplitImprovement", "gamma",
            "minSumHessianInLeaf", "minDataInLeaf", "regLambda", "regAlpha",
            "rateDrop", "skipDrop",
        ]
        set_double_values(kwargs, double_types)
        return self._set(**kwargs)
Example #25
0
    def __init__(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                 featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
                 nfolds=0, keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                 seed=-1, distribution="AUTO", convertUnknownCategoricalLevelsToNa=False,
                 quietMode=True, ntrees=50, nEstimators=0, maxDepth=6, minRows=1.0,
                 minChildWeight=1.0, learnRate=0.3, eta=0.3, learnRateAnnealing=1.0,
                 sampleRate=1.0, subsample=1.0, colSampleRate=1.0, colSampleByLevel=1.0,
                 colSampleRatePerTree=1.0, colsampleBytree=1.0, maxAbsLeafnodePred=0.0,
                 maxDeltaStep=0.0, scoreTreeInterval=0, initialScoreInterval=4000,
                 scoreInterval=4000, minSplitImprovement=0.0, gamma=0.0, nthread=-1,
                 maxBins=256, maxLeaves=0, minSumHessianInLeaf=100.0, minDataInLeaf=0.0,
                 treeMethod="auto", growPolicy="depthwise", booster="gbtree",
                 dmatrixType="auto", regLambda=0.0, regAlpha=0.0, sampleType="uniform",
                 normalizeType="tree", rateDrop=0.0, oneDrop=False, skipDrop=0.0, gpuId=0,
                 backend="auto", foldCol=None, predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False, convertInvalidNumbersToNa=False,
                 **deprecatedArgs):
        """Build the H2OXGBoost wrapper and register its default parameter values.

        Creates the backing ``py_sparkling.ml.algos.H2OXGBoost`` Java object,
        registers defaults through ``_setDefault`` (enum-typed defaults are
        plain strings here), then forwards the arguments reported by
        ``get_input_kwargs`` to ``setParams``.
        """
        super(H2OXGBoost, self).__init__()
        self._java_obj = self._new_java_obj("py_sparkling.ml.algos.H2OXGBoost", self.uid)

        defaults = dict(
            modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
            featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
            nfolds=0, keepCrossValidationPredictions=False,
            keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
            seed=-1, distribution="AUTO", convertUnknownCategoricalLevelsToNa=False,
            quietMode=True, ntrees=50, nEstimators=0, maxDepth=6, minRows=1.0,
            minChildWeight=1.0, learnRate=0.3, eta=0.3, learnRateAnnealing=1.0,
            sampleRate=1.0, subsample=1.0, colSampleRate=1.0, colSampleByLevel=1.0,
            colSampleRatePerTree=1.0, colsampleBytree=1.0, maxAbsLeafnodePred=0.0,
            maxDeltaStep=0.0, scoreTreeInterval=0, initialScoreInterval=4000,
            scoreInterval=4000, minSplitImprovement=0.0, gamma=0.0, nthread=-1,
            maxBins=256, maxLeaves=0, minSumHessianInLeaf=100.0, minDataInLeaf=0.0,
            treeMethod="auto", growPolicy="depthwise", booster="gbtree",
            dmatrixType="auto", regLambda=0.0, regAlpha=0.0, sampleType="uniform",
            normalizeType="tree", rateDrop=0.0, oneDrop=False, skipDrop=0.0, gpuId=0,
            backend="auto", foldCol=None, predictionCol="prediction",
            detailedPredictionCol="detailed_prediction",
            withDetailedPredictionCol=False, convertInvalidNumbersToNa=False,
        )
        self._setDefault(**defaults)

        # Apply whatever get_input_kwargs reports for this call.
        self.setParams(**get_input_kwargs(self))
Example #26
0
    def setParams(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None,
                  featuresCols=[], allStringColumnsToCategorical=True,
                  columnsToCategorical=[], nfolds=0, keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                  seed=-1, distribution="AUTO", convertUnknownCategoricalLevelsToNa=False,
                  quietMode=True, missingValuesHandling=None, ntrees=50, nEstimators=0,
                  maxDepth=6, minRows=1.0, minChildWeight=1.0, learnRate=0.3, eta=0.3,
                  learnRateAnnealing=1.0, sampleRate=1.0, subsample=1.0, colSampleRate=1.0,
                  colSampleByLevel=1.0, colSampleRatePerTree=1.0, colsampleBytree=1.0,
                  maxAbsLeafnodePred=0.0, maxDeltaStep=0.0, scoreTreeInterval=0,
                  initialScoreInterval=4000, scoreInterval=4000, minSplitImprovement=0.0,
                  gamma=0.0, nthread=-1, maxBins=256, maxLeaves=0, minSumHessianInLeaf=100.0,
                  minDataInLeaf=0.0, treeMethod="auto", growPolicy="depthwise",
                  booster="gbtree", dmatrixType="auto", regLambda=0.0, regAlpha=0.0,
                  sampleType="uniform", normalizeType="tree", rateDrop=0.0, oneDrop=False,
                  skipDrop=0.0, gpuId=0, backend="auto", foldCol=None, **deprecatedArgs):
        """Set XGBoost parameters on this estimator.

        Takes the arguments reported by ``get_input_kwargs``, replaces string
        values of enum-typed parameters with the matching JVM enum constant
        (``valueOf``), coerces scalar double parameters with
        ``set_double_values`` and hands everything to ``self._set``, whose
        result is returned.
        """
        kwargs = get_input_kwargs(self)
        jvm = self._hc._jvm

        if "distribution" in kwargs:
            kwargs["distribution"] = jvm.hex.genmodel.utils.DistributionFamily.valueOf(
                kwargs["distribution"])

        # Parameter name -> name of the enum nested in XGBoostParameters.
        xgb_enums = (
            ("treeMethod", "TreeMethod"),
            ("growPolicy", "GrowPolicy"),
            ("booster", "Booster"),
            ("dmatrixType", "DMatrixType"),
            ("sampleType", "DartSampleType"),
            ("normalizeType", "DartNormalizeType"),
            ("backend", "Backend"),
        )
        for param_name, enum_name in xgb_enums:
            if param_name in kwargs:
                enum_cls = getattr(
                    jvm.hex.tree.xgboost.XGBoostModel.XGBoostParameters, enum_name)
                kwargs[param_name] = enum_cls.valueOf(kwargs[param_name])

        # we need to convert double arguments manually to floats as if we assign integer to double, py4j thinks that
        # the whole type is actually int and we get class cast exception
        #
        # Every entry must be followed by a comma: adjacent string literals are
        # silently concatenated, which previously fused "learnRateAnnealing" and
        # "sampleRate" into one bogus name so neither was converted.
        double_types = [
            "splitRatio", "minRows", "minChildWeight", "learnRate", "eta",
            "learnRateAnnealing",
            "sampleRate", "subsample", "colSampleRate", "colSampleByLevel",
            "colSampleRatePerTree", "colsampleBytree", "maxAbsLeafnodePred",
            "maxDeltaStep", "minSplitImprovement", "gamma",
            "minSumHessianInLeaf", "minDataInLeaf", "regLambda", "regAlpha",
            "rateDrop", "skipDrop",
        ]
        set_double_values(kwargs, double_types)
        return self._set(**kwargs)
Example #27
0
    def __init__(self,
                 modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="AUTO",
                 convertUnknownCategoricalLevelsToNa=False,
                 standardize=True,
                 family="gaussian",
                 link="family_default",
                 solver="AUTO",
                 tweedieVariancePower=0.0,
                 tweedieLinkPower=0.0,
                 alpha=None,
                 lambda_=None,
                 missingValuesHandling="MeanImputation",
                 prior=-1.0,
                 lambdaSearch=False,
                 nlambdas=-1,
                 nonNegative=False,
                 exactLambdas=False,
                 lambdaMinRatio=-1.0,
                 maxIterations=-1,
                 intercept=True,
                 betaEpsilon=1e-4,
                 objectiveEpsilon=-1.0,
                 gradientEpsilon=-1.0,
                 objReg=-1.0,
                 computePValues=False,
                 removeCollinearCols=False,
                 interactions=None,
                 interactionPairs=None,
                 earlyStopping=True,
                 foldCol=None,
                 **deprecatedArgs):
        """Create the H2OGLM estimator and register its default parameters.

        The constructor obtains an ``H2OContext`` for the current
        ``SparkSession``, creates the backing Java object
        ``py_sparkling.ml.algos.H2OGLM``, registers the defaults listed below
        and finally forwards the caller's arguments to ``setParams``.

        The string defaults of ``distribution``, ``family``, ``link``,
        ``solver`` and ``missingValuesHandling`` are stored as the values
        returned by ``valueOf`` on the corresponding JVM classes rather than
        as Python strings.

        ``deprecatedArgs`` is accepted by the signature but is not referenced
        directly in this method.
        """
        super(H2OGLM, self).__init__()
        self._hc = H2OContext.getOrCreate(SparkSession.builder.getOrCreate(),
                                          verbose=False)
        self._java_obj = self._new_java_obj("py_sparkling.ml.algos.H2OGLM",
                                            self.uid)

        # Shorthand handles for the JVM packages the defaults are looked up in.
        jvm_hex = self._hc._jvm.hex
        glm_params = jvm_hex.glm.GLMModel.GLMParameters
        dl_params = jvm_hex.deeplearning.DeepLearningModel.DeepLearningParameters

        defaults = {
            "modelId": None,
            "splitRatio": 1.0,
            "labelCol": "label",
            "weightCol": None,
            "featuresCols": [],
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "nfolds": 0,
            "keepCrossValidationPredictions": False,
            "keepCrossValidationFoldAssignment": False,
            "parallelizeCrossValidation": True,
            "seed": -1,
            "distribution":
                jvm_hex.genmodel.utils.DistributionFamily.valueOf("AUTO"),
            "convertUnknownCategoricalLevelsToNa": False,
            "standardize": True,
            "family": glm_params.Family.valueOf("gaussian"),
            "link": glm_params.Link.valueOf("family_default"),
            "solver": glm_params.Solver.valueOf("AUTO"),
            "tweedieVariancePower": 0.0,
            "tweedieLinkPower": 0.0,
            "alpha": None,
            "lambda_": None,
            "missingValuesHandling":
                dl_params.MissingValuesHandling.valueOf("MeanImputation"),
            "prior": -1.0,
            "lambdaSearch": False,
            "nlambdas": -1,
            "nonNegative": False,
            "exactLambdas": False,
            "lambdaMinRatio": -1.0,
            "maxIterations": -1,
            "intercept": True,
            "betaEpsilon": 1e-4,
            "objectiveEpsilon": -1.0,
            "gradientEpsilon": -1.0,
            "objReg": -1.0,
            "computePValues": False,
            "removeCollinearCols": False,
            "interactions": None,
            "interactionPairs": None,
            "earlyStopping": True,
            "foldCol": None,
        }
        self._setDefault(**defaults)

        # get_input_kwargs presumably yields only the arguments the caller
        # passed explicitly -- TODO confirm against its definition.
        self.setParams(**get_input_kwargs(self))
Example #28
0
    def setParams(self,
                  modelId=None,
                  splitRatio=1.0,
                  labelCol="label",
                  weightCol=None,
                  featuresCols=[],
                  allStringColumnsToCategorical=True,
                  columnsToCategorical=[],
                  nfolds=0,
                  keepCrossValidationPredictions=False,
                  keepCrossValidationFoldAssignment=False,
                  parallelizeCrossValidation=True,
                  seed=-1,
                  distribution="AUTO",
                  ntrees=50,
                  maxDepth=5,
                  minRows=10.0,
                  nbins=20,
                  nbinsCats=1024,
                  minSplitImprovement=1e-5,
                  histogramType="AUTO",
                  r2Stopping=java_max_double_value,
                  nbinsTopLevel=1 << 10,
                  buildTreeOneNode=False,
                  scoreTreeInterval=0,
                  sampleRate=1.0,
                  sampleRatePerClass=None,
                  colSampleRateChangePerLevel=1.0,
                  colSampleRatePerTree=1.0,
                  learnRate=0.1,
                  learnRateAnnealing=1.0,
                  colSampleRate=1.0,
                  maxAbsLeafnodePred=java_max_double_value,
                  predNoiseBandwidth=0.0,
                  convertUnknownCategoricalLevelsToNa=False,
                  foldCol=None,
                  **deprecatedArgs):
        """Set the parameters of this estimator.

        The keyword arguments obtained from ``get_input_kwargs(self)`` are
        normalised before being handed to ``self._set``:

        * ``distribution`` and ``histogramType`` are replaced by the values
          returned by ``valueOf`` on the matching JVM classes;
        * every parameter named in ``double_types`` is passed through
          ``set_double_values`` so that integer inputs reach the JVM as
          floats;
        * ``sampleRatePerClass``, when given and not ``None``, is converted to
          a list of floats.

        ``deprecatedArgs`` is accepted by the signature but is not referenced
        directly in this method.

        :return: whatever ``self._set(**kwargs)`` returns.
        """
        # Presumably only the arguments the caller passed explicitly --
        # TODO confirm against the definition of get_input_kwargs.
        kwargs = get_input_kwargs(self)

        if "distribution" in kwargs:
            distribution_cls = self._hc._jvm.hex.genmodel.utils.DistributionFamily
            kwargs["distribution"] = distribution_cls.valueOf(
                kwargs["distribution"])
        if "histogramType" in kwargs:
            histogram_cls = (self._hc._jvm.hex.tree.SharedTreeModel
                             .SharedTreeParameters.HistogramType)
            kwargs["histogramType"] = histogram_cls.valueOf(
                kwargs["histogramType"])

        # We need to convert double arguments manually to floats: if an
        # integer is assigned to a double parameter, py4j treats the value as
        # an int and a class cast exception is raised on the JVM side.
        #
        # Every entry must be followed by a comma. Adjacent string literals
        # are silently concatenated by Python, which previously merged
        # "maxAbsLeafnodePred" and "minSplitImprovement" into one bogus name
        # and left both parameters unconverted.
        double_types = [
            "minRows",
            "predNoiseBandwidth",
            "splitRatio",
            "learnRate",
            "colSampleRate",
            "learnRateAnnealing",
            "maxAbsLeafnodePred",
            "minSplitImprovement",
            "r2Stopping",
            "sampleRate",
            "colSampleRateChangePerLevel",
            "colSampleRatePerTree",
        ]
        set_double_values(kwargs, double_types)

        # The doubles inside array-valued parameters need the same treatment.
        # Build a real list (on Python 3 ``map`` is a lazy iterator) and leave
        # an explicit None -- the declared default -- untouched instead of
        # trying to iterate over it.
        if ("sampleRatePerClass" in kwargs
                and kwargs["sampleRatePerClass"] is not None):
            kwargs["sampleRatePerClass"] = [
                float(rate) for rate in kwargs["sampleRatePerClass"]
            ]

        return self._set(**kwargs)
Example #29
0
    def __init__(self,
                 modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="AUTO",
                 convertUnknownCategoricalLevelsToNa=False,
                 standardize=True,
                 family="gaussian",
                 link="family_default",
                 solver="AUTO",
                 tweedieVariancePower=0.0,
                 tweedieLinkPower=0.0,
                 alpha=None,
                 lambda_=None,
                 missingValuesHandling="MeanImputation",
                 prior=-1.0,
                 lambdaSearch=False,
                 nlambdas=-1,
                 nonNegative=False,
                 exactLambdas=False,
                 lambdaMinRatio=-1.0,
                 maxIterations=-1,
                 intercept=True,
                 betaEpsilon=1e-4,
                 objectiveEpsilon=-1.0,
                 gradientEpsilon=-1.0,
                 objReg=-1.0,
                 computePValues=False,
                 removeCollinearCols=False,
                 interactions=None,
                 interactionPairs=None,
                 earlyStopping=True,
                 foldCol=None,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False,
                 **deprecatedArgs):
        """Create the H2OGLM estimator and register its default parameters.

        Creates the backing Java object ``py_sparkling.ml.algos.H2OGLM``,
        registers the default value of every parameter via ``_setDefault``
        and then forwards the caller's arguments to ``setParams``.

        ``deprecatedArgs`` is accepted by the signature but is not referenced
        directly in this method.
        """
        super(H2OGLM, self).__init__()
        self._java_obj = self._new_java_obj("py_sparkling.ml.algos.H2OGLM",
                                            self.uid)

        # Defaults mirror the keyword defaults of the signature above.
        defaults = {
            "modelId": None,
            "splitRatio": 1.0,
            "labelCol": "label",
            "weightCol": None,
            "featuresCols": [],
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "nfolds": 0,
            "keepCrossValidationPredictions": False,
            "keepCrossValidationFoldAssignment": False,
            "parallelizeCrossValidation": True,
            "seed": -1,
            "distribution": "AUTO",
            "convertUnknownCategoricalLevelsToNa": False,
            "standardize": True,
            "family": "gaussian",
            "link": "family_default",
            "solver": "AUTO",
            "tweedieVariancePower": 0.0,
            "tweedieLinkPower": 0.0,
            "alpha": None,
            "lambda_": None,
            "missingValuesHandling": "MeanImputation",
            "prior": -1.0,
            "lambdaSearch": False,
            "nlambdas": -1,
            "nonNegative": False,
            "exactLambdas": False,
            "lambdaMinRatio": -1.0,
            "maxIterations": -1,
            "intercept": True,
            "betaEpsilon": 1e-4,
            "objectiveEpsilon": -1.0,
            "gradientEpsilon": -1.0,
            "objReg": -1.0,
            "computePValues": False,
            "removeCollinearCols": False,
            "interactions": None,
            "interactionPairs": None,
            "earlyStopping": True,
            "foldCol": None,
            "predictionCol": "prediction",
            "detailedPredictionCol": "detailed_prediction",
            "withDetailedPredictionCol": False,
            "convertInvalidNumbersToNa": False,
        }
        self._setDefault(**defaults)

        # get_input_kwargs presumably yields only the arguments the caller
        # passed explicitly -- TODO confirm against its definition.
        self.setParams(**get_input_kwargs(self))
Example #30
0
    def __init__(self,
                 modelId=None,
                 splitRatio=1.0,
                 labelCol="label",
                 weightCol=None,
                 featuresCols=[],
                 allStringColumnsToCategorical=True,
                 columnsToCategorical=[],
                 nfolds=0,
                 keepCrossValidationPredictions=False,
                 keepCrossValidationFoldAssignment=False,
                 parallelizeCrossValidation=True,
                 seed=-1,
                 distribution="AUTO",
                 ntrees=50,
                 maxDepth=5,
                 minRows=10.0,
                 nbins=20,
                 nbinsCats=1024,
                 minSplitImprovement=1e-5,
                 histogramType="AUTO",
                 r2Stopping=java_max_double_value,
                 nbinsTopLevel=1 << 10,
                 buildTreeOneNode=False,
                 scoreTreeInterval=0,
                 sampleRate=1.0,
                 sampleRatePerClass=None,
                 colSampleRateChangePerLevel=1.0,
                 colSampleRatePerTree=1.0,
                 learnRate=0.1,
                 learnRateAnnealing=1.0,
                 colSampleRate=1.0,
                 maxAbsLeafnodePred=java_max_double_value,
                 predNoiseBandwidth=0.0,
                 convertUnknownCategoricalLevelsToNa=False,
                 foldCol=None,
                 predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction",
                 withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False,
                 **deprecatedArgs):
        """Create the H2OGBM estimator and register its default parameters.

        Creates the backing Java object ``py_sparkling.ml.algos.H2OGBM``,
        registers the default value of every parameter via ``_setDefault``
        and then forwards the caller's arguments to ``setParams``.

        ``deprecatedArgs`` is accepted by the signature but is not referenced
        directly in this method.
        """
        super(H2OGBM, self).__init__()
        self._java_obj = self._new_java_obj("py_sparkling.ml.algos.H2OGBM",
                                            self.uid)

        # NOTE(review): the signature defaults r2Stopping / maxAbsLeafnodePred
        # to java_max_double_value, while the registered defaults below are
        # read from the JVM (Double.MAX_VALUE). Presumably the two are equal
        # -- confirm against the definition of java_max_double_value.
        defaults = {
            "modelId": None,
            "splitRatio": 1.0,
            "labelCol": "label",
            "weightCol": None,
            "featuresCols": [],
            "allStringColumnsToCategorical": True,
            "columnsToCategorical": [],
            "nfolds": 0,
            "keepCrossValidationPredictions": False,
            "keepCrossValidationFoldAssignment": False,
            "parallelizeCrossValidation": True,
            "seed": -1,
            "distribution": "AUTO",
            "ntrees": 50,
            "maxDepth": 5,
            "minRows": 10.0,
            "nbins": 20,
            "nbinsCats": 1024,
            "minSplitImprovement": 1e-5,
            "histogramType": "AUTO",
            "r2Stopping": _jvm().Double.MAX_VALUE,
            "nbinsTopLevel": 1 << 10,
            "buildTreeOneNode": False,
            "scoreTreeInterval": 0,
            "sampleRate": 1.0,
            "sampleRatePerClass": None,
            "colSampleRateChangePerLevel": 1.0,
            "colSampleRatePerTree": 1.0,
            "learnRate": 0.1,
            "learnRateAnnealing": 1.0,
            "colSampleRate": 1.0,
            "maxAbsLeafnodePred": _jvm().Double.MAX_VALUE,
            "predNoiseBandwidth": 0.0,
            "convertUnknownCategoricalLevelsToNa": False,
            "foldCol": None,
            "predictionCol": "prediction",
            "detailedPredictionCol": "detailed_prediction",
            "withDetailedPredictionCol": False,
            "convertInvalidNumbersToNa": False,
        }
        self._setDefault(**defaults)

        # get_input_kwargs presumably yields only the arguments the caller
        # passed explicitly -- TODO confirm against its definition.
        self.setParams(**get_input_kwargs(self))