Example #1
 def __init__(self,
              labels,
              inputCol=None,
              outputCol=None,
              handleInvalid='error',
              defaultValue=0.0):
     """
     __init__(self, labels, inputCol=None, outputCol=None, handleInvalid='error', defaultValue=0.0)
     labels is a dict {string: double}
     handleInvalid: how to handle missing labels: 'error' (throw an error), or 'keep' (map to the default value)
     """
     assert handleInvalid in [
         'error', 'keep'
     ], 'Invalid value for handleInvalid: {}'.format(handleInvalid)
     super(StringMap, self).__init__()
     labels_scala_map = _jvm() \
         .scala \
         .collection \
         .JavaConverters \
         .mapAsScalaMapConverter(labels) \
         .asScala() \
         .toMap(_jvm().scala.Predef.conforms())
     handle_invalid_jvm = _jvm(
     ).ml.combust.mleap.core.feature.StringMapHandleInvalid.__getattr__(
         handleInvalid.capitalize() + '$').__getattr__('MODULE$')
     string_map_model = self._new_java_obj(
         "ml.combust.mleap.core.feature.StringMapModel", labels_scala_map,
         handle_invalid_jvm, defaultValue)
     self._java_obj = self._new_java_obj(
         "org.apache.spark.ml.mleap.feature.StringMap", self.uid,
         string_map_model)
     self.setInputCol(inputCol)
     self.setOutputCol(outputCol)
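
A hedged usage sketch for the StringMap transformer above, assuming a running SparkSession (`spark`) with the MLeap Spark extension on the classpath; the label map and column names are illustrative only:

labels = {'low': 0.0, 'high': 1.0}
string_map = StringMap(labels=labels, inputCol='category', outputCol='category_value',
                       handleInvalid='keep', defaultValue=-1.0)
df = spark.createDataFrame([('low',), ('unknown',)], ['category'])
result = string_map.transform(df)  # with handleInvalid='keep', 'unknown' maps to defaultValue
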
Example #2
def testLongRDDToH2OFrame(spark, hc):
    min = _jvm().Integer.MIN_VALUE - 1
    max = _jvm().Integer.MAX_VALUE + 1
    rdd = spark.sparkContext.parallelize([1, min, max])
    h2o_frame = hc.asH2OFrame(rdd)
    assert h2o_frame[0, 0] == 1
    assert h2o_frame[1, 0] == min
    assert h2o_frame[2, 0] == max
    unit_test_utils.asert_h2o_frame(h2o_frame, rdd)
Example #3
def testNumericRDDtoH2OFrameWithValueTooBig(spark, hc):
    min = _jvm().Long.MIN_VALUE - 1
    max = _jvm().Long.MAX_VALUE + 1
    rdd = spark.sparkContext.parallelize([1, min, max])
    h2o_frame = hc.asH2OFrame(rdd)
    assert h2o_frame[0, 0] == str(1)
    assert h2o_frame[1, 0] == str(min)
    assert h2o_frame[2, 0] == str(max)
    unit_test_utils.asert_h2o_frame(h2o_frame, rdd)
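
Examples #2 and #3 rely on reading numeric bounds directly from the JVM through the Py4J gateway. A minimal sketch of that pattern, assuming an active SparkContext:

from pyspark.ml.util import _jvm

int_max = _jvm().Integer.MAX_VALUE   # 2147483647
long_max = _jvm().Long.MAX_VALUE     # 9223372036854775807
too_big_for_int = int_max + 1        # still representable as a Java long
too_big_for_long = long_max + 1      # triggers the string fallback tested in Example #3
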
Example #4
    def __init__(self,
                 labels={},
                 inputCol=None,
                 outputCol=None,
                 handleInvalid='error',
                 defaultValue=0.0):
        """
        :param labels: a dict {string: double}
        :param handleInvalid: how to handle missing labels: 'error' (throw), or 'keep' (map to defaultValue)
        :param defaultValue: value to use if key is not found in labels
        """
        """
        labels must be a dict {string: double} or a spark DataFrame with columns inputCol & outputCol
        handleInvalid: 
        """
        super(StringMap, self).__init__()

        def validate_args():
            """
            validate args early to avoid failing inside Py4J with a hard-to-interpret error message
            """
            assert handleInvalid in [
                'error', 'keep'
            ], 'Invalid value for handleInvalid: {}'.format(handleInvalid)
            assert isinstance(labels,
                              dict), 'labels must be a dict, got: {}'.format(
                                  type(labels))
            for (key, value) in labels.items():
                assert isinstance(key, six.string_types), \
                    'label keys must be a string type, got: {}'.format(type(key))
                assert isinstance(
                    value,
                    float), 'label values must be float, got: {}'.format(
                        type(value))

        validate_args()

        labels_scala_map = _jvm() \
            .scala \
            .collection \
            .JavaConverters \
            .mapAsScalaMapConverter(labels) \
            .asScala() \
            .toMap(_jvm().scala.Predef.conforms())
        handle_invalid_jvm = _jvm(
        ).ml.combust.mleap.core.feature.StringMapHandleInvalid.__getattr__(
            handleInvalid.capitalize() + '$').__getattr__('MODULE$')
        string_map_model = self._new_java_obj(
            "ml.combust.mleap.core.feature.StringMapModel", labels_scala_map,
            handle_invalid_jvm, defaultValue)
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.mleap.feature.StringMap", self.uid,
            string_map_model)
        self.setInputCol(inputCol)
        self.setOutputCol(outputCol)
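
The early validation above is intended to fail with a readable message rather than an opaque Py4J error. A hedged illustration (the label dict is hypothetical):

try:
    StringMap(labels={'a': 1}, inputCol='in', outputCol='out')
except AssertionError as err:
    print(err)  # label values must be float, got: <class 'int'>
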
Example #5
    def __init__(
        self,
        operation=None,
        inputA=None,
        inputB=None,
        outputCol=None,
    ):
        """
        Computes the mathematical binary `operation` over
        the input columns A and B.

        :param operation: BinaryOperation to specify the operation type
        :param inputA: column name for the left side of operation (string)
        :param inputB: column name for the right side of operation (string)
        :param outputCol: output column name (string)

        NOTE: `operation` is not a JavaParam because the underlying MathBinary
        scala object uses a MathBinaryModel to store the info about the binary
        operation.

        `operation` has a None default value even though it should *never* be
        None. A None value is necessary upon deserialization to instantiate a
        MathBinary without errors. Afterwards, pyspark sets the _java_obj to
        the deserialized scala object, which encodes the operation.
        """
        super(MathBinary, self).__init__()

        # if operation=None, it means that pyspark is reloading the model
        # from disk and calling this method without args. In such case we don't
        # need to set _java_obj here because pyspark will set it after creation
        #
        # if operation is not None, we can proceed to instantiate the scala classes
        if operation:
            scalaBinaryOperation = jvm_scala_object(
                _jvm().ml.combust.mleap.core.feature.BinaryOperation,
                operation.name)

            # IMPORTANT: defaults for missing values are forced to None.
            # I've found an issue when setting default values for A and B, so
            # remember to treat your missing values before the MathBinary
            # (for example, you could use an Imputer).
            scalaMathBinaryModel = _jvm(
            ).ml.combust.mleap.core.feature.MathBinaryModel(
                scalaBinaryOperation, Some(None), Some(None))

            self._java_obj = self._new_java_obj(
                "org.apache.spark.ml.mleap.feature.MathBinary",
                self.uid,
                scalaMathBinaryModel,
            )

        self._setDefault()
        self.setParams(inputA=inputA, inputB=inputB, outputCol=outputCol)
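
A hedged usage sketch for the MathBinary transformer above, assuming the MLeap Spark extension is available; the DataFrame and column names are illustrative:

math_binary = MathBinary(operation=BinaryOperation.Multiply,
                         inputA='price', inputB='quantity', outputCol='total')
result = math_binary.transform(df)  # df is assumed to have numeric 'price' and 'quantity' columns
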
Example #6
 def convert(value):
     if value is None:
         raise TypeError("None is not allowed.")
     elif isinstance(value, JavaObject):
         return value
     elif isinstance(value, DenseVector):
         package = getattr(_jvm().ai.h2o.sparkling.ml.params, "ConversionUtils$")
         module = package.__getattr__("MODULE$")
         return _jvm().org.apache.spark.ml.linalg.DenseVector(
             module.toDoubleArray(H2OTypeConverters.toListFloat()(value.values)))
     else:
         raise TypeError("Invalid type. The expected type is pyspark.ml.linalg.DenseVector.")
Example #7
    def __init__(self, modelId=None, splitRatio=1.0, labelCol="label", weightCol=None, featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
                 nfolds=0, keepCrossValidationPredictions=False, keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                 seed=-1, distribution="AUTO", ntrees=50, maxDepth=5, minRows=10.0, nbins=20, nbinsCats=1024, minSplitImprovement=1e-5,
                 histogramType="AUTO", r2Stopping=java_max_double_value,
                 nbinsTopLevel=1<<10, buildTreeOneNode=False, scoreTreeInterval=0,
                 sampleRate=1.0, sampleRatePerClass=None, colSampleRateChangePerLevel=1.0, colSampleRatePerTree=1.0,
                 learnRate=0.1, learnRateAnnealing=1.0, colSampleRate=1.0, maxAbsLeafnodePred=java_max_double_value,
                 predNoiseBandwidth=0.0, convertUnknownCategoricalLevelsToNa=False, foldCol=None, predictionCol="prediction",
                 detailedPredictionCol="detailed_prediction", withDetailedPredictionCol=False,
                 convertInvalidNumbersToNa=False, **deprecatedArgs):
        Initializer.load_sparkling_jar()
        super(H2OGBM, self).__init__()
        self._java_obj = self._new_java_obj("ai.h2o.sparkling.ml.algos.H2OGBM", self.uid)

        self._setDefault(modelId=None, splitRatio=1.0, labelCol="label", weightCol=None, featuresCols=[], allStringColumnsToCategorical=True, columnsToCategorical=[],
                         nfolds=0, keepCrossValidationPredictions=False, keepCrossValidationFoldAssignment=False, parallelizeCrossValidation=True,
                         seed=-1, distribution="AUTO",
                         ntrees=50, maxDepth=5, minRows=10.0, nbins=20, nbinsCats=1024, minSplitImprovement=1e-5,
                         histogramType="AUTO",
                         r2Stopping=_jvm().Double.MAX_VALUE, nbinsTopLevel=1<<10, buildTreeOneNode=False, scoreTreeInterval=0,
                         sampleRate=1.0, sampleRatePerClass=None, colSampleRateChangePerLevel=1.0, colSampleRatePerTree=1.0,
                         learnRate=0.1, learnRateAnnealing=1.0, colSampleRate=1.0, maxAbsLeafnodePred=_jvm().Double.MAX_VALUE,
                         predNoiseBandwidth=0.0, convertUnknownCategoricalLevelsToNa=False, foldCol=None,
                         predictionCol="prediction", detailedPredictionCol="detailed_prediction", withDetailedPredictionCol=False,
                         convertInvalidNumbersToNa=False)
        kwargs = get_input_kwargs(self)
        self.setParams(**kwargs)
Example #8
def _get_mleap_schema(dataframe):
    """
    :param dataframe: A PySpark dataframe object

    :return: The schema of the supplied dataframe, in MLeap format. This is a serialized object of type
    `ml.combust.mleap.core.types.StructType`, represented as a JSON dictionary.
    """
    from pyspark.ml.util import _jvm
    ReflectionUtil = _jvm().py4j.reflection.ReflectionUtil

    # Convert the Spark dataframe's schema to an MLeap schema object.
    # This is equivalent to the Scala function call
    # `org.apache.spark.sql.mleap.TypeConverters.sparkSchemaToMleapSchema(dataframe)`
    tc_clazz = ReflectionUtil.classForName(
        "org.apache.spark.sql.mleap.TypeConverters$")
    tc_inst = tc_clazz.getField("MODULE$").get(tc_clazz)
    mleap_schema_struct = tc_inst.sparkSchemaToMleapSchema(dataframe._jdf)

    # Obtain a JSON representation of the MLeap schema object
    # This is equivalent to the Scala function call
    # `ml.combust.mleap.json.JsonSupport.MleapStructTypeFormat().write(mleap_schema_struct)`
    js_clazz = ReflectionUtil.classForName(
        "ml.combust.mleap.json.JsonSupport$")
    js_inst = js_clazz.getField("MODULE$").get(js_clazz)
    mleap_schema_json = js_inst.MleapStructTypeFormat().write(
        mleap_schema_struct)
    return json.loads(mleap_schema_json.toString())
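
A minimal usage sketch for the helper above, assuming a SparkSession (`spark`) with the MLeap jars on the classpath; the DataFrame is illustrative:

df = spark.createDataFrame([(1.0, 'a')], ['price', 'label'])
schema = _get_mleap_schema(df)
print(schema)  # JSON dictionary describing the MLeap StructType, e.g. {'fields': [...]}
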
Example #9
 def convert(value):
     package = getattr(_jvm().ai.h2o.sparkling.ml.params,
                       "EnumParamValidator$")
     module = package.__getattr__("MODULE$")
     return module.getValidatedEnumValue(
         enumClass,
         H2OTypeConverters.toString()(value))
Example #10
 def convert(value):
     javaObj = H2OTypeConverters.toJavaObj()(value)
     if javaObj is None:
         return None
     else:
         package = getattr(_jvm().ai.h2o.sparkling.ml.algos, "H2OGridSearch$SupportedAlgos$")
         module = package.__getattr__("MODULE$")
         module.checkIfSupported(javaObj)
         return javaObj
Example #11
    def getOrCreate(spark=None, conf=None):
        """
        Get an existing H2OContext or create a new one based on the provided H2O configuration. If the conf
        parameter is set, its configuration is used. Otherwise the configuration properties passed to Sparkling
        Water are used. If a value is not found, a default is used in most cases. The default cluster mode
        is internal, i.e. spark.ext.h2o.external.cluster.mode=false.

        :param spark: Spark Context or Spark Session or H2OConf
        :param conf: H2O configuration as instance of H2OConf
        :return:  instance of H2OContext
        """

        if spark is not None and not isinstance(spark, H2OConf):
            warnings.warn(
                "Method getOrCreate with spark argument is deprecated. Please use either just getOrCreate() or if you need "
                "to pass extra H2OConf, use getOrCreate(conf). The spark argument will be removed in release 3.32."
            )

        # Workaround for a bug in Spark 2.1: a SparkSession created in PySpark is not seen in Java,
        # and calling SparkSession.builder.getOrCreate on the Java side creates a new session, which is
        # not desirable
        activeSession = SparkSession._instantiatedSession
        if activeSession is not None:
            jvm = activeSession.sparkContext._jvm
            jvm.org.apache.spark.sql.SparkSession.setDefaultSession(
                activeSession._jsparkSession)

        if spark is not None and isinstance(spark, H2OConf):
            selected_conf = spark
        elif conf is not None:
            selected_conf = conf
        else:
            selected_conf = H2OConf()
        if selected_conf.runsInExternalClusterMode():
            selected_conf.set("spark.ext.h2o.rest.api.based.client", "true")

        h2o_context = H2OContext()

        # Create backing H2OContext
        package = getattr(_jvm().org.apache.spark.h2o, "H2OContext$")
        module = package.__getattr__("MODULE$")
        jhc = module.getOrCreate(selected_conf._jconf)
        h2o_context._jhc = jhc
        h2o_context._conf = selected_conf
        h2o_context._client_ip = jhc.h2oLocalClientIp()
        h2o_context._client_port = jhc.h2oLocalClientPort()

        # Create H2O REST API client
        if not h2o_context.__isClientConnected(
        ) or not H2OContext.__isConnected:
            h2o_context.__h2o_connect()
            H2OContext.__isConnected = True
            h2o_context.__setClientConnected()
            print(h2o_context)

        return h2o_context
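
A minimal usage sketch for the factory above, assuming pysparkling is installed and a Spark session is already running; per the deprecation warning, pass either nothing or an explicit H2OConf:

conf = H2OConf()                   # defaults to the internal cluster mode
hc = H2OContext.getOrCreate(conf)  # or simply H2OContext.getOrCreate()
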
Example #12
 def _new_java_obj(java_class, *args):
     """
     Returns a new Java object.
     """
     sc = SparkContext._active_spark_context
     java_obj = _jvm()
     for name in java_class.split("."):
         java_obj = getattr(java_obj, name)
     java_args = [_py2java(sc, arg) for arg in args]
     return java_obj(*java_args)
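
A hedged illustration of how the helper above is typically called: the dotted class name is resolved attribute-by-attribute on the JVM view, then invoked with arguments converted by _py2java. The ParamMap class name is taken from Example #27:

param_map = _new_java_obj("org.apache.spark.ml.param.ParamMap")
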
Example #14
 def __init__(self, spark=None):
     try:
         if spark is not None:
             warnings.warn(
                 "Constructor H2OConf(spark) with spark argument is deprecated. Please use just H2OConf(). "
                 "The argument will be removed in release 3.32.")
         Initializer.load_sparkling_jar()
         self._jconf = _jvm().org.apache.spark.h2o.H2OConf()
     except:
         raise
Example #15
    def _new_java_obj(java_class: str, *args: Any) -> "JavaObject":
        """
        Returns a new Java object.
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        java_obj = _jvm()
        for name in java_class.split("."):
            java_obj = getattr(java_obj, name)
        java_args = [_py2java(sc, arg) for arg in args]
        return java_obj(*java_args)
Example #16
 def createFromMojo(pathToMojo, settings=H2OMOJOSettings.default()):
     # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
     Initializer.load_sparkling_jar()
     javaModel = _jvm(
     ).ai.h2o.sparkling.ml.models.H2OMOJOModel.createFromMojo(
         pathToMojo, settings.toJavaObject())
     className = javaModel.getClass().getSimpleName()
     if className == "H2OSupervisedMOJOModel":
         return H2OSupervisedMOJOModel(javaModel)
     elif className == "H2OUnsupervisedMOJOModel":
         return H2OUnsupervisedMOJOModel(javaModel)
     else:
         return H2OMOJOModel(javaModel)
Example #17
 def __prepareSparkDataForConversion(sparkData):
     if isinstance(sparkData, DataFrame):
         return sparkData
     elif sparkData.isEmpty():
         return sparkData.toDF()
     else:
         session = SparkSession.builder.getOrCreate()
         first = sparkData.first()
         if isinstance(first, (str, bool, numbers.Integral, float)):
             if isinstance(first, str):
                 return session.createDataFrame(sparkData, StringType())
             elif isinstance(first, bool):
                 return session.createDataFrame(sparkData, BooleanType())
             elif (isinstance(sparkData.min(), numbers.Integral)
                   and isinstance(sparkData.max(), numbers.Integral)):
                 if sparkData.min(
                 ) >= _jvm().Integer.MIN_VALUE and sparkData.max() <= _jvm(
                 ).Integer.MAX_VALUE:
                     return session.createDataFrame(sparkData,
                                                    IntegerType())
                 elif sparkData.min() >= _jvm(
                 ).Long.MIN_VALUE and sparkData.max() <= _jvm(
                 ).Long.MAX_VALUE:
                     return session.createDataFrame(sparkData, LongType())
                 else:
                     warnings.warn(
                         "Maximal or minimal number in RDD is too big to convert to Java. Treating numbers as strings."
                     )
                     return session.createDataFrame(sparkData, StringType())
             elif isinstance(first, float):
              ## Spark would fail to create the data frame if the RDD[Float] contained int values
              ## Explicitly convert everything to float
                 return session.createDataFrame(
                     sparkData.map(lambda x: float(x)), FloatType())
             else:
                 raise ValueError('Unreachable code')
         else:
             return session.createDataFrame(sparkData)
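
The numeric branch above picks a Spark column type by comparing the RDD's bounds against the JVM's Integer and Long limits. A small sketch of the same check, assuming an active SparkSession (`spark`):

rdd = spark.sparkContext.parallelize([1, 2, 3])
fits_int = (rdd.min() >= _jvm().Integer.MIN_VALUE
            and rdd.max() <= _jvm().Integer.MAX_VALUE)
print(fits_int)  # True here, so the RDD would be converted with IntegerType
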
Example #18
        def convert(value):
            package = getattr(_jvm().ai.h2o.sparkling.ml.params, "H2OAlgoParamsHelper$")
            module = package.__getattr__("MODULE$")
            if nullEnabled:
                converter = H2OTypeConverters.toNullableListString()
            else:
                converter = H2OTypeConverters.toListString()

            javaArray = module.getValidatedEnumValues(enumClass, converter(value), nullEnabled)

            if javaArray is None:
                return None
            else:
                return list(javaArray)
Example #19
    def __init__(self, operation=None, inputCol=None, outputCol=None):
        """
        Computes the mathematical unary `operation` over the input column.

        NOTE: `operation` is not a JavaParam because the underlying 
        MathUnary scala object uses a MathUnaryModel to store the info about
        the unary operation (sin, tan, etc.), not a JavaParam string.

        `operation` has a None default value even though it should *never* be
        None. A None value is necessary upon deserialization to instantiate a
        MathUnary without errors. Afterwards, pyspark sets the _java_obj to
        the deserialized scala object, which encodes the operation.
        """
        super(MathUnary, self).__init__()

        # if operation=None, it means that pyspark is reloading the model
        # from disk and calling this method without args. In such case we don't
        # need to set _java_obj here because pyspark will set it after creation
        #
        # if operation is not None, we can proceed to instantiate the scala classes
        if operation:
            scalaUnaryOperation = jvm_scala_object(
                _jvm().ml.combust.mleap.core.feature.UnaryOperation,
                operation.name)

            scalaMathUnaryModel = _jvm(
            ).ml.combust.mleap.core.feature.MathUnaryModel(scalaUnaryOperation)

            self._java_obj = self._new_java_obj(
                "org.apache.spark.ml.mleap.feature.MathUnary",
                self.uid,
                scalaMathUnaryModel,
            )

        self._setDefault()
        self.setParams(inputCol=inputCol, outputCol=outputCol)
Example #20
def ScalaNone():
    return jvm_scala_object(_jvm().scala, "None")
Example #21
    def __init__(
        self,
        operation=None,
        inputA=None,
        inputB=None,
        outputCol=None,
        defaultA=None,
        defaultB=None,
    ):
        """
        Computes the mathematical binary `operation` over
        the input columns A and B.

        :param operation: BinaryOperation to specify the operation type
        :param inputA: column name for the left side of operation (string)
        :param inputB: column name for the right side of operation (string)
        :param outputCol: output column name (string)
        :param defaultA: Default to use instead of inputA. This will only be used
         when inputA is None. For example when defaultA=4,
         operation=BinaryOperation.Multiply and inputB=f1, then all entries of
         col f1 will be multiplied by 4.
        :param defaultB: Default to use instead of inputB. This will only be used
         when inputB is None. For example when defaultB=4,
         operation=BinaryOperation.Multiply and inputA=f1, then all entries of
         col f1 will be multiplied by 4.

        NOTE: `operation`, `defaultA`, and `defaultB` are not JavaParams because
        the underlying MathBinary scala object uses a MathBinaryModel to store
        the info about the binary operation.

        `operation` has a None default value even though it should *never* be
        None. A None value is necessary upon deserialization to instantiate a
        MathBinary without errors. Afterwards, pyspark sets the _java_obj to
        the deserialized scala object, which encodes the operation (as well
        as the default values for A and B).
        """
        super(MathBinary, self).__init__()

        # if operation=None, it means that pyspark is reloading the model
        # from disk and calling this method without args. In such case we don't
        # need to set _java_obj here because pyspark will set it after creation
        #
        # if operation is not None, we can proceed to instantiate the scala classes
        if operation:
            scalaBinaryOperation = jvm_scala_object(
                _jvm().ml.combust.mleap.core.feature.BinaryOperation,
                operation.name)

            scalaMathBinaryModel = _jvm(
            ).ml.combust.mleap.core.feature.MathBinaryModel(
                scalaBinaryOperation,
                Some(defaultA) if defaultA else ScalaNone(),
                Some(defaultB) if defaultB else ScalaNone(),
            )
            self._java_obj = self._new_java_obj(
                "org.apache.spark.ml.mleap.feature.MathBinary",
                self.uid,
                scalaMathBinaryModel,
            )

        self._setDefault()
        self.setParams(inputA=inputA, inputB=inputB, outputCol=outputCol)
Example #22
 def createFromMojo(pathToMojo, settings=H2OMOJOSettings.default()):
     # We need to make sure that Sparkling Water classes are available on the Spark driver and executor paths
     Initializer.load_sparkling_jar()
     javaModel = _jvm().ai.h2o.sparkling.ml.models.H2OMOJOPipelineModel.createFromMojo(pathToMojo,
                                                                                       settings.toJavaObject())
     return H2OMOJOPipelineModel(javaModel)
Example #23
def Some(value):
    """
    Instantiate a scala Some object. Useful when scala code takes in
    an Option[<value>]
    """
    return _jvm().scala.Some(value)
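
A hedged sketch of how Some and ScalaNone (Example #20) are combined when a Scala API expects an Option, mirroring the MathBinary constructor in Example #21:

default_a = 4.0
opt_a = Some(default_a) if default_a is not None else ScalaNone()  # scala.Some(4.0)
opt_b = ScalaNone()                                                # scala.None
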
Example #24
 def fromBytes(cls, bytes_array):
     """
     Constructs a score model from PMML in an array of bytes.
     """
     java_model = _jvm().org.pmml4s.spark.ScoreModel.fromBytes(bytes_array)
     return cls(java_model)
Example #25
 def fromString(cls, s):
     """
     Constructs a score model from PMML in a String.
     """
     java_model = _jvm().org.pmml4s.spark.ScoreModel.fromString(s)
     return cls(java_model)
Example #26
 def fromFile(cls, name):
     """
     Constructs a score model from PMML file with given pathname.
     """
     java_model = _jvm().org.pmml4s.spark.ScoreModel.fromFile(name)
     return cls(java_model)
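
A minimal usage sketch for the three constructors above (Examples #24-#26), assuming the pmml4s-spark jar is on the classpath and 'model.pmml' exists; applying the model via transform is an assumption based on pmml4s-spark exposing a Spark Transformer:

model = ScoreModel.fromFile('model.pmml')
scored = model.transform(df)  # df is an existing Spark DataFrame with the model's input columns
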
Example #27
 def _empty_java_param_map():
     """
     Returns an empty Java ParamMap reference.
     """
     return _jvm().org.apache.spark.ml.param.ParamMap()
Example #28
 def _load_java_obj(cls, java_class):
     """Load the peer Java object of the ML instance."""
     java_obj = _jvm()
     for name in java_class.split("."):
         java_obj = getattr(java_obj, name)
     return java_obj
Example #29
 def __init__(self):
     super(SimpleBinaryMetrics, self).__init__()
     self.BinaryMetrics = _jvm().ml.dhs.modelmonitor.BinaryMetrics
Example #30
 def convert(value):
     package = getattr(_jvm().ai.h2o.sparkling.ml.params, "H2OAlgoParamsHelper$")
     return package.__getattr__("MODULE$").getValidatedEnumValue(enumClass, TypeConverters.toString(value))
Example #32
 def __init__(self):
     super(SimpleSparkSerializer, self).__init__()
     self._java_obj = _jvm().ml.combust.mleap.spark.SimpleSparkSerializer()