Example #1
    def _from_java(cls, java_object):
        sc = SparkContext._active_spark_context

        bucket = _java2py(sc, java_object.bucket())
        object_path = _java2py(sc, java_object.objectPath())

        return S3DataPath(bucket, object_path)
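A quick illustration (a sketch, not one of the collected examples): `_java2py` only unwraps py4j `JavaObject`s, while primitive values such as the strings returned by `bucket()` and `objectPath()` pass straight through, which is why the wrapper above can hand them directly to `S3DataPath`.

    from pyspark import SparkContext
    from pyspark.ml.common import _java2py, _py2java

    sc = SparkContext.getOrCreate()
    value = _py2java(sc, "my-bucket")          # primitives pass through unchanged
    assert _java2py(sc, value) == "my-bucket"  # ...and come back as plain Python values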
Example #2
 def _transfer_params_from_java(self):
     """
      Transforms the embedded params from the companion Java object.
     """
     sc = SparkContext._active_spark_context
     for param in self.params:
         if self._java_obj.hasParam(param.name):
             java_param = self._java_obj.getParam(param.name)
              # SPARK-14931: Only check set params back to avoid default params mismatch.
             complex_param_class = sc._gateway.jvm.com.microsoft.azure.synapse.ml.core.serialize.ComplexParam._java_lang_class
             is_complex_param = complex_param_class.isAssignableFrom(
                 java_param.getClass())
             service_param_class = sc._gateway.jvm.org.apache.spark.ml.param.ServiceParam._java_lang_class
             is_service_param = service_param_class.isAssignableFrom(
                 java_param.getClass())
             if self._java_obj.isSet(java_param):
                 if is_complex_param:
                     value = self._java_obj.getOrDefault(java_param)
                 elif is_service_param:
                     jvObj = self._java_obj.getOrDefault(java_param)
                     if jvObj.isLeft():
                         value = _java2py(sc, jvObj.value())
                     else:
                         value = None
                 else:
                     value = _java2py(
                         sc, self._java_obj.getOrDefault(java_param))
                 self._set(**{param.name: value})
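The two `isAssignableFrom` checks above share one pattern: resolve a JVM class through the py4j gateway and test the runtime class of the Java param against it. A small hypothetical helper (an illustration, not part of the example) makes that pattern explicit:

    from pyspark import SparkContext

    def _java_param_is_instance_of(java_param, jvm_class_path):
        """Return True if java_param's runtime class is assignable to jvm_class_path."""
        sc = SparkContext._active_spark_context
        jvm_cls = sc._gateway.jvm
        for part in jvm_class_path.split("."):
            jvm_cls = getattr(jvm_cls, part)  # walk JavaPackage objects down to the JavaClass
        return jvm_cls._java_lang_class.isAssignableFrom(java_param.getClass())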
Example #3
 def meta_estimator_transfer_param_maps_from_java(pyEstimator, javaParamMaps):
     pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
     stagePairs = list(map(lambda stage: (stage, stage._to_java()), pyStages))
     sc = SparkContext._active_spark_context
     pyParamMaps = []
     for javaParamMap in javaParamMaps:
         pyParamMap = dict()
         for javaPair in javaParamMap.toList():
             javaParam = javaPair.param()
             pyParam = None
             for pyStage, javaStage in stagePairs:
                 if pyStage._testOwnParam(javaParam.parent(), javaParam.name()):
                     pyParam = pyStage.getParam(javaParam.name())
             if pyParam is None:
                 raise ValueError('Resolve param in estimatorParamMaps failed: ' +
                                  javaParam.parent() + '.' + javaParam.name())
             javaValue = javaPair.value()
             if sc._jvm.Class.forName("org.apache.spark.ml.util.DefaultParamsWritable") \
                     .isInstance(javaValue):
                 pyValue = JavaParams._from_java(javaValue)
             else:
                 pyValue = _java2py(sc, javaValue)
             pyParamMap[pyParam] = pyValue
         pyParamMaps.append(pyParamMap)
     return pyParamMaps
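A hypothetical call-site sketch for the helper above (the names `py_estimator` and `java_cv` are assumptions): this is the kind of function a tuning wrapper would call while deserializing, pairing the Python estimator's nested stages with the Java-side param maps.

    # java_cv is a py4j handle to a Java CrossValidator (or TrainValidationSplit).
    py_param_maps = meta_estimator_transfer_param_maps_from_java(
        py_estimator, java_cv.getEstimatorParamMaps())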
Example #4
def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper.  Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
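A hypothetical wrapper showing how a helper like `_call_java` is typically used (the class and its `summary` method are assumptions, not from the source):

    from pyspark import SparkContext

    class MyJavaModelWrapper:
        """Thin Python wrapper around a companion py4j Java object."""

        def __init__(self, java_obj):
            self._java_obj = java_obj

        def summary(self):
            sc = SparkContext._active_spark_context
            # Delegate to the Java object and convert the result back to Python.
            return _call_java(sc, self._java_obj, "summary")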
Example #5
    def _from_java(cls, java_stage):
        """
        Given a Java CrossValidatorModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        sc = SparkContext._active_spark_context
        bestModel = JavaParams._from_java(java_stage.bestModel())
        avgMetrics = _java2py(sc, java_stage.avgMetrics())
        estimator, epms, evaluator = super(CrossValidatorModel,
                                           cls)._from_java_impl(java_stage)

        py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics)
        params = {
            "evaluator": evaluator,
            "estimator": estimator,
            "estimatorParamMaps": epms,
            "numFolds": java_stage.getNumFolds(),
            "foldCol": java_stage.getFoldCol(),
            "seed": java_stage.getSeed(),
        }
        for param_name, param_val in params.items():
            py_stage = py_stage._set(**{param_name: param_val})

        if java_stage.hasSubModels():
            py_stage.subModels = [[
                JavaParams._from_java(sub_model)
                for sub_model in fold_sub_models
            ] for fold_sub_models in java_stage.subModels()]

        py_stage._resetUid(java_stage.uid())
        return py_stage
Example #6
def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper.  Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
Example #7
 def meta_estimator_transfer_param_maps_from_java(pyEstimator,
                                                  javaParamMaps):
     pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
     stagePairs = list(
         map(lambda stage: (stage, stage._to_java()), pyStages))
     sc = SparkContext._active_spark_context
     pyParamMaps = []
     for javaParamMap in javaParamMaps:
         pyParamMap = dict()
         for javaPair in javaParamMap.toList():
             javaParam = javaPair.param()
             pyParam = None
             for pyStage, javaStage in stagePairs:
                 if pyStage._testOwnParam(javaParam.parent(),
                                          javaParam.name()):
                     pyParam = pyStage.getParam(javaParam.name())
             if pyParam is None:
                 raise ValueError(
                     'Resolve param in estimatorParamMaps failed: ' +
                     javaParam.parent() + '.' + javaParam.name())
             javaValue = javaPair.value()
             if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage"
                                      ).isInstance(javaValue):
                 # Note: JavaParams._from_java support both JavaEstimator/JavaTransformer class
                 # and Estimator/Transformer class which implements `_from_java` static method
                 # (such as OneVsRest, Pipeline class).
                 pyValue = JavaParams._from_java(javaValue)
             else:
                 pyValue = _java2py(sc, javaValue)
             pyParamMap[pyParam] = pyValue
         pyParamMaps.append(pyParamMap)
     return pyParamMaps
Example #8
    def test(dataset, featuresCol, labelCol):
        """
        Perform a Pearson's independence test using dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
        return _java2py(sc, javaTestObj.test(*args))
Example #9
 def _transfer_params_from_java(self):
     """
     Transforms the embedded params from the companion Java object.
     """
     sc = SparkContext._active_spark_context
     for param in self.params:
         if self._java_obj.hasParam(param.name):
             java_param = self._java_obj.getParam(param.name)
             # SPARK-14931: Only check set params back to avoid default params mismatch.
             if self._java_obj.isSet(java_param):
                 value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                 self._set(**{param.name: value})
             # SPARK-10931: Temporary fix for params that have a default in Java
             if self._java_obj.hasDefault(java_param) and not self.isDefined(param):
                 value = _java2py(sc, self._java_obj.getDefault(java_param)).get()
                 self._setDefault(**{param.name: value})
Example #10
File: tuning.py Project: zzl0/spark
    def _from_java(cls, java_stage):
        """
        Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """

        # Load information from java_stage to the instance.
        sc = SparkContext._active_spark_context
        bestModel = JavaParams._from_java(java_stage.bestModel())
        validationMetrics = _java2py(sc, java_stage.validationMetrics())
        estimator, epms, evaluator = super(TrainValidationSplitModel,
                                           cls)._from_java_impl(java_stage)
        # Create a new instance of this stage.
        py_stage = cls(
            bestModel=bestModel,
            validationMetrics=validationMetrics)._set(estimator=estimator)
        py_stage = py_stage._set(estimatorParamMaps=epms)._set(
            evaluator=evaluator)

        if java_stage.hasSubModels():
            py_stage.subModels = [
                JavaParams._from_java(sub_model)
                for sub_model in java_stage.subModels()
            ]

        py_stage._resetUid(java_stage.uid())
        return py_stage
Example #11
 def _transfer_params_from_java(self):
     """
     Transforms the embedded params from the companion Java object.
     """
     sc = SparkContext._active_spark_context
     for param in self.params:
         if self._java_obj.hasParam(param.name):
             java_param = self._java_obj.getParam(param.name)
             # SPARK-14931: Only check set params back to avoid default params mismatch.
             if self._java_obj.isSet(java_param):
                 value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                 self._set(**{param.name: value})
             # SPARK-10931: Temporary fix for params that have a default in Java
             if self._java_obj.hasDefault(java_param) and not self.isDefined(param):
                 value = _java2py(sc, self._java_obj.getDefault(java_param)).get()
                 self._setDefault(**{param.name: value})
Example #12
    def _from_java(cls, java_stage):
        """
        Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """

        # Load information from java_stage to the instance.
        sc = SparkContext._active_spark_context
        bestModel = JavaParams._from_java(java_stage.bestModel())
        validationMetrics = _java2py(sc, java_stage.validationMetrics())
        estimator, epms, evaluator = super(TrainValidationSplitModel,
                                           cls)._from_java_impl(java_stage)
        # Create a new instance of this stage.
        py_stage = cls(bestModel=bestModel,
                       validationMetrics=validationMetrics)
        params = {
            "evaluator": evaluator,
            "estimator": estimator,
            "estimatorParamMaps": epms,
            "trainRatio": java_stage.getTrainRatio(),
            "seed": java_stage.getSeed(),
        }
        for param_name, param_val in params.items():
            py_stage = py_stage._set(**{param_name: param_val})

        if java_stage.hasSubModels():
            py_stage.subModels = [
                JavaParams._from_java(sub_model)
                for sub_model in java_stage.subModels()
            ]

        py_stage._resetUid(java_stage.uid())
        return py_stage
Example #13
    def _call_java(self, name: str, *args: Any) -> Any:
        m = getattr(self._java_obj, name)
        sc = SparkContext._active_spark_context
        assert sc is not None

        java_args = [_py2java(sc, arg) for arg in args]
        return _java2py(sc, m(*java_args))
Example #14
    def test(dataset, featuresCol, labelCol):
        """
        Perform a Pearson's independence test using dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
        return _java2py(sc, javaTestObj.test(*args))
Example #15
File: stat.py Project: LY3918/spark
 def corr(dataset, column, method="pearson"):
     """
     Compute the correlation matrix with specified method using dataset.
     """
     sc = SparkContext._active_spark_context
     javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
     args = [_py2java(sc, arg) for arg in (dataset, column, method)]
     return _java2py(sc, javaCorrObj.corr(*args))
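A hedged usage sketch for the `corr` helper above, mirroring the pyspark.ml.stat.Correlation doctests (an active SparkSession named `spark` is assumed):

    from pyspark.ml.linalg import Vectors

    df = spark.createDataFrame([(Vectors.dense([1.0, 0.0, -2.0]),),
                                (Vectors.dense([4.0, 5.0, 3.0]),),
                                (Vectors.dense([6.0, 7.0, 8.0]),)], ["features"])
    corr(df, "features", "pearson").show()  # one row, one column named "pearson(features)"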
Example #16
 def test(dataset, featuresCol, labelCol):
     """
     Perform a Pearson's independence test using dataset.
     """
     sc = SparkContext._active_spark_context
     javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
     args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
     return _java2py(sc, javaTestObj.test(*args))
Example #17
 def corr(dataset, column, method="pearson"):
     """
     Compute the correlation matrix with specified method using dataset.
     """
     sc = SparkContext._active_spark_context
     javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
     args = [_py2java(sc, arg) for arg in (dataset, column, method)]
     return _java2py(sc, javaCorrObj.corr(*args))
Example #18
File: stat.py Project: LY3918/spark
 def test(dataset, featuresCol, labelCol):
     """
     Perform a Pearson's independence test using dataset.
     """
     sc = SparkContext._active_spark_context
     javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
     args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
     return _java2py(sc, javaTestObj.test(*args))
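A hedged usage sketch for the `test` helper above, following the doctest shown in Examples #8 and #14 (an active SparkSession named `spark` is assumed):

    from pyspark.ml.linalg import Vectors

    df = spark.createDataFrame([(0, Vectors.dense([0, 0, 1])),
                                (1, Vectors.dense([2, 1, 1]))], ["label", "features"])
    test(df, "features", "label").select("pValues", "degreesOfFreedom").show()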
Example #19
    def test(dataset: DataFrame, sampleCol: str, distName: str,
             *params: float) -> DataFrame:
        """
        Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
        equality. Currently supports the normal distribution, taking as parameters the mean and
        standard deviation.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            a Dataset or a DataFrame containing the sample of data to test.
        sampleCol : str
            Name of sample column in dataset, of any numerical type.
        distName : str
            a `string` name for a theoretical distribution; currently only "norm" is supported.
        params : float
            a list of `float` values specifying the parameters to be used for the theoretical
            distribution. For "norm" distribution, the parameters include mean and standard deviation.

        Returns
        -------
        A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
        This DataFrame will contain a single Row with the following fields:

        - `pValue: Double`
        - `statistic: Double`

        Examples
        --------
        >>> from pyspark.ml.stat import KolmogorovSmirnovTest
        >>> dataset = [[-1.0], [0.0], [1.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        >>> dataset = [[2.0], [3.0], [4.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
        dataset = _py2java(sc, dataset)
        params = [float(param) for param in params]  # type: ignore[assignment]
        return _java2py(
            sc,
            javaTestObj.test(dataset, sampleCol, distName,
                             _jvm().PythonUtils.toSeq(params)))
Example #20
 def _transfer_param_map_from_java(self, javaParamMap):
     """
     Transforms a Java ParamMap into a Python ParamMap.
     """
     sc = SparkContext._active_spark_context
     paramMap = dict()
     for pair in javaParamMap.toList():
         param = pair.param()
         if self.hasParam(str(param.name())):
             paramMap[self.getParam(param.name())] = _java2py(sc, pair.value())
     return paramMap
Example #21
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform an F Regression test using dataset.

        :param dataset:
          DataFrame of continuous labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `fValue: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import FValueTest
        >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                        0.51418671, 0.61632374, 0.96565515])],
        ...            [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                        0.59784822, 0.12394819, 0.53783355])],
        ...            [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                        0.40492506, 0.18957493, 0.5440016])],
        ...            [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                        0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> fValueResult = FValueTest.test(dataset, 'features', 'label')
        >>> row = fValueResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539])
        >>> row[0].pValues
        DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838])
        >>> fValueResult = FValueTest.test(dataset, 'features', 'label', True)
        >>> row = fValueResult.orderBy("featureIndex").collect()
        >>> row[0].fValue
        3.7409548308350593
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.FValueTest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #22
 def _transfer_param_map_from_java(self, javaParamMap):
     """
     Transforms a Java ParamMap into a Python ParamMap.
     """
     sc = SparkContext._active_spark_context
     paramMap = dict()
     for pair in javaParamMap.toList():
         param = pair.param()
         if self.hasParam(str(param.name())):
             paramMap[self.getParam(param.name())] = _java2py(sc, pair.value())
     return paramMap
Example #23
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform an ANOVA test using dataset.

        :param dataset:
          DataFrame of categorical labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `fValue: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ANOVATest
        >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                 0.51418671, 0.61632374, 0.96565515])],
        ...            [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                 0.59784822, 0.12394819, 0.53783355])],
        ...            [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                 0.40492506, 0.18957493, 0.5440016])],
        ...            [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                 0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> anovaResult = ANOVATest.test(dataset, 'features', 'label')
        >>> row = anovaResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512])
        >>> row[0].pValues
        DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029])
        >>> anovaResult = ANOVATest.test(dataset, 'features', 'label', True)
        >>> row = anovaResult.orderBy("featureIndex").collect()
        >>> row[0].fValue
        4.026438671875297
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ANOVATest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #24
    def corr(dataset: DataFrame,
             column: str,
             method: str = "pearson") -> DataFrame:
        """
        Compute the correlation matrix with specified method using dataset.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            A DataFrame.
        column : str
            The name of the column of vectors for which the correlation coefficient needs
            to be computed. This must be a column of the dataset, and it must contain
            Vector objects.
        method : str, optional
            String specifying the method to use for computing correlation.
            Supported: `pearson` (default), `spearman`.

        Returns
        -------
        A DataFrame that contains the correlation matrix of the column of vectors. This
        DataFrame contains a single row and a single column of name `METHODNAME(COLUMN)`.

        Examples
        --------
        >>> from pyspark.ml.linalg import DenseMatrix, Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #25
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform a Pearson's independence test using dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `statistic: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        .. versionchanged:: 3.1.0
           Added optional ``flatten`` argument.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label', True)
        >>> row = chiSqResult.orderBy("featureIndex").collect()
        >>> row[0].statistic
        4.0
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #26
    def _transfer_params_from_java(self) -> None:
        """
        Transforms the embedded params from the companion Java object.
        """
        sc = SparkContext._active_spark_context
        assert sc is not None and self._java_obj is not None

        for param in self.params:
            if self._java_obj.hasParam(param.name):
                java_param = self._java_obj.getParam(param.name)
                # SPARK-14931: Only check set params back to avoid default params mismatch.
                if self._java_obj.isSet(java_param):
                    java_value = self._java_obj.getOrDefault(java_param)
                    if param.typeConverter.__name__.startswith("toList"):
                        value = [_java2py(sc, x) for x in list(java_value)]
                    else:
                        value = _java2py(sc, java_value)
                    self._set(**{param.name: value})
                # SPARK-10931: Temporary fix for params that have a default in Java
                if self._java_obj.hasDefault(
                        java_param) and not self.isDefined(param):
                    value = _java2py(
                        sc, self._java_obj.getDefault(java_param)).get()
                    self._setDefault(**{param.name: value})
Example #27
    def test(dataset, sampleCol, distName, *params):
        """
        Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
        equality. Currently supports the normal distribution, taking as parameters the mean and
        standard deviation.

        :param dataset:
          a Dataset or a DataFrame containing the sample of data to test.
        :param sampleCol:
          Name of sample column in dataset, of any numerical type.
        :param distName:
          a `string` name for a theoretical distribution; currently only "norm" is supported.
        :param params:
          a list of `Double` values specifying the parameters to be used for the theoretical
          distribution. For "norm" distribution, the parameters include mean and standard deviation.
        :return:
          A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
          This DataFrame will contain a single Row with the following fields:
          - `pValue: Double`
          - `statistic: Double`

        >>> from pyspark.ml.stat import KolmogorovSmirnovTest
        >>> dataset = [[-1.0], [0.0], [1.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        >>> dataset = [[2.0], [3.0], [4.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
        dataset = _py2java(sc, dataset)
        params = [float(param) for param in params]
        return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName,
                                             _jvm().PythonUtils.toSeq(params)))
Example #28
 def _transfer_param_map_from_java(self, javaParamMap):
     """
     Transforms a Java ParamMap into a Python ParamMap.
     """
     sc = SparkContext._active_spark_context
     paramMap = dict()
     for pair in javaParamMap.toList():
         param = pair.param()
         if self.hasParam(str(param.name())):
             java_obj = pair.value()
             if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(java_obj):
                 # Note: JavaParams._from_java support both JavaEstimator/JavaTransformer class
                 # and Estimator/Transformer class which implements `_from_java` static method
                 # (such as OneVsRest, Pipeline class).
                 py_obj = JavaParams._from_java(java_obj)
             else:
                 py_obj = _java2py(sc, java_obj)
             paramMap[self.getParam(param.name())] = py_obj
     return paramMap
Example #29
 def _transfer_params_from_java(self):
     """
     Transforms the embedded params from the companion Java object.
     """
     sc = SparkContext._active_spark_context
     for param in self.params:
         if self._java_obj.hasParam(param.name):
             java_param = self._java_obj.getParam(param.name)
             # SPARK-14931: Only check set params back to avoid default params mismatch.
             complex_param_class = sc._gateway.jvm.org.apache.spark.ml.param.ComplexParam._java_lang_class
             is_complex_param = complex_param_class.isAssignableFrom(
                 java_param.getClass())
             if self._java_obj.isSet(java_param):
                 if is_complex_param:
                     value = self._java_obj.getOrDefault(java_param)
                 else:
                     value = _java2py(
                         sc, self._java_obj.getOrDefault(java_param))
                 self._set(**{param.name: value})
Example #30
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with specified method using dataset.

        :param dataset:
          A Dataset or a DataFrame.
        :param column:
          The name of the column of vectors for which the correlation coefficient needs
          to be computed. This must be a column of the dataset, and it must contain
          Vector objects.
        :param method:
          String specifying the method to use for computing correlation.
          Supported: `pearson` (default), `spearman`.
        :return:
          A DataFrame that contains the correlation matrix of the column of vectors. This
          DataFrame contains a single row and a single column of name
          '$METHODNAME($COLUMN)'.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #31
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with specified method using dataset.

        :param dataset:
          A Dataset or a DataFrame.
        :param column:
          The name of the column of vectors for which the correlation coefficient needs
          to be computed. This must be a column of the dataset, and it must contain
          Vector objects.
        :param method:
          String specifying the method to use for computing correlation.
          Supported: `pearson` (default), `spearman`.
        :return:
          A DataFrame that contains the correlation matrix of the column of vectors. This
          DataFrame contains a single row and a single column of name
          '$METHODNAME($COLUMN)'.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #32
    def _from_java(cls, java_stage):
        """
        Given a Java CrossValidatorModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """
        sc = SparkContext._active_spark_context
        bestModel = JavaParams._from_java(java_stage.bestModel())
        avgMetrics = _java2py(sc, java_stage.avgMetrics())
        estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)

        py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics).setEstimator(estimator)
        py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)

        if java_stage.hasSubModels():
            py_stage.subModels = [[JavaParams._from_java(sub_model)
                                   for sub_model in fold_sub_models]
                                  for fold_sub_models in java_stage.subModels()]

        py_stage._resetUid(java_stage.uid())
        return py_stage
Example #33
    def _from_java(cls, java_object):

        # primitives and spark data types are converted automatically by
        # _java2py(), in those cases there is nothing to do
        if type(java_object) != py4j.java_gateway.JavaObject:
            return java_object

        # construct a mapping of our python wrapped classes to
        # java/scala classes
        wrapped_classes = {}
        for cls in SageMakerJavaWrapper.__subclasses__():
            wrapped_classes[cls._wrapped_class] = cls

        class_name = java_object.getClass().getName()

        # SageMakerJavaWrapper classes know how to convert themselves from a Java Object
        # otherwise hand over to _java2py and hope for the best.
        if class_name in wrapped_classes:
            return wrapped_classes[class_name]._from_java(java_object)
        elif class_name.startswith("scala.None"):
            return None
        else:
            sc = SparkContext._active_spark_context
            return _java2py(sc, java_object)
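A hypothetical subclass (class name and JVM path are assumptions, for illustration only) showing how the `_wrapped_class` lookup above routes a Java object back to its Python wrapper:

    class ExampleWrapper(SageMakerJavaWrapper):
        # Fully qualified name of the companion Scala/Java class (assumed here).
        _wrapped_class = "com.example.sagemaker.ExampleConfig"

        @classmethod
        def _from_java(cls, java_object):
            # Rebuild the Python-side object from the Java object's accessors here.
            return cls()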
Example #34
from pyspark.sql import SparkSession
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session

#  ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit --driver-class-path ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Exercise4_06/Exercise4_06.py
if __name__ == "__main__":
    session: SparkSession = create_session(2, "PySpark <> JVM")
    session.sparkContext.setLogLevel('ERROR')
    python_rdd = session.sparkContext.range(0, 5)

    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(
        python_rdd._jrdd, True)
    mapped_java_rdd = session.sparkContext._jvm.Exercise4_06.ScalaObject.executeInScala(
        java_rdd)
    mapped_python_rdd = _java2py(session.sparkContext, mapped_java_rdd)
    print(mapped_python_rdd.collect())
Example #35
 def _call_java(self, name, *args):
     m = getattr(self._java_obj, name)
     sc = SparkContext._active_spark_context
     java_args = [_py2java(sc, arg) for arg in args]
     return _java2py(sc, m(*java_args))
Example #36
 def _call_java(self, name, *args):
     m = getattr(self._java_obj, name)
     sc = SparkContext._active_spark_context
     java_args = [_py2java(sc, arg) for arg in args]
     return _java2py(sc, m(*java_args))
Example #37
from sys import argv
from pyspark.ml.common import _java2py
from Chapter01.utilities01_py.helper_python import create_session
from Chapter02.utilities02_py.helper_python import sample_warc_loc, extract_raw_records, parse_raw_warc

#  ~/spark-2.4.6-bin-hadoop2.7/bin/spark-submit  --driver-class-path  ~/IdeaProjects/The-Spark-Workshop/target/packt-uber-jar.jar:/Users/a/.m2/repository/com/google/guava/guava/28.2-jre/guava-28.2-jre.jar:/Users/a/.m2/repository/org/apache/commons/commons-compress/1.20/commons-compress-1.20.jar ~/IdeaProjects/The-Spark-Workshop/Chapter04/Activity4_03/Activity4_03.py ~/Output_Act4_3
if __name__ == "__main__":
    output_dir = argv[1]
    session = create_session(3, 'WARC Parser')

    warc_records = extract_raw_records(sample_warc_loc, session) \
        .flatMap(lambda record: parse_raw_warc(record)) \
        .filter(lambda record: record.warc_type == 'response')

    plaintexts_rdd = warc_records.map(lambda record: record.html_source)
    java_rdd = session.sparkContext._jvm.SerDe.pythonToJava(
        plaintexts_rdd._jrdd, True)
    tagged_java_rdd = session.sparkContext._jvm.Activity4_03.Activity4_03.tagJavaRDD(
        java_rdd)
    tagged_python_rdd = _java2py(session.sparkContext, tagged_java_rdd)

    tagged_python_rdd.saveAsTextFile(output_dir)
Example #38
    def feature_aggregated_shap(self, input_cols):

        return _java2py(self._sc,
                        self._shapley_model.getShapValuesFromModel(input_cols))
Example #39
    def calculate(self):

        return _java2py(self._sc, self._shapley_model.calculate())
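A minimal sketch (class name and constructor are assumptions) of how a wrapper like the ones in Examples #38 and #39 might hold the SparkContext and the JVM model it delegates to:

    from pyspark.ml.common import _java2py

    class ShapleyModelWrapper:
        def __init__(self, sc, java_shapley_model):
            self._sc = sc                             # active SparkContext
            self._shapley_model = java_shapley_model  # py4j handle to the JVM model

        def calculate(self):
            # Run the computation on the JVM and convert the result back to Python.
            return _java2py(self._sc, self._shapley_model.calculate())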