Example #1
    def test(dataset: DataFrame, sampleCol: str, distName: str,
             *params: float) -> DataFrame:
        """
        Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
        equality. Currently supports the normal distribution, taking as parameters the mean and
        standard deviation.

        .. versionadded:: 2.4.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            a Dataset or a DataFrame containing the sample of data to test.
        sampleCol : str
            Name of sample column in dataset, of any numerical type.
        distName : str
            a `string` name for a theoretical distribution; currently only "norm" is supported.
        params : float
            a list of `float` values specifying the parameters to be used for the theoretical
            distribution. For the "norm" distribution, the parameters are the mean and standard deviation.

        Returns
        -------
        A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
        This DataFrame will contain a single Row with the following fields:

        - `pValue: Double`
        - `statistic: Double`

        Examples
        --------
        >>> from pyspark.ml.stat import KolmogorovSmirnovTest
        >>> dataset = [[-1.0], [0.0], [1.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        >>> dataset = [[2.0], [3.0], [4.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
        dataset = _py2java(sc, dataset)
        params = [float(param) for param in params]  # type: ignore[assignment]
        return _java2py(
            sc,
            javaTestObj.test(dataset, sampleCol, distName,
                             _jvm().PythonUtils.toSeq(params)))
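All of these snippets assume the module-level imports of pyspark.ml.stat rather than importing the bridging helpers inline. A minimal sketch of what they rely on (these are internal helpers, so the exact module paths may vary across Spark versions):

    from pyspark import SparkContext
    from pyspark.sql import DataFrame
    from pyspark.ml.common import _java2py, _py2java   # convert objects between Python and the JVM
    from pyspark.ml.wrapper import _jvm                 # py4j view of the running JVM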
Example #2
    def test(dataset, featuresCol, labelCol):
        """
        Perform Pearson's independence test using the given dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
        return _java2py(sc, javaTestObj.test(*args))
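The returned DataFrame holds a single Row, so its fields can be pulled straight to the driver; a short usage sketch continuing the doctest above:

    result = ChiSquareTest.test(dataset, "features", "label").head()
    result.pValues       # one p-value per feature (Vector)
    result.statistics    # one chi-squared statistic per feature (Vector)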
Example #3
    def test(dataset, featuresCol, labelCol):
        """
        Perform Pearson's independence test using the given dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
        return _java2py(sc, javaTestObj.test(*args))
Example #4
 def test(dataset, featuresCol, labelCol):
     """
     Perform Pearson's independence test using the given dataset.
     """
     sc = SparkContext._active_spark_context
     javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
     args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
     return _java2py(sc, javaTestObj.test(*args))
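This variant drops the parameter documentation and doctest; a minimal usage sketch of the same API (assumes an active SparkSession named `spark`):

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.stat import ChiSquareTest

    df = spark.createDataFrame(
        [(0.0, Vectors.dense(0.5, 10.0)),
         (1.0, Vectors.dense(1.5, 20.0))],
        ["label", "features"])
    ChiSquareTest.test(df, "features", "label").show(truncate=False)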
Example #5
 def corr(dataset, column, method="pearson"):
     """
     Compute the correlation matrix with the specified method using the given dataset.
     """
     sc = SparkContext._active_spark_context
     javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
     args = [_py2java(sc, arg) for arg in (dataset, column, method)]
     return _java2py(sc, javaCorrObj.corr(*args))
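Again only the one-line docstring survives here; a minimal usage sketch (assumes an active SparkSession named `spark`):

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.stat import Correlation

    df = spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0),),
         (Vectors.dense(3.0, 1.0),),
         (Vectors.dense(5.0, 8.0),)],
        ["features"])
    Correlation.corr(df, "features", "spearman").head()[0]   # DenseMatrix of rank correlations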
Example #6
 def test(dataset, featuresCol, labelCol):
     """
     Perform Pearson's independence test using the given dataset.
     """
     sc = SparkContext._active_spark_context
     javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
     args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
     return _java2py(sc, javaTestObj.test(*args))
Example #7
 def corr(dataset, column, method="pearson"):
     """
     Compute the correlation matrix with the specified method using the given dataset.
     """
     sc = SparkContext._active_spark_context
     javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
     args = [_py2java(sc, arg) for arg in (dataset, column, method)]
     return _java2py(sc, javaCorrObj.corr(*args))
Example #8
    def test(dataset, sampleCol, distName, *params):
        """
        Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
        equality. Currently supports the normal distribution, taking as parameters the mean and
        standard deviation.

        :param dataset:
          a Dataset or a DataFrame containing the sample of data to test.
        :param sampleCol:
          Name of sample column in dataset, of any numerical type.
        :param distName:
          a `string` name for a theoretical distribution; currently only "norm" is supported.
        :param params:
          a list of `Double` values specifying the parameters to be used for the theoretical
          distribution. For the "norm" distribution, the parameters are the mean and standard deviation.
        :return:
          A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
          This DataFrame will contain a single Row with the following fields:
          - `pValue: Double`
          - `statistic: Double`

        >>> from pyspark.ml.stat import KolmogorovSmirnovTest
        >>> dataset = [[-1.0], [0.0], [1.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        >>> dataset = [[2.0], [3.0], [4.0]]
        >>> dataset = spark.createDataFrame(dataset, ['sample'])
        >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
        >>> round(ksResult.pValue, 3)
        1.0
        >>> round(ksResult.statistic, 3)
        0.175
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
        dataset = _py2java(sc, dataset)
        params = [float(param) for param in params]
        return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName,
                                             _jvm().PythonUtils.toSeq(params)))
Example #9
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform an F Regression test using the given dataset.

        :param dataset:
          DataFrame of continuous labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `fValue: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import FValueTest
        >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                        0.51418671, 0.61632374, 0.96565515])],
        ...            [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                        0.59784822, 0.12394819, 0.53783355])],
        ...            [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                        0.40492506, 0.18957493, 0.5440016])],
        ...            [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                        0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> fValueResult = FValueTest.test(dataset, 'features', 'label')
        >>> row = fValueResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539])
        >>> row[0].pValues
        DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838])
        >>> fValueResult = FValueTest.test(dataset, 'features', 'label', True)
        >>> row = fValueResult.orderBy("featureIndex").collect()
        >>> row[0].fValue
        3.7409548308350593
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.FValueTest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #10
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform an ANOVA test using the given dataset.

        :param dataset:
          DataFrame of categorical labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `fValue: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ANOVATest
        >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                 0.51418671, 0.61632374, 0.96565515])],
        ...            [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                 0.59784822, 0.12394819, 0.53783355])],
        ...            [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                 0.40492506, 0.18957493, 0.5440016])],
        ...            [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                 0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> anovaResult = ANOVATest.test(dataset, 'features', 'label')
        >>> row = anovaResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512])
        >>> row[0].pValues
        DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029])
        >>> anovaResult = ANOVATest.test(dataset, 'features', 'label', True)
        >>> row = anovaResult.orderBy("featureIndex").collect()
        >>> row[0].fValue
        4.026438671875297
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ANOVATest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
Example #11
    def corr(dataset: DataFrame,
             column: str,
             method: str = "pearson") -> DataFrame:
        """
        Compute the correlation matrix with the specified method using the given dataset.

        .. versionadded:: 2.2.0

        Parameters
        ----------
        dataset : :py:class:`pyspark.sql.DataFrame`
            A DataFrame.
        column : str
            The name of the column of vectors for which the correlation coefficient needs
            to be computed. This must be a column of the dataset, and it must contain
            Vector objects.
        method : str, optional
            String specifying the method to use for computing correlation.
            Supported: `pearson` (default), `spearman`.

        Returns
        -------
        A DataFrame that contains the correlation matrix of the column of vectors. This
        DataFrame contains a single row and a single column of name `METHODNAME(COLUMN)`.

        Examples
        --------
        >>> from pyspark.ml.linalg import DenseMatrix, Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        assert sc is not None

        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
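The single cell holds a DenseMatrix, which is often easier to work with as a NumPy array on the driver; a short sketch continuing the doctest above:

    corr_matrix = Correlation.corr(dataset, "features").head()[0]   # method defaults to "pearson"
    corr_matrix.toArray()                                           # correlation matrix as a numpy.ndarray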
Example #12
    def test(dataset, featuresCol, labelCol, flatten=False):
        """
        Perform Pearson's independence test using the given dataset.

        :param dataset:
          DataFrame of categorical labels and categorical features.
          Real-valued features will be treated as categorical for each distinct value.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :param flatten: if True, flattens the returned dataframe.
        :return:
          DataFrame containing the test result for every feature against the label.
          If flatten is True, this DataFrame will contain one row per feature with the following
          fields:
          - `featureIndex: int`
          - `pValue: float`
          - `degreesOfFreedom: int`
          - `statistic: float`
          If flatten is False, this DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[int]`
          - `statistics: Vector`
          Each of these fields has one value per feature.

        .. versionchanged:: 3.1.0
           Added optional ``flatten`` argument.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ChiSquareTest
        >>> dataset = [[0, Vectors.dense([0, 0, 1])],
        ...            [0, Vectors.dense([1, 0, 1])],
        ...            [1, Vectors.dense([2, 1, 1])],
        ...            [1, Vectors.dense([3, 1, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
        >>> chiSqResult.select("degreesOfFreedom").collect()[0]
        Row(degreesOfFreedom=[3, 1, 0])
        >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label', True)
        >>> row = chiSqResult.orderBy("featureIndex").collect()
        >>> row[0].statistic
        4.0
        """
        sc = SparkContext._active_spark_context
        javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
        args = [
            _py2java(sc, arg)
            for arg in (dataset, featuresCol, labelCol, flatten)
        ]
        return _java2py(sc, javaTestObj.test(*args))
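With flatten=True the result has one row per feature, which makes it convenient to inspect locally; a short sketch (assumes pandas is installed on the driver):

    flat = ChiSquareTest.test(dataset, "features", "label", True)
    flat.orderBy("featureIndex").toPandas()   # columns: featureIndex, pValue, degreesOfFreedom, statistic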
Example #13
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with the specified method using the given dataset.

        :param dataset:
          A Dataset or a DataFrame.
        :param column:
          The name of the column of vectors for which the correlation coefficient needs
          to be computed. This must be a column of the dataset, and it must contain
          Vector objects.
        :param method:
          String specifying the method to use for computing correlation.
          Supported: `pearson` (default), `spearman`.
        :return:
          A DataFrame that contains the correlation matrix of the column of vectors. This
          DataFrame contains a single row and a single column of name
          '$METHODNAME($COLUMN)'.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #14
    def corr(dataset, column, method="pearson"):
        """
        Compute the correlation matrix with the specified method using the given dataset.

        :param dataset:
          A Dataset or a DataFrame.
        :param column:
          The name of the column of vectors for which the correlation coefficient needs
          to be computed. This must be a column of the dataset, and it must contain
          Vector objects.
        :param method:
          String specifying the method to use for computing correlation.
          Supported: `pearson` (default), `spearman`.
        :return:
          A DataFrame that contains the correlation matrix of the column of vectors. This
          DataFrame contains a single row and a single column of name
          '$METHODNAME($COLUMN)'.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import Correlation
        >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
        ...            [Vectors.dense([4, 5, 0, 3])],
        ...            [Vectors.dense([6, 7, 0, 8])],
        ...            [Vectors.dense([9, 0, 0, 1])]]
        >>> dataset = spark.createDataFrame(dataset, ['features'])
        >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
        >>> print(str(pearsonCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                     [ 0.0556...,  1.        ,         NaN,  0.9135...],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4004...,  0.9135...,         NaN,  1.        ]])
        >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
        >>> print(str(spearmanCorr).replace('nan', 'NaN'))
        DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                     [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                     [        NaN,         NaN,  1.        ,         NaN],
                     [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
        """
        sc = SparkContext._active_spark_context
        javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
        args = [_py2java(sc, arg) for arg in (dataset, column, method)]
        return _java2py(sc, javaCorrObj.corr(*args))
Example #15
 def _load_java_obj(cls, java_class):
     """Load the peer Java object of the ML instance."""
     java_obj = _jvm()
     for name in java_class.split("."):
         java_obj = getattr(java_obj, name)
     return java_obj
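Each dotted component of the class name is resolved with getattr against the py4j JVM view; a minimal sketch of the equivalent lookup done by hand (assumes an active SparkContext):

    java_obj = _jvm()
    for name in "org.apache.spark.ml.stat.ChiSquareTest".split("."):
        java_obj = getattr(java_obj, name)
    # java_obj now refers to the JVM-side ChiSquareTest object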
Example #16
def dataframeToLocalFiles(df, localPath):
    """Hand ``df`` to the project's JVM-side ``Util.dataframeToLocalFile`` helper and wrap the result."""
    # Resolve the JVM-side utility method through the py4j gateway.
    javaMethod = _jvm().xuwch.sparkmpi.demo1.Util.dataframeToLocalFile
    # Pass the underlying Java DataFrame and the target local path to the helper.
    jdf = javaMethod(df._jdf, localPath)
    # Wrap the returned Java DataFrame back into a PySpark DataFrame.
    return DataFrame(jdf, df.sql_ctx)
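A usage sketch (the output path is hypothetical, and the xuwch.sparkmpi helper class must be available on the driver's classpath):

    local_df = dataframeToLocalFiles(df, "/tmp/demo1-output")   # hypothetical local path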