def putOption(self, pipelineStage, key, value):
    javaKey = _py2java(self.sc, key)
    javaValue = _py2java(self.sc, value)
    if pipelineStage is None:
        self.javaPmmlBuilder.putOption(javaKey, javaValue)
    else:
        javaPipelineStage = pipelineStage._to_java()
        self.javaPmmlBuilder.putOption(javaPipelineStage, javaKey, javaValue)
    return self
def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper. Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
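# Usage sketch (an illustrative assumption, not part of the original snippets):
# _call_java can invoke any JVM-side method by name on the Java object backing a
# fitted PySpark model. "toString" is used because every Java object defines it;
# the model and training data below are made up for the example.
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
train = spark.createDataFrame(
    [(0.0, Vectors.dense([0.0, 1.0])), (1.0, Vectors.dense([1.0, 0.0]))],
    ["label", "features"])
lr_model = LogisticRegression(maxIter=5).fit(train)
# lr_model._java_obj is the underlying Java LogisticRegressionModel.
print(_call_java(spark.sparkContext, lr_model._java_obj, "toString"))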
def test(dataset, featuresCol, labelCol):
    """
    Perform a Pearson's independence test using dataset.

    :param dataset:
      DataFrame of categorical labels and categorical features.
      Real-valued features will be treated as categorical for each distinct value.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :return:
      DataFrame containing the test result for every feature against the label.
      This DataFrame will contain a single Row with the following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[Int]`
      - `statistics: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ChiSquareTest
    >>> dataset = [[0, Vectors.dense([0, 0, 1])],
    ...            [0, Vectors.dense([1, 0, 1])],
    ...            [1, Vectors.dense([2, 1, 1])],
    ...            [1, Vectors.dense([3, 1, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
    >>> chiSqResult.select("degreesOfFreedom").collect()[0]
    Row(degreesOfFreedom=[3, 1, 0])
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
    return _java2py(sc, javaTestObj.test(*args))
def _call_java(self, name: str, *args: Any) -> Any:
    m = getattr(self._java_obj, name)
    sc = SparkContext._active_spark_context
    assert sc is not None
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    # TODO: persist validation metrics as well
    _java_obj = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.TrainValidationSplitModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, []))
    estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()

    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)

    if self.subModels is not None:
        java_sub_models = [sub_model._to_java() for sub_model in self.subModels]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
def to_java_params(sc, model, pyParamMap):
    paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
    for param, value in pyParamMap.items():
        java_param = model._java_obj.getParam(param.name)
        java_value = _py2java(sc, value)
        paramMap.put([java_param.w(java_value)])
    return paramMap
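# Usage sketch (an illustrative assumption, not part of the original snippets):
# to_java_params turns a Python {Param: value} dict into a Java ParamMap for a
# fitted JavaModel. The model, data, and chosen parameter are made up here.
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, Vectors.dense([1.0])), (2.0, Vectors.dense([2.0]))],
    ["label", "features"])
lr_model = LinearRegression(maxIter=5).fit(df)
# Keys must be pyspark.ml.param.Param objects owned by the model.
java_param_map = to_java_params(spark.sparkContext, lr_model,
                                {lr_model.predictionCol: "pred"})
print(java_param_map.toString())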
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    _java_obj = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.CrossValidatorModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, self.avgMetrics))
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

    params = {
        "evaluator": evaluator,
        "estimator": estimator,
        "estimatorParamMaps": epms,
        "numFolds": self.getNumFolds(),
        "foldCol": self.getFoldCol(),
        "seed": self.getSeed(),
    }
    for param_name, param_val in params.items():
        java_param = _java_obj.getParam(param_name)
        pair = java_param.w(param_val)
        _java_obj.set(pair)

    if self.subModels is not None:
        java_sub_models = [
            [sub_model._to_java() for sub_model in fold_sub_models]
            for fold_sub_models in self.subModels
        ]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
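# Usage sketch (an illustrative assumption, not part of the original snippets):
# _to_java is not called directly; it runs when a fitted CrossValidatorModel is
# persisted. The estimator, grid, data, and output path below are made up.
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(0.0, Vectors.dense([0.0])), (1.0, Vectors.dense([1.0]))] * 10,
    ["label", "features"])
lr = LogisticRegression()
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=ParamGridBuilder().addGrid(lr.maxIter, [1, 5]).build(),
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=2)
cv_model = cv.fit(df)
cv_model.write().overwrite().save("/tmp/cv_model")  # triggers _to_java under the hood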
def meta_estimator_transfer_param_maps_to_java(pyEstimator, pyParamMaps):
    pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
    stagePairs = list(map(lambda stage: (stage, stage._to_java()), pyStages))
    sc = SparkContext._active_spark_context

    paramMapCls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap
    javaParamMaps = SparkContext._gateway.new_array(paramMapCls, len(pyParamMaps))

    for idx, pyParamMap in enumerate(pyParamMaps):
        javaParamMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
        for pyParam, pyValue in pyParamMap.items():
            javaParam = None
            for pyStage, javaStage in stagePairs:
                if pyStage._testOwnParam(pyParam.parent, pyParam.name):
                    javaParam = javaStage.getParam(pyParam.name)
                    break
            if javaParam is None:
                raise ValueError('Resolve param in estimatorParamMaps failed: ' + str(pyParam))
            if isinstance(pyValue, Params) and hasattr(pyValue, "_to_java"):
                javaValue = pyValue._to_java()
            else:
                javaValue = _py2java(sc, pyValue)
            pair = javaParam.w(javaValue)
            javaParamMap.put([pair])
        javaParamMaps[idx] = javaParamMap
    return javaParamMaps
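# Usage sketch (an illustrative assumption, not part of the original snippets):
# converts the param maps of a tuning grid into Java ParamMap objects. The
# estimator and grid are made up; an active SparkContext is required.
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder

spark = SparkSession.builder.getOrCreate()
lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 10]).build()
java_param_maps = meta_estimator_transfer_param_maps_to_java(lr, grid)
print(len(java_param_maps))  # one Java ParamMap per Python param map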
def bundle(spark_session, spark_df_schema, spark_pipeline_model):
    # spark_df_as_java = _py2java(spark_session, spark_df)
    # spark_df_schema_as_java = spark_df_as_java.schema.__call__()
    spark_df_schema_as_json = spark_df_schema.json()
    with open('model.schema', 'wb') as pkl_file:
        pickle.dump(spark_df_schema_as_json, pkl_file)
    spark_pipeline_model.write().overwrite().save('model.parquet')

    ## SERVE FROM HERE
    with open('model.schema', 'rb') as pkl_file:
        from pyspark.sql.types import _parse_datatype_json_string
        restored_spark_df_schema_as_json = pickle.load(pkl_file)
        restored_spark_df_schema = _parse_datatype_json_string(
            restored_spark_df_schema_as_json)
        restored_spark_df_schema_as_java = _py2java(spark_session, restored_spark_df_schema)
    restored_spark_pipeline_model = PipelineModel.read().load('model.parquet')
    restored_spark_pipeline_model_as_java = restored_spark_pipeline_model._to_java()
    return spark_session._jvm.org.jpmml.sparkml.ConverterUtil.toPMMLByteArray(
        restored_spark_df_schema_as_java, restored_spark_pipeline_model_as_java)
def corr(dataset, column, method="pearson"):
    """
    Compute the correlation matrix with specified method using dataset.
    """
    sc = SparkContext._active_spark_context
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def __init__(self, spark_df, spark_session=None):
    if spark_session is None:
        spark_session = SparkSession.builder.getOrCreate()
    super().__init__(
        _py2java(spark_session.sparkContext, spark_df),
        SQLContext(spark_session.sparkContext, spark_session),
    )
    self._validate()
def test(dataset, featuresCol, labelCol):
    """
    Perform a Pearson's independence test using dataset.
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)]
    return _java2py(sc, javaTestObj.test(*args))
def _new_java_obj(sc, java_class, *args):
    """
    Construct a new Java object.
    """
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return java_obj(*java_args)
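# Usage sketch (an illustrative assumption, not part of the original snippets):
# _new_java_obj instantiates a JVM class from its fully qualified name. The
# example builds an empty org.apache.spark.ml.param.ParamMap, which takes no
# constructor arguments.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
java_param_map = _new_java_obj(sc, "org.apache.spark.ml.param.ParamMap")
print(java_param_map.toString())  # prints the empty ParamMap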
def _make_java_param_pair(self, param, value):
    """
    Makes a Java param pair.
    """
    sc = SparkContext._active_spark_context
    param = self._resolveParam(param)
    java_param = self._java_obj.getParam(param.name)
    java_value = _py2java(sc, value)
    return java_param.w(java_value)
def test(dataset: DataFrame, sampleCol: str, distName: str, *params: float) -> DataFrame:
    """
    Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
    equality. Currently supports the normal distribution, taking as parameters the mean and
    standard deviation.

    .. versionadded:: 2.4.0

    Parameters
    ----------
    dataset : :py:class:`pyspark.sql.DataFrame`
        a Dataset or a DataFrame containing the sample of data to test.
    sampleCol : str
        Name of sample column in dataset, of any numerical type.
    distName : str
        a `string` name for a theoretical distribution, currently only support "norm".
    params : float
        a list of `float` values specifying the parameters to be used for the theoretical
        distribution. For "norm" distribution, the parameters includes mean and variance.

    Returns
    -------
    A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
    This DataFrame will contain a single Row with the following fields:

    - `pValue: Double`
    - `statistic: Double`

    Examples
    --------
    >>> from pyspark.ml.stat import KolmogorovSmirnovTest
    >>> dataset = [[-1.0], [0.0], [1.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    >>> dataset = [[2.0], [3.0], [4.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    """
    sc = SparkContext._active_spark_context
    assert sc is not None

    javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
    dataset = _py2java(sc, dataset)
    params = [float(param) for param in params]  # type: ignore[assignment]
    return _java2py(
        sc, javaTestObj.test(dataset, sampleCol, distName, _jvm().PythonUtils.toSeq(params)))
def _make_java_param_pair(self, param, value):
    """
    Makes a Java param pair.
    """
    sc = SparkContext._active_spark_context
    param = self._resolveParam(param)
    java_param = self._java_obj.getParam(param.name)
    java_value = _py2java(sc, value)
    return java_param.w(java_value)
def __init__(self, sc, df, pipelineModel):
    javaDf = _py2java(sc, df)
    javaSchema = javaDf.schema.__call__()
    javaPipelineModel = pipelineModel._to_java()
    javaPmmlBuilder = sc._jvm.org.jpmml.sparkml.PMMLBuilder(javaSchema, javaPipelineModel)
    if not isinstance(javaPmmlBuilder, JavaObject):
        raise RuntimeError("JPMML-SparkML not found on classpath")
    self.sc = sc
    self.javaPmmlBuilder = javaPmmlBuilder
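# Usage sketch (an illustrative assumption, not part of the original snippets):
# assumes the __init__ above belongs to a PMMLBuilder-style wrapper class (the
# class name is hypothetical) and that the JPMML-SparkML jar is on the driver
# classpath; buildByteArray() is the JPMML-SparkML method that serializes PMML.
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)], ["x", "label"])
model = Pipeline(stages=[VectorAssembler(inputCols=["x"], outputCol="features"),
                         LinearRegression()]).fit(df)
builder = PMMLBuilder(spark.sparkContext, df, model)  # hypothetical class name
pmml_bytes = builder.javaPmmlBuilder.buildByteArray()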
def _new_java_obj(java_class, *args):
    """
    Returns a new Java object.
    """
    sc = SparkContext._active_spark_context
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return java_obj(*java_args)
def toPMMLBytes(sc, df, pipelineModel):
    javaDF = _py2java(sc, df)
    javaSchema = javaDF.schema.__call__()
    javaPipelineModel = pipelineModel._to_java()
    javaConverter = sc._jvm.org.jpmml.sparkml.ConverterUtil
    if not isinstance(javaConverter, JavaClass):
        raise RuntimeError("JPMML-SparkML not found on classpath")
    return javaConverter.toPMMLByteArray(javaSchema, javaPipelineModel)
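# Usage sketch (an illustrative assumption, not part of the original snippets):
# converts a fitted PipelineModel to PMML bytes and writes them out. The
# pipeline and data are made up; the JPMML-SparkML jar providing ConverterUtil
# must be on the driver classpath.
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 1.0), (2.0, 2.0), (3.0, 3.0)], ["x", "label"])
model = Pipeline(stages=[VectorAssembler(inputCols=["x"], outputCol="features"),
                         LinearRegression()]).fit(df)
with open("model.pmml", "wb") as f:
    f.write(toPMMLBytes(spark.sparkContext, df, model))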
def _make_java_param_pair(self, param: Param[T], value: T) -> "JavaObject":
    """
    Makes a Java param pair.
    """
    sc = SparkContext._active_spark_context
    assert sc is not None and self._java_obj is not None
    param = self._resolveParam(param)
    java_param = self._java_obj.getParam(param.name)
    java_value = _py2java(sc, value)
    return java_param.w(java_value)
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform an F Regression test using dataset.

    :param dataset:
      DataFrame of continuous labels and continuous features.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the following
      fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `fValue: float`
      If flatten is False, this DataFrame will contain a single Row with the following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `fValues: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import FValueTest
    >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
    ...                                        0.51418671, 0.61632374, 0.96565515])],
    ...            [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
    ...                                        0.59784822, 0.12394819, 0.53783355])],
    ...            [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
    ...                                        0.40492506, 0.18957493, 0.5440016])],
    ...            [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
    ...                                        0.62102109, 0.05471483, 0.96449167])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> fValueResult = FValueTest.test(dataset, 'features', 'label')
    >>> row = fValueResult.select("fValues", "pValues").collect()
    >>> row[0].fValues
    DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539])
    >>> row[0].pValues
    DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838])
    >>> fValueResult = FValueTest.test(dataset, 'features', 'label', True)
    >>> row = fValueResult.orderBy("featureIndex").collect()
    >>> row[0].fValue
    3.7409548308350593
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.FValueTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform an ANOVA test using dataset.

    :param dataset:
      DataFrame of categorical labels and continuous features.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the following
      fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `fValue: float`
      If flatten is False, this DataFrame will contain a single Row with the following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `fValues: Vector`
      Each of these fields has one value per feature.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ANOVATest
    >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
    ...                                 0.51418671, 0.61632374, 0.96565515])],
    ...            [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
    ...                                 0.59784822, 0.12394819, 0.53783355])],
    ...            [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
    ...                                 0.40492506, 0.18957493, 0.5440016])],
    ...            [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
    ...                                 0.62102109, 0.05471483, 0.96449167])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> anovaResult = ANOVATest.test(dataset, 'features', 'label')
    >>> row = anovaResult.select("fValues", "pValues").collect()
    >>> row[0].fValues
    DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512])
    >>> row[0].pValues
    DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029])
    >>> anovaResult = ANOVATest.test(dataset, 'features', 'label', True)
    >>> row = anovaResult.orderBy("featureIndex").collect()
    >>> row[0].fValue
    4.026438671875297
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ANOVATest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def corr(dataset: DataFrame, column: str, method: str = "pearson") -> DataFrame:
    """
    Compute the correlation matrix with specified method using dataset.

    .. versionadded:: 2.2.0

    Parameters
    ----------
    dataset : :py:class:`pyspark.sql.DataFrame`
        A DataFrame.
    column : str
        The name of the column of vectors for which the correlation coefficient needs
        to be computed. This must be a column of the dataset, and it must contain
        Vector objects.
    method : str, optional
        String specifying the method to use for computing correlation.
        Supported: `pearson` (default), `spearman`.

    Returns
    -------
    A DataFrame that contains the correlation matrix of the column of vectors. This
    DataFrame contains a single row and a single column of name `METHODNAME(COLUMN)`.

    Examples
    --------
    >>> from pyspark.ml.linalg import DenseMatrix, Vectors
    >>> from pyspark.ml.stat import Correlation
    >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
    ...            [Vectors.dense([4, 5, 0, 3])],
    ...            [Vectors.dense([6, 7, 0, 8])],
    ...            [Vectors.dense([9, 0, 0, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ['features'])
    >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
    >>> print(str(pearsonCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                 [ 0.0556...,  1.        ,         NaN,  0.9135...],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4004...,  0.9135...,         NaN,  1.        ]])
    >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
    >>> print(str(spearmanCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                 [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
    """
    sc = SparkContext._active_spark_context
    assert sc is not None
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def _new_java_obj(java_class: str, *args: Any) -> "JavaObject":
    """
    Returns a new Java object.
    """
    sc = SparkContext._active_spark_context
    assert sc is not None
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return java_obj(*java_args)
def test(dataset, featuresCol, labelCol, flatten=False):
    """
    Perform a Pearson's independence test using dataset.

    :param dataset:
      DataFrame of categorical labels and categorical features.
      Real-valued features will be treated as categorical for each distinct value.
    :param featuresCol:
      Name of features column in dataset, of type `Vector` (`VectorUDT`).
    :param labelCol:
      Name of label column in dataset, of any numerical type.
    :param flatten:
      if True, flattens the returned dataframe.
    :return:
      DataFrame containing the test result for every feature against the label.
      If flatten is True, this DataFrame will contain one row per feature with the following
      fields:
      - `featureIndex: int`
      - `pValue: float`
      - `degreesOfFreedom: int`
      - `statistic: float`
      If flatten is False, this DataFrame will contain a single Row with the following fields:
      - `pValues: Vector`
      - `degreesOfFreedom: Array[int]`
      - `statistics: Vector`
      Each of these fields has one value per feature.

    .. versionchanged:: 3.1.0
       Added optional ``flatten`` argument.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import ChiSquareTest
    >>> dataset = [[0, Vectors.dense([0, 0, 1])],
    ...            [0, Vectors.dense([1, 0, 1])],
    ...            [1, Vectors.dense([2, 1, 1])],
    ...            [1, Vectors.dense([3, 1, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
    >>> chiSqResult.select("degreesOfFreedom").collect()[0]
    Row(degreesOfFreedom=[3, 1, 0])
    >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label', True)
    >>> row = chiSqResult.orderBy("featureIndex").collect()
    >>> row[0].statistic
    4.0
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.ChiSquareTest
    args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol, flatten)]
    return _java2py(sc, javaTestObj.test(*args))
def _make_java_param_pair(self, param, value):
    """
    Makes a Java param pair.
    """
    sc = SparkContext._active_spark_context
    param = self._resolveParam(param)
    java_param = sc._jvm.org.apache.spark.ml.param.Param(param.parent, param.name, param.doc)
    if isinstance(value, Params) and hasattr(value, "_to_java"):
        # Convert JavaEstimator/JavaTransformer object or Estimator/Transformer object which
        # implements `_to_java` method (such as OneVsRest, Pipeline object) to java object.
        # used in the case of an estimator having another estimator as a parameter
        # the reason why this is not in _py2java in common.py is that importing
        # Estimator and Model in common.py results in a circular import with inherit_doc
        java_value = value._to_java()
    else:
        java_value = _py2java(sc, value)
    return java_param.w(java_value)
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    # TODO: persist average metrics as well
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                         self.uid,
                                         self.bestModel._to_java(),
                                         _py2java(sc, []))
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    return _java_obj
def test(dataset, sampleCol, distName, *params):
    """
    Conduct a one-sample, two-sided Kolmogorov-Smirnov test for probability distribution
    equality. Currently supports the normal distribution, taking as parameters the mean and
    standard deviation.

    :param dataset:
      a Dataset or a DataFrame containing the sample of data to test.
    :param sampleCol:
      Name of sample column in dataset, of any numerical type.
    :param distName:
      a `string` name for a theoretical distribution, currently only support "norm".
    :param params:
      a list of `Double` values specifying the parameters to be used for the theoretical
      distribution. For "norm" distribution, the parameters includes mean and variance.
    :return:
      A DataFrame that contains the Kolmogorov-Smirnov test result for the input sampled data.
      This DataFrame will contain a single Row with the following fields:
      - `pValue: Double`
      - `statistic: Double`

    >>> from pyspark.ml.stat import KolmogorovSmirnovTest
    >>> dataset = [[-1.0], [0.0], [1.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 0.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    >>> dataset = [[2.0], [3.0], [4.0]]
    >>> dataset = spark.createDataFrame(dataset, ['sample'])
    >>> ksResult = KolmogorovSmirnovTest.test(dataset, 'sample', 'norm', 3.0, 1.0).first()
    >>> round(ksResult.pValue, 3)
    1.0
    >>> round(ksResult.statistic, 3)
    0.175
    """
    sc = SparkContext._active_spark_context
    javaTestObj = _jvm().org.apache.spark.ml.stat.KolmogorovSmirnovTest
    dataset = _py2java(sc, dataset)
    params = [float(param) for param in params]
    return _java2py(sc, javaTestObj.test(dataset, sampleCol, distName,
                                         _jvm().PythonUtils.toSeq(params)))
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    # TODO: persist validation metrics as well
    _java_obj = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.TrainValidationSplitModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, []))
    estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()

    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    return _java_obj
def corr(dataset, column, method="pearson"):
    """
    Compute the correlation matrix with specified method using dataset.

    :param dataset:
      A Dataset or a DataFrame.
    :param column:
      The name of the column of vectors for which the correlation coefficient needs
      to be computed. This must be a column of the dataset, and it must contain
      Vector objects.
    :param method:
      String specifying the method to use for computing correlation.
      Supported: `pearson` (default), `spearman`.
    :return:
      A DataFrame that contains the correlation matrix of the column of vectors. This
      DataFrame contains a single row and a single column of name '$METHODNAME($COLUMN)'.

    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.stat import Correlation
    >>> dataset = [[Vectors.dense([1, 0, 0, -2])],
    ...            [Vectors.dense([4, 5, 0, 3])],
    ...            [Vectors.dense([6, 7, 0, 8])],
    ...            [Vectors.dense([9, 0, 0, 1])]]
    >>> dataset = spark.createDataFrame(dataset, ['features'])
    >>> pearsonCorr = Correlation.corr(dataset, 'features', 'pearson').collect()[0][0]
    >>> print(str(pearsonCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.0556...,         NaN,  0.4004...],
                 [ 0.0556...,  1.        ,         NaN,  0.9135...],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4004...,  0.9135...,         NaN,  1.        ]])
    >>> spearmanCorr = Correlation.corr(dataset, 'features', method='spearman').collect()[0][0]
    >>> print(str(spearmanCorr).replace('nan', 'NaN'))
    DenseMatrix([[ 1.        ,  0.1054...,         NaN,  0.4       ],
                 [ 0.1054...,  1.        ,         NaN,  0.9486... ],
                 [        NaN,         NaN,  1.        ,         NaN],
                 [ 0.4       ,  0.9486... ,         NaN,  1.        ]])
    """
    sc = SparkContext._active_spark_context
    javaCorrObj = _jvm().org.apache.spark.ml.stat.Correlation
    args = [_py2java(sc, arg) for arg in (dataset, column, method)]
    return _java2py(sc, javaCorrObj.corr(*args))
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                         self.uid,
                                         self.bestModel._to_java(),
                                         _py2java(sc, self.avgMetrics))
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)

    if self.subModels is not None:
        java_sub_models = [[sub_model._to_java() for sub_model in fold_sub_models]
                           for fold_sub_models in self.subModels]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.

    Returns
    -------
    py4j.java_gateway.JavaObject
        Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    _java_obj = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.TrainValidationSplitModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, self.validationMetrics))
    estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()

    params = {
        "evaluator": evaluator,
        "estimator": estimator,
        "estimatorParamMaps": epms,
        "trainRatio": self.getTrainRatio(),
        "seed": self.getSeed(),
    }
    for param_name, param_val in params.items():
        java_param = _java_obj.getParam(param_name)
        pair = java_param.w(param_val)
        _java_obj.set(pair)

    if self.subModels is not None:
        java_sub_models = [sub_model._to_java() for sub_model in self.subModels]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    # TODO: persist average metrics as well
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                         self.uid,
                                         self.bestModel._to_java(),
                                         _py2java(sc, []))
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)

    if self.subModels is not None:
        java_sub_models = [[sub_model._to_java() for sub_model in fold_sub_models]
                           for fold_sub_models in self.subModels]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
def toPMMLBytes(sc, data, pipelineModel):
    javaData = _py2java(sc, data)
    javaSchema = javaData.schema.__call__()
    javaPipelineModel = pipelineModel._to_java()
    return sc._jvm.org.jpmml.sparkml.ConverterUtil.toPMMLByteArray(javaSchema, javaPipelineModel)
def _call_java(self, name, *args):
    m = getattr(self._java_obj, name)
    sc = SparkContext._active_spark_context
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))