def _from_java_impl(cls, java_stage):
    """
    Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.
    """
    # Load information from java_stage to the instance.
    estimator = JavaParams._from_java(java_stage.getEstimator())
    evaluator = JavaParams._from_java(java_stage.getEvaluator())
    epms = [estimator._transfer_param_map_from_java(epm)
            for epm in java_stage.getEstimatorParamMaps()]
    return estimator, epms, evaluator
def _from_java(cls, java_stage):
    """
    Given a Java CrossValidatorModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    bestModel = JavaParams._from_java(java_stage.bestModel())
    estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)
    py_stage = cls(bestModel=bestModel).setEstimator(estimator)
    py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)
    if java_stage.hasSubModels():
        py_stage.subModels = [[JavaParams._from_java(sub_model)
                               for sub_model in fold_sub_models]
                              for fold_sub_models in java_stage.subModels()]
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java(cls, java_stage):
    """
    Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    bestModel = JavaParams._from_java(java_stage.bestModel())
    estimator, epms, evaluator = super(TrainValidationSplitModel, cls)._from_java_impl(java_stage)
    # Create a new instance of this stage.
    py_stage = cls(bestModel=bestModel).setEstimator(estimator)
    py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)
    if java_stage.hasSubModels():
        py_stage.subModels = [JavaParams._from_java(sub_model)
                              for sub_model in java_stage.subModels()]
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java(cls, java_stage):
    """
    Given a Java PipelineModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    py_stages = [JavaParams._from_java(s) for s in java_stage.stages()]
    # Create a new instance of this stage.
    py_stage = cls(py_stages)
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java_impl(cls, java_stage):
    """
    Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.
    """
    # Load information from java_stage to the instance.
    estimator = JavaParams._from_java(java_stage.getEstimator())
    evaluator = JavaParams._from_java(java_stage.getEvaluator())
    if isinstance(estimator, JavaEstimator):
        epms = [
            estimator._transfer_param_map_from_java(epm)
            for epm in java_stage.getEstimatorParamMaps()
        ]
    elif MetaAlgorithmReadWrite.isMetaEstimator(estimator):
        # Meta estimator such as Pipeline, OneVsRest
        epms = _ValidatorSharedReadWrite.meta_estimator_transfer_param_maps_from_java(
            estimator, java_stage.getEstimatorParamMaps())
    else:
        raise ValueError('Unsupported estimator used in tuning: ' + str(estimator))
    return estimator, epms, evaluator
def loadNativeModelFromString(model, labelColName="label", featuresColName="features",
                              predictionColName="prediction"):
    """
    Load the model from a native LightGBM model string.
    """
    ctx = SparkContext._active_spark_context
    loader = ctx._jvm.com.microsoft.ml.spark.LightGBMRegressionModel
    java_model = loader.loadNativeModelFromString(model, labelColName, featuresColName,
                                                  predictionColName)
    return JavaParams._from_java(java_model)
def loadNativeModelFromFile(filename, labelColName="label", featuresColName="features",
                            predictionColName="prediction"):
    """
    Load the model from a native LightGBM text file.
    """
    ctx = SparkContext._active_spark_context
    loader = ctx._jvm.com.microsoft.ml.spark.LightGBMRankerModel
    java_model = loader.loadNativeModelFromFile(filename, labelColName, featuresColName,
                                                predictionColName)
    return JavaParams._from_java(java_model)
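A hedged usage sketch for the helper above, assuming the mmlspark Python package layout in which the LightGBM classes live under mmlspark.lightgbm; the file path and column names are illustrative placeholders.

# Usage sketch (assumptions: mmlspark is installed and on the Spark classpath,
# and LightGBMRankerModel exposes loadNativeModelFromFile as shown above).
from mmlspark.lightgbm import LightGBMRankerModel

ranker = LightGBMRankerModel.loadNativeModelFromFile(
    "/tmp/lightgbm_ranker.txt",          # placeholder path to a native LightGBM text file
    labelColName="label",
    featuresColName="features",
    predictionColName="prediction")
# scored = ranker.transform(test_df)     # test_df: a DataFrame with a "features" column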
def _from_java(cls, java_stage: "JavaObject") -> "Pipeline":
    """
    Given a Java Pipeline, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Create a new instance of this stage.
    py_stage = cls()
    # Load information from java_stage to the instance.
    py_stages: List["PipelineStage"] = [
        JavaParams._from_java(s) for s in java_stage.getStages()
    ]
    py_stage.setStages(py_stages)
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java(cls, java_stage):
    """
    Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    sc = SparkContext._active_spark_context
    bestModel = JavaParams._from_java(java_stage.bestModel())
    validationMetrics = _java2py(sc, java_stage.validationMetrics())
    estimator, epms, evaluator = super(TrainValidationSplitModel, cls)._from_java_impl(java_stage)
    # Create a new instance of this stage.
    py_stage = cls(bestModel=bestModel,
                   validationMetrics=validationMetrics)._set(estimator=estimator)
    py_stage = py_stage._set(estimatorParamMaps=epms)._set(evaluator=evaluator)
    if java_stage.hasSubModels():
        py_stage.subModels = [JavaParams._from_java(sub_model)
                              for sub_model in java_stage.subModels()]
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java(cls, java_stage):
    """
    Given a Java CrossValidatorModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    sc = SparkContext._active_spark_context
    bestModel = JavaParams._from_java(java_stage.bestModel())
    avgMetrics = _java2py(sc, java_stage.avgMetrics())
    estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)
    py_stage = cls(bestModel=bestModel, avgMetrics=avgMetrics).setEstimator(estimator)
    py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)
    if java_stage.hasSubModels():
        py_stage.subModels = [[
            JavaParams._from_java(sub_model) for sub_model in fold_sub_models
        ] for fold_sub_models in java_stage.subModels()]
    py_stage._resetUid(java_stage.uid())
    return py_stage
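These wrappers are what Spark ML persistence calls under the hood: save() goes through the model's _to_java and load() goes through _from_java. A minimal round-trip sketch using only the public PySpark API; the training DataFrame and the output path are placeholders.

# Round-trip sketch (assumptions: an active SparkSession and a labeled DataFrame
# named train_df; the path is a placeholder).
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel, ParamGridBuilder

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3)
cv_model = cv.fit(train_df)
cv_model.save("/tmp/cv_model")                        # exercises _to_java
restored = CrossValidatorModel.load("/tmp/cv_model")  # exercises _from_java
print(restored.avgMetrics, restored.bestModel)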
def _from_java(cls, java_stage):
    """
    Given a Java CrossValidatorModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    bestModel = JavaParams._from_java(java_stage.bestModel())
    estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)
    py_stage = cls(bestModel=bestModel).setEstimator(estimator)
    py_stage = py_stage.setEstimatorParamMaps(epms).setEvaluator(evaluator)
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _from_java(cls, java_stage):
    """
    Given a Java CrossValidatorModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    bestModel = JavaParams._from_java(java_stage.bestModel())
    estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)
    # Create a new instance of this stage.
    py_stage = cls(bestModel=bestModel) \
        .setEstimator(estimator).setEstimatorParamMaps(epms).setEvaluator(evaluator)
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _to_java(self):
    estimator, epms, evaluator = _ValidatorParams._to_java_impl(self)
    _java_obj = JavaParams._new_java_obj(
        "com.microsoft.azure.synapse.ml.recommendation.RankingTrainValidationSplit", self.uid)
    _java_obj.setEstimatorParamMaps(epms)
    _java_obj.setEvaluator(evaluator)
    _java_obj.setEstimator(estimator)
    _java_obj.setTrainRatio(self.getTrainRatio())
    _java_obj.setSeed(self.getSeed())
    _java_obj.setItemCol(self.getItemCol())
    _java_obj.setUserCol(self.getUserCol())
    _java_obj.setRatingCol(self.getRatingCol())
    return _java_obj
def _to_java(self):
    """
    Convert this instance to a dill dump, then to a list of strings holding the unicode
    integer value of each character. Use this list as a set of dummy stopwords and store
    it in a StopWordsRemover instance.
    :return: Java object equivalent to this instance.
    """
    dmp = dill.dumps(self)
    pylist = [str(ord(d)) for d in dmp]  # convert bytes to a list of string integers
    pylist.append(PysparkObjId._getPyObjId())  # add our id so PysparkPipelineWrapper can id us.
    sc = SparkContext._active_spark_context
    java_class = sc._gateway.jvm.java.lang.String
    java_array = sc._gateway.new_array(java_class, len(pylist))
    for i in xrange(len(pylist)):
        java_array[i] = pylist[i]
    _java_obj = JavaParams._new_java_obj(PysparkObjId._getCarrierClass(javaName=True), self.uid)
    _java_obj.setStopWords(java_array)
    return _java_obj
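The method above smuggles a dill pickle of a pure-Python stage through the stopWords parameter of a carrier StopWordsRemover. Below is a hypothetical sketch of the matching decode step, inferred from that encoding; the real counterpart in PysparkPipelineWrapper is not shown here and may differ, and _from_carrier is an illustrative name.

# Hypothetical decode sketch: recover the Python object from the carrier produced
# by _to_java above. The layout (string ordinals plus a trailing marker id) is
# inferred from the encoder; this is not the library's actual implementation.
import dill

def _from_carrier(java_obj):
    words = list(java_obj.getStopWords())[:-1]       # drop the trailing PysparkObjId marker
    dmp = bytes(bytearray(int(w) for w in words))    # rebuild the original dill dump
    return dill.loads(dmp)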
def _to_java(self):
    """
    Transfer this instance to a Java PipelineModel. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    gateway = SparkContext._gateway
    cls = SparkContext._jvm.org.apache.spark.ml.Transformer
    java_stages = gateway.new_array(cls, len(self.stages))
    for idx, stage in enumerate(self.stages):
        java_stages[idx] = stage._to_java()
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.PipelineModel",
                                         self.uid, java_stages)
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidator. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    estimator, epms, evaluator = super(CrossValidator, self)._to_java_impl()
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidator", self.uid)
    _java_obj.setEstimatorParamMaps(epms)
    _java_obj.setEvaluator(evaluator)
    _java_obj.setEstimator(estimator)
    _java_obj.setSeed(self.getSeed())
    _java_obj.setNumFolds(self.getNumFolds())
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                         self.uid,
                                         self.bestModel._to_java(),
                                         self.avgMetrics)
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()
    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplit. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    estimator, epms, evaluator = super(TrainValidationSplit, self)._to_java_impl()
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.TrainValidationSplit",
                                         self.uid)
    _java_obj.setEstimatorParamMaps(epms)
    _java_obj.setEvaluator(evaluator)
    _java_obj.setEstimator(estimator)
    _java_obj.setTrainRatio(self.getTrainRatio())
    _java_obj.setSeed(self.getSeed())
    _java_obj.setParallelism(self.getParallelism())
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    # TODO: persist validation metrics as well
    _java_obj = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.TrainValidationSplitModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, []))
    estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()
    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    return _java_obj
def _transfer_param_map_from_java(self, javaParamMap):
    """
    Transforms a Java ParamMap into a Python ParamMap.
    """
    sc = SparkContext._active_spark_context
    paramMap = dict()
    for pair in javaParamMap.toList():
        param = pair.param()
        if self.hasParam(str(param.name())):
            java_obj = pair.value()
            if sc._jvm.Class.forName("org.apache.spark.ml.PipelineStage").isInstance(java_obj):
                # Note: JavaParams._from_java supports both JavaEstimator/JavaTransformer
                # classes and Estimator/Transformer classes that implement the `_from_java`
                # static method (such as the OneVsRest and Pipeline classes).
                py_obj = JavaParams._from_java(java_obj)
            else:
                py_obj = _java2py(sc, java_obj)
            paramMap[self.getParam(param.name())] = py_obj
    return paramMap
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplit. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    estimator, epms, evaluator = super(RankingTrainValidationSplit, self)._to_java_impl()
    _java_obj = JavaParams._new_java_obj("com.microsoft.ml.spark.RankingTrainValidationSplit",
                                         self.uid)
    _java_obj.setEstimatorParamMaps(epms)
    _java_obj.setEvaluator(evaluator)
    _java_obj.setEstimator(estimator)
    _java_obj.setTrainRatio(self.getTrainRatio())
    _java_obj.setSeed(self.getSeed())
    _java_obj.setItemCol(self.getItemCol())
    _java_obj.setUserCol(self.getUserCol())
    _java_obj.setRatingCol(self.getRatingCol())
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java Pipeline. Used for ML persistence.

    Returns
    -------
    py4j.java_gateway.JavaObject
        Java object equivalent to this instance.
    """
    gateway = SparkContext._gateway
    cls = SparkContext._jvm.org.apache.spark.ml.PipelineStage
    java_stages = gateway.new_array(cls, len(self.getStages()))
    for idx, stage in enumerate(self.getStages()):
        java_stages[idx] = stage._to_java()
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.Pipeline", self.uid)
    _java_obj.setStages(java_stages)
    return _java_obj
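The same pattern applies to an unfitted Pipeline: Pipeline.save() drives the _to_java conversion above, and Pipeline.load() drives the corresponding _from_java. A minimal sketch using the public PySpark API; the stages and path are placeholders.

# Round-trip sketch (assumptions: an active SparkSession; stages and path are placeholders).
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashing_tf = HashingTF(inputCol="words", outputCol="features")
pipeline = Pipeline(stages=[tokenizer, hashing_tf])
pipeline.save("/tmp/pipeline")                 # exercises _to_java
same_pipeline = Pipeline.load("/tmp/pipeline")  # exercises _from_java
assert [s.uid for s in same_pipeline.getStages()] == [tokenizer.uid, hashing_tf.uid]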
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    _java_obj = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.TrainValidationSplitModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, self.validationMetrics))
    estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()
    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    if self.subModels is not None:
        java_sub_models = [sub_model._to_java() for sub_model in self.subModels]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
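subModels are only populated when the split is fit with collectSubModels enabled; the setSubModels call above then carries them over to the Java side. A hedged round-trip sketch using the public API; the training DataFrame and path are placeholders.

# Round-trip sketch (assumptions: an active SparkSession and a labeled DataFrame
# named train_df; the path is a placeholder).
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, TrainValidationSplitModel

lr = LinearRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=grid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.8,
                           collectSubModels=True)
tvs_model = tvs.fit(train_df)
tvs_model.save("/tmp/tvs_model")                           # exercises _to_java
restored = TrainValidationSplitModel.load("/tmp/tvs_model")  # exercises _from_java
print(restored.validationMetrics)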
def _bucketize(df, input_cols):
    def j_str_arr(arr):
        gateway = SparkContext._gateway
        j_str = gateway.jvm.java.lang.String
        j_arr = gateway.new_array(j_str, len(arr))
        for i, val in enumerate(arr):
            j_arr[i] = val
        return j_arr

    output_cols = ['{}-bucketed'.format(x) for x in input_cols]
    # Sadly the multi-col versions are only in Scala; pyspark doesn't
    # have them yet.
    j_bucketizer = (JavaParams._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer")
                    .setInputCols(j_str_arr(input_cols))
                    .setOutputCols(j_str_arr(output_cols))
                    .setNumBuckets(254)
                    .setRelativeError(1 / 2550)
                    .setHandleInvalid('error')
                    .fit(df._jdf))
    j_df_bucketized = j_bucketizer.transform(df._jdf)
    df_bucketized = DataFrame(j_df_bucketized, df.sql_ctx).drop(*input_cols)
    # Now we need to assemble the bucketized values into vector
    # form for the feature selector to work with.
    assembler = VectorAssembler(inputCols=output_cols, outputCol='features')
    return assembler.transform(df_bucketized).drop(*output_cols)
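A short usage sketch for the helper above; the SparkSession (spark), the DataFrame, and the column names are illustrative placeholders.

# Usage sketch (assumptions: a SparkSession named spark and the _bucketize helper
# above are in scope; column names are placeholders).
df = spark.createDataFrame(
    [(float(i), float(i * i)) for i in range(1000)],
    ["page_len", "num_links"])
featurized = _bucketize(df, input_cols=["page_len", "num_links"])
featurized.select("features").show(3, truncate=False)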
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

    Returns
    -------
    py4j.java_gateway.JavaObject
        Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    _java_obj = JavaParams._new_java_obj(
        "org.apache.spark.ml.tuning.CrossValidatorModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, self.avgMetrics))
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()
    params = {
        "evaluator": evaluator,
        "estimator": estimator,
        "estimatorParamMaps": epms,
        "numFolds": self.getNumFolds(),
        "foldCol": self.getFoldCol(),
        "seed": self.getSeed(),
    }
    for param_name, param_val in params.items():
        java_param = _java_obj.getParam(param_name)
        pair = java_param.w(param_val)
        _java_obj.set(pair)
    if self.subModels is not None:
        java_sub_models = [[
            sub_model._to_java() for sub_model in fold_sub_models
        ] for fold_sub_models in self.subModels]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.
    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    # TODO: persist average metrics as well
    _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                         self.uid,
                                         self.bestModel._to_java(),
                                         _py2java(sc, []))
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()
    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    if self.subModels is not None:
        java_sub_models = [[sub_model._to_java() for sub_model in fold_sub_models]
                           for fold_sub_models in self.subModels]
        _java_obj.setSubModels(java_sub_models)
    return _java_obj
def getBestModel(self):
    """
    Returns the best model.
    """
    return JavaParams._from_java(self._java_obj.getBestModel())
def getModel(self):
    """
    Get the underlying model.
    """
    return JavaParams._from_java(self._java_obj.getModel())