def _handleOverwrite(self, path):
    from pyspark.ml.wrapper import JavaWrapper
    _java_obj = JavaWrapper._new_java_obj(
        "org.apache.spark.ml.util.FileSystemOverwrite")
    wrapper = JavaWrapper(_java_obj)
    wrapper._call_java("handleOverwrite", path, True, self.sc._jsc.sc())
def _handleOverwrite(self, path: str) -> None:
    from pyspark.ml.wrapper import JavaWrapper

    _java_obj = JavaWrapper._new_java_obj(  # type: ignore[attr-defined]
        "org.apache.spark.ml.util.FileSystemOverwrite")
    wrapper = JavaWrapper(_java_obj)
    wrapper._call_java(  # type: ignore[attr-defined]
        "handleOverwrite", path, True, self.sparkSession._jsparkSession)
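# A minimal, hypothetical sketch (not part of the snippets above) of the user-facing call
# path that reaches _handleOverwrite: MLWriter.overwrite().save() asks the JVM-side
# FileSystemOverwrite helper to delete any existing output directory before writing.
# The output path and the choice of Binarizer are illustrative assumptions.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

spark = SparkSession.builder.master("local[1]").appName("overwrite-sketch").getOrCreate()

binarizer = Binarizer(threshold=0.5, inputCol="value", outputCol="binary")
# overwrite() flags the writer; save() then goes through _handleOverwrite(path).
binarizer.write().overwrite().save("/tmp/binarizer_model")
spark.stop()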
def __setCreds(conf, auth):
    jconf = conf._jconf
    field = jconf.getClass().getDeclaredField("nonJVMClientCreds")
    field.setAccessible(True)
    from pyspark.ml.wrapper import JavaWrapper
    creds = JavaWrapper._new_java_obj(
        "org.apache.spark.h2o.utils.FlowCredentials", auth[0], auth[1])
    someCreds = JavaWrapper._new_java_obj("scala.Some", creds)
    field.set(jconf, someCreds)
def _from_java_impl(cls, java_stage):
    """
    Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.
    """
    # Load information from java_stage to the instance.
    estimator = JavaWrapper._from_java(java_stage.getEstimator())
    evaluator = JavaWrapper._from_java(java_stage.getEvaluator())
    epms = [estimator._transfer_param_map_from_java(epm)
            for epm in java_stage.getEstimatorParamMaps()]
    return estimator, epms, evaluator
def meta_estimator_transfer_param_maps_to_java(pyEstimator, pyParamMaps):
    pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
    stagePairs = list(map(lambda stage: (stage, stage._to_java()), pyStages))
    sc = SparkContext._active_spark_context
    paramMapCls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap
    javaParamMaps = SparkContext._gateway.new_array(paramMapCls, len(pyParamMaps))

    for idx, pyParamMap in enumerate(pyParamMaps):
        javaParamMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
        for pyParam, pyValue in pyParamMap.items():
            javaParam = None
            for pyStage, javaStage in stagePairs:
                if pyStage._testOwnParam(pyParam.parent, pyParam.name):
                    javaParam = javaStage.getParam(pyParam.name)
                    break
            if javaParam is None:
                raise ValueError('Resolve param in estimatorParamMaps failed: ' + str(pyParam))
            if isinstance(pyValue, Params) and hasattr(pyValue, "_to_java"):
                javaValue = pyValue._to_java()
            else:
                javaValue = _py2java(sc, pyValue)
            pair = javaParam.w(javaValue)
            javaParamMap.put([pair])
        javaParamMaps[idx] = javaParamMap
    return javaParamMaps
def metrics(*metrics):
    """
    Given a list of metrics, provides a builder that in turn computes metrics
    from a column.

    See the documentation of [[Summarizer]] for an example.

    The following metrics are accepted (case sensitive):
     - mean: a vector that contains the coefficient-wise mean.
     - variance: a vector that contains the coefficient-wise variance.
     - count: the count of all vectors seen.
     - numNonzeros: a vector with the number of non-zeros for each coefficient.
     - max: the maximum for each coefficient.
     - min: the minimum for each coefficient.
     - normL2: the Euclidean norm for each coefficient.
     - normL1: the L1 norm of each coefficient (sum of the absolute values).

    :param metrics: metrics that can be provided.
    :return: an object of :py:class:`pyspark.ml.stat.SummaryBuilder`

    Note: Currently, the performance of this interface is about 2x~3x slower than
    using the RDD interface.
    """
    sc = SparkContext._active_spark_context
    js = JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer.metrics",
                                   _to_seq(sc, metrics))
    return SummaryBuilder(js)
def metrics(*metrics):
    """
    Given a list of metrics, provides a builder that in turn computes metrics
    from a column.

    See the documentation of [[Summarizer]] for an example.

    The following metrics are accepted (case sensitive):
     - mean: a vector that contains the coefficient-wise mean.
     - sum: a vector that contains the coefficient-wise sum.
     - variance: a vector that contains the coefficient-wise variance.
     - std: a vector that contains the coefficient-wise standard deviation.
     - count: the count of all vectors seen.
     - numNonzeros: a vector with the number of non-zeros for each coefficient.
     - max: the maximum for each coefficient.
     - min: the minimum for each coefficient.
     - normL2: the Euclidean norm for each coefficient.
     - normL1: the L1 norm of each coefficient (sum of the absolute values).

    :param metrics: metrics that can be provided.
    :return: an object of :py:class:`pyspark.ml.stat.SummaryBuilder`

    Note: Currently, the performance of this interface is about 2x~3x slower than
    using the RDD interface.
    """
    sc = SparkContext._active_spark_context
    js = JavaWrapper._new_java_obj(
        "org.apache.spark.ml.stat.Summarizer.metrics", _to_seq(sc, metrics))
    return SummaryBuilder(js)
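# A minimal usage sketch of the public API described in the docstring above, assuming a
# local SparkSession and a tiny in-memory dataset: Summarizer.metrics(...) returns a
# SummaryBuilder whose summary(...) column can be selected directly from a DataFrame.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Summarizer

spark = SparkSession.builder.master("local[1]").appName("summarizer-sketch").getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense(1.0, 2.0),), (Vectors.dense(3.0, 4.0),)], ["features"])

# Build the summarizer once, then reuse it; it is backed by the JVM object created above.
summarizer = Summarizer.metrics("mean", "count")
df.select(summarizer.summary(df.features)).show(truncate=False)

# Single-metric shortcuts such as Summarizer.mean(...) use _get_single_metric internally.
df.select(Summarizer.mean(df.features)).show()
spark.stop()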
def to_java_params(sc, model, pyParamMap):
    paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
    for param, value in pyParamMap.items():
        java_param = model._java_obj.getParam(param.name)
        java_value = _py2java(sc, value)
        paramMap.put([java_param.w(java_value)])
    return paramMap
def _get_single_metric(col: Column, weightCol: Optional[Column], metric: str) -> Column:
    col, weightCol = Summarizer._check_param(col, weightCol)
    return Column(
        JavaWrapper._new_java_obj(
            "org.apache.spark.ml.stat.Summarizer." + metric, col._jc, weightCol._jc))
def _stages_java2py(java_stages):
    """
    Transforms a list of Java stages into a list of Python stages.

    :param java_stages: An array of Java stages.
    :return: An array of Python stages.
    """
    return [JavaWrapper._transfer_stage_from_java(stage) for stage in java_stages]
def _transfer_param_map_to_java(self, pyParamMap):
    """
    Transforms a Python ParamMap into a Java ParamMap.
    """
    paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
    for param in self.params:
        if param in pyParamMap:
            pair = self._make_java_param_pair(param, pyParamMap[param])
            paramMap.put([pair])
    return paramMap
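# On the Python side a "ParamMap" is just a dict of Param -> value. A hedged sketch
# (the LogisticRegression instance and values are illustrative assumptions) of the kind
# of map that _transfer_param_map_to_java converts into an
# org.apache.spark.ml.param.ParamMap.
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.master("local[1]").appName("parammap-sketch").getOrCreate()

lr = LogisticRegression()
py_param_map = {lr.maxIter: 5, lr.regParam: 0.01}

# Such a dict is what gets passed as the "extra params" argument of copy()/fit(), which
# is where the Java-side conversion is used internally.
lr2 = lr.copy(py_param_map)
print(lr2.getMaxIter(), lr2.getRegParam())
spark.stop()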
def _from_java(cls, java_stage):
    """
    Given a Java PipelineModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    py_stages = [JavaWrapper._from_java(s) for s in java_stage.stages()]
    # Create a new instance of this stage.
    py_stage = cls(py_stages)
    py_stage._resetUid(java_stage.uid())
    return py_stage
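# A small end-to-end persistence sketch (toy data and a hypothetical /tmp path) of the
# save/load round trip that _from_java above supports: PipelineModel.load() reads the
# Java stages and wraps each one back into its Python class.
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import Binarizer

spark = SparkSession.builder.master("local[1]").appName("pipeline-persist-sketch").getOrCreate()
df = spark.createDataFrame([(0.1,), (0.8,)], ["value"])

model = Pipeline(stages=[Binarizer(threshold=0.5, inputCol="value", outputCol="binary")]).fit(df)
model.write().overwrite().save("/tmp/pipeline_model")

reloaded = PipelineModel.load("/tmp/pipeline_model")  # wraps each Java stage back into Python
reloaded.transform(df).show()
spark.stop()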
def _py2j(self, arg):
    if isinstance(arg, dict):
        return ScalaMap(arg)._to_java()
    elif isinstance(arg, StructType):
        return JavaWrapper._new_java_obj(
            "org.apache.spark.sql.types.DataType.fromJson",
            json.dumps(arg.jsonValue()))
    elif isinstance(arg, SageMakerJavaWrapper):
        return arg._to_java()
    else:
        return arg
def test_new_java_array(self):
    # test array of strings
    str_list = ["a", "b", "c"]
    java_class = self.sc._gateway.jvm.java.lang.String
    java_array = JavaWrapper._new_java_array(str_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), str_list)
    # test array of integers
    int_list = [1, 2, 3]
    java_class = self.sc._gateway.jvm.java.lang.Integer
    java_array = JavaWrapper._new_java_array(int_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), int_list)
    # test array of floats
    float_list = [0.1, 0.2, 0.3]
    java_class = self.sc._gateway.jvm.java.lang.Double
    java_array = JavaWrapper._new_java_array(float_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), float_list)
    # test array of bools
    bool_list = [False, True, True]
    java_class = self.sc._gateway.jvm.java.lang.Boolean
    java_array = JavaWrapper._new_java_array(bool_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), bool_list)
    # test array of Java DenseVectors
    v1 = DenseVector([0.0, 1.0])
    v2 = DenseVector([1.0, 0.0])
    vec_java_list = [_py2java(self.sc, v1), _py2java(self.sc, v2)]
    java_class = self.sc._gateway.jvm.org.apache.spark.ml.linalg.DenseVector
    java_array = JavaWrapper._new_java_array(vec_java_list, java_class)
    self.assertEqual(_java2py(self.sc, java_array), [v1, v2])
    # test empty array
    java_class = self.sc._gateway.jvm.java.lang.Integer
    java_array = JavaWrapper._new_java_array([], java_class)
    self.assertEqual(_java2py(self.sc, java_array), [])
def _from_java(cls, java_stage):
    """
    Given a Java CrossValidatorModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    bestModel = JavaWrapper._from_java(java_stage.bestModel())
    estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)
    # Create a new instance of this stage.
    py_stage = cls(bestModel=bestModel)\
        .setEstimator(estimator).setEstimatorParamMaps(epms).setEvaluator(evaluator)
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _new_java_obj(self, java_class, *args):
    """
    Creates a java object. We convert SageMakerJavaClass arguments
    to their java versions and then hand over to JavaWrapper.

    :param java_class: Java ClassName
    :param args: constructor arguments
    :return: Java Instance
    """
    java_args = []
    for arg in args:
        java_args.append(self._py2j(arg))
    return JavaWrapper._new_java_obj(java_class, *java_args)
def _from_java(cls, java_stage):
    """
    Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
    Used for ML persistence.
    """
    # Load information from java_stage to the instance.
    bestModel = JavaWrapper._from_java(java_stage.bestModel())
    estimator, epms, evaluator = \
        super(TrainValidationSplitModel, cls)._from_java_impl(java_stage)
    # Create a new instance of this stage.
    py_stage = cls(bestModel=bestModel)\
        .setEstimator(estimator).setEstimatorParamMaps(epms).setEvaluator(evaluator)
    py_stage._resetUid(java_stage.uid())
    return py_stage
def _to_java(self):
    """
    Transfer this instance to a Java PipelineModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    gateway = SparkContext._gateway
    cls = SparkContext._jvm.org.apache.spark.ml.Transformer
    java_stages = gateway.new_array(cls, len(self.stages))
    for idx, stage in enumerate(self.stages):
        java_stages[idx] = stage._to_java()

    _java_obj =\
        JavaWrapper._new_java_obj("org.apache.spark.ml.PipelineModel", self.uid, java_stages)

    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidator. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    estimator, epms, evaluator = super(CrossValidator, self)._to_java_impl()

    _java_obj = JavaWrapper._new_java_obj("org.apache.spark.ml.tuning.CrossValidator", self.uid)
    _java_obj.setEstimatorParamMaps(epms)
    _java_obj.setEvaluator(evaluator)
    _java_obj.setEstimator(estimator)
    _java_obj.setSeed(self.getSeed())
    _java_obj.setNumFolds(self.getNumFolds())

    return _java_obj
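# A hedged usage sketch (toy data, hypothetical column names and /tmp path) of the Python
# CrossValidator whose _to_java conversion is shown above; saving the fitted model relies
# on exactly that transfer of estimator, evaluator, and estimatorParamMaps.
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.master("local[1]").appName("cv-sketch").getOrCreate()
df = spark.createDataFrame(
    [(Vectors.dense(0.0), 0.0), (Vectors.dense(1.0), 1.0),
     (Vectors.dense(2.0), 1.0), (Vectors.dense(3.0), 0.0),
     (Vectors.dense(4.0), 1.0), (Vectors.dense(5.0), 0.0)],
    ["features", "label"])

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=MulticlassClassificationEvaluator(), numFolds=2)
cv_model = cv.fit(df)
cv_model.write().overwrite().save("/tmp/cv_model")  # persistence goes through _to_java
spark.stop()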
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    _java_obj = JavaWrapper._new_java_obj(
        "org.apache.spark.ml.tuning.TrainValidationSplitModel",
        self.uid,
        self.bestModel._to_java(),
        _py2java(sc, []))
    estimator, epms, evaluator = super(TrainValidationSplitModel, self)._to_java_impl()

    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    sc = SparkContext._active_spark_context
    _java_obj = JavaWrapper._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                          self.uid,
                                          self.bestModel._to_java(),
                                          _py2java(sc, []))
    estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

    _java_obj.set("evaluator", evaluator)
    _java_obj.set("estimator", estimator)
    _java_obj.set("estimatorParamMaps", epms)
    return _java_obj
def _to_java(self):
    """
    Transfer this instance to a Java TrainValidationSplit. Used for ML persistence.

    :return: Java object equivalent to this instance.
    """
    estimator, epms, evaluator = super(TrainValidationSplit, self)._to_java_impl()

    _java_obj = JavaWrapper._new_java_obj(
        "org.apache.spark.ml.tuning.TrainValidationSplit", self.uid)
    _java_obj.setEstimatorParamMaps(epms)
    _java_obj.setEvaluator(evaluator)
    _java_obj.setEstimator(estimator)
    _java_obj.setTrainRatio(self.getTrainRatio())
    _java_obj.setSeed(self.getSeed())

    return _java_obj
def main():
    try:
        # init spark
        spark = get_spark(app_name="pyspark-xgb")
        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_train.csv')
        test = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_test.csv')

        # preprocess
        LABEL = 'Attrition'
        FEATURES = 'features'
        features = [c for c in train.columns if c != LABEL]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        train = assembler.transform(train).select(FEATURES, LABEL)
        test = assembler.transform(test).select(FEATURES, LABEL)

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False, "missing": np.nan,
            "eval_metric": "logloss", "min_child_weight": 1,
            "train_test_ratio": 0.8,
            "objective": "binary:logistic"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability')) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exist, rm old model')
        jw = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel.XGBoostClassificationModelWriter",
            jmodel)
        jw.saveImpl(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.print_exc())

    finally:
        # stop spark
        spark.stop()
def fscore(self,fmap=""): jxgb = JavaWrapper(self.getBooster()) return jxgb._call_java("getFeatureScore",fmap)
def empty(cls):
    return JavaWrapper._new_java_obj("scala.Option.empty")
def _get_single_metric(col, weightCol, metric):
    col, weightCol = Summarizer._check_param(col, weightCol)
    return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric,
                                            col._jc, weightCol._jc))
def get_dump(self,fmap="",with_stats=True,format= "text"): jxgb = JavaWrapper(self.getBooster()) return jxgb._call_java("getModelDump",fmap,with_stats,format)
def main():
    try:
        # init spark
        spark = get_spark(app_name="pyspark-xgb")
        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.csv(DATASET_PATH + "/iris_train.csv", get_mtrain_schema(), header=True)
        test = spark.read.csv(DATASET_PATH + "/iris_test.csv", get_mtrain_schema(), header=True)

        # preprocess
        # get label encode result from csv, since StringIndexer will get different result
        STR_LABEL = 'class'
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in train.columns if c not in [STR_LABEL, LABEL]]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        pipeline = Pipeline(stages=[assembler])
        preprocess = pipeline.fit(train)
        train = preprocess.transform(train).select(FEATURES, LABEL)
        test = preprocess.transform(test).select(FEATURES, LABEL)

        # set param map
        xgb_params = {
            "eta": 0.1, "eval_metric": "mlogloss", "gamma": 0,
            "max_depth": 5, "min_child_weight": 1.0,
            "objective": "multi:softprob", "seed": 0,
            "num_class": N_CLASS,
            # xgboost4j only
            "num_round": 100, "num_early_stopping_rounds": 10,
            "maximize_evaluation_metrics": False,
            "num_workers": 1, "use_external_memory": False, "missing": np.nan,
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)

        # set evaluation set
        eval_set = {'eval': test._jdf}
        scala_eval_set = spark._jvm.PythonUtils.toScalaMap(eval_set)

        logger.info('training')
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL) \
            .setEvalSets(scala_eval_set)
        jmodel = j.fit(train._jdf)
        print_summary(jmodel)

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('[xgboost4j] valid logloss: {}'.format(slogloss))

        # save model - using native booster so single-node libraries can read it
        model_path = MODEL_PATH + '/model.bin'
        logger.info('save model to {}'.format(model_path))
        jbooster = jmodel.nativeBooster()
        jbooster.saveModel(model_path)

        # get feature score
        imp_type = "gain"
        feature_map_path = MODEL_PATH + '/feature.map'
        create_feature_map(feature_map_path, features)
        jfeatureMap = jbooster.getScore(feature_map_path, imp_type)
        f_imp = dict()
        for feature in features:
            if not jfeatureMap.get(feature).isEmpty():
                f_imp[feature] = jfeatureMap.get(feature).get()
        feature_imp_path = MODEL_PATH + '/feature.imp'
        create_feature_imp(feature_imp_path, f_imp)

        # [Optional] load model trained by xgboost, predict and get validation metric
        local_model_path = LOCAL_MODEL_PATH + '/model.bin'
        if os.path.exists(local_model_path):
            logger.info('load model from {}'.format(local_model_path))
            scala_xgb = spark.sparkContext._jvm.ml.dmlc.xgboost4j.scala.XGBoost
            jbooster = scala_xgb.loadModel(local_model_path)
            # constructor args: uid, num_class, booster
            xgb_cls_model = JavaWrapper._new_java_obj(
                "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel",
                "xgbc", N_CLASS, jbooster)
            jpred = xgb_cls_model.transform(test._jdf)
            pred = DataFrame(jpred, spark)
            slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability', N_CLASS)) \
                .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
            logger.info('[xgboost] valid logloss: {}'.format(slogloss))
        else:
            logger.info(
                "local model does not exist, run python_xgb/train_multi.py to get the model "
                "and compare logloss between xgboost and xgboost4j"
            )

    except Exception:
        logger.error(traceback.print_exc())

    finally:
        # stop spark
        spark.stop()
def saveBooster(self, save_path):
    jxgb = JavaWrapper(self.getBooster())
    jxgb._call_java("saveModel", save_path)
def main():
    try:
        # init spark
        spark = get_spark(app_name="pyspark-xgb")
        # get logger
        logger = get_logger(spark, "app")

        # load data
        df = spark.read.csv(DATASET_PATH + "/iris.data", get_mtrain_schema())

        # preprocess
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in df.columns if c != "class"]
        assembler = VectorAssembler(inputCols=features, outputCol='features')
        strIdxer = StringIndexer(inputCol="class", outputCol=LABEL)
        pipeline = Pipeline(stages=[assembler, strIdxer])
        df = pipeline.fit(df).transform(df).select(FEATURES, LABEL)
        train, test = df.randomSplit([0.8, 0.2])

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False, "missing": np.nan,
            "num_class": 3, "eval_metric": "mlogloss", "min_child_weight": 1,
            "train_test_ratio": 0.8,
            "objective": "multi:softprob"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss('label', 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exist, rm old model')
        jmodel.save(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.print_exc())

    finally:
        # stop spark
        spark.stop()
def _getCreds(cls):
    return JavaWrapper._new_java_obj(
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")