Example #1
    def _handleOverwrite(self, path):
        from pyspark.ml.wrapper import JavaWrapper

        _java_obj = JavaWrapper._new_java_obj(
            "org.apache.spark.ml.util.FileSystemOverwrite")
        wrapper = JavaWrapper(_java_obj)
        wrapper._call_java("handleOverwrite", path, True, self.sc._jsc.sc())
Example #2
    def _handleOverwrite(self, path: str) -> None:
        from pyspark.ml.wrapper import JavaWrapper

        _java_obj = JavaWrapper._new_java_obj(  # type: ignore[attr-defined]
            "org.apache.spark.ml.util.FileSystemOverwrite")
        wrapper = JavaWrapper(_java_obj)
        wrapper._call_java(  # type: ignore[attr-defined]
            "handleOverwrite", path, True, self.sparkSession._jsparkSession)
Example #3
 def __setCreds(conf, auth):
     jconf = conf._jconf
     field = jconf.getClass().getDeclaredField("nonJVMClientCreds")
     field.setAccessible(True)
     from pyspark.ml.wrapper import JavaWrapper
     creds = JavaWrapper._new_java_obj(
         "org.apache.spark.h2o.utils.FlowCredentials", auth[0], auth[1])
     someCreds = JavaWrapper._new_java_obj("scala.Some", creds)
     field.set(jconf, someCreds)
Example #4
    def _from_java_impl(cls, java_stage):
        """
        Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.
        """

        # Load information from java_stage to the instance.
        estimator = JavaWrapper._from_java(java_stage.getEstimator())
        evaluator = JavaWrapper._from_java(java_stage.getEvaluator())
        epms = [estimator._transfer_param_map_from_java(epm)
                for epm in java_stage.getEstimatorParamMaps()]
        return estimator, epms, evaluator
Example #5
    def _from_java_impl(cls, java_stage):
        """
        Return Python estimator, estimatorParamMaps, and evaluator from a Java ValidatorParams.
        """

        # Load information from java_stage to the instance.
        estimator = JavaWrapper._from_java(java_stage.getEstimator())
        evaluator = JavaWrapper._from_java(java_stage.getEvaluator())
        epms = [
            estimator._transfer_param_map_from_java(epm)
            for epm in java_stage.getEstimatorParamMaps()
        ]
        return estimator, epms, evaluator
Example #6
    def meta_estimator_transfer_param_maps_to_java(pyEstimator, pyParamMaps):
        pyStages = MetaAlgorithmReadWrite.getAllNestedStages(pyEstimator)
        stagePairs = list(
            map(lambda stage: (stage, stage._to_java()), pyStages))
        sc = SparkContext._active_spark_context

        paramMapCls = SparkContext._jvm.org.apache.spark.ml.param.ParamMap
        javaParamMaps = SparkContext._gateway.new_array(
            paramMapCls, len(pyParamMaps))

        for idx, pyParamMap in enumerate(pyParamMaps):
            javaParamMap = JavaWrapper._new_java_obj(
                "org.apache.spark.ml.param.ParamMap")
            for pyParam, pyValue in pyParamMap.items():
                javaParam = None
                for pyStage, javaStage in stagePairs:
                    if pyStage._testOwnParam(pyParam.parent, pyParam.name):
                        javaParam = javaStage.getParam(pyParam.name)
                        break
                if javaParam is None:
                    raise ValueError(
                        'Resolve param in estimatorParamMaps failed: ' +
                        str(pyParam))
                if isinstance(pyValue, Params) and hasattr(
                        pyValue, "_to_java"):
                    javaValue = pyValue._to_java()
                else:
                    javaValue = _py2java(sc, pyValue)
                pair = javaParam.w(javaValue)
                javaParamMap.put([pair])
            javaParamMaps[idx] = javaParamMap
        return javaParamMaps
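For context, a hedged sketch of where a `pyParamMaps` argument typically comes from: the list of param maps built with ParamGridBuilder. The estimator `lr` is only an illustration.

    # Illustrative only: build a list of {Param: value} dicts, one per grid combination
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.tuning import ParamGridBuilder

    lr = LogisticRegression()
    param_maps = ParamGridBuilder() \
        .addGrid(lr.regParam, [0.01, 0.1]) \
        .addGrid(lr.maxIter, [10, 50]) \
        .build()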
Example #7
    def metrics(*metrics):
        """
        Given a list of metrics, provides a builder that in turn computes metrics from a column.

        See the documentation of :py:class:`Summarizer` for an example.

        The following metrics are accepted (case sensitive):
         - mean: a vector that contains the coefficient-wise mean.
         - variance: a vector that contains the coefficient-wise variance.
         - count: the count of all vectors seen.
         - numNonzeros: a vector with the number of non-zeros for each coefficient.
         - max: the maximum for each coefficient.
         - min: the minimum for each coefficient.
         - normL2: the Euclidean norm for each coefficient.
         - normL1: the L1 norm of each coefficient (sum of the absolute values).

        :param metrics:
         metrics that can be provided.
        :return:
         an object of :py:class:`pyspark.ml.stat.SummaryBuilder`

        Note: Currently, the performance of this interface is about 2x~3x slower than using the RDD
        interface.
        """
        sc = SparkContext._active_spark_context
        js = JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer.metrics",
                                       _to_seq(sc, metrics))
        return SummaryBuilder(js)
Example #8
    def metrics(*metrics):
        """
        Given a list of metrics, provides a builder that in turn computes metrics from a column.

        See the documentation of :py:class:`Summarizer` for an example.

        The following metrics are accepted (case sensitive):
         - mean: a vector that contains the coefficient-wise mean.
         - sum: a vector that contains the coefficient-wise sum.
         - variance: a vector that contains the coefficient-wise variance.
         - std: a vector that contains the coefficient-wise standard deviation.
         - count: the count of all vectors seen.
         - numNonzeros: a vector with the number of non-zeros for each coefficient.
         - max: the maximum for each coefficient.
         - min: the minimum for each coefficient.
         - normL2: the Euclidean norm for each coefficient.
         - normL1: the L1 norm of each coefficient (sum of the absolute values).

        :param metrics:
         metrics that can be provided.
        :return:
         an object of :py:class:`pyspark.ml.stat.SummaryBuilder`

        Note: Currently, the performance of this interface is about 2x~3x slower than using the RDD
        interface.
        """
        sc = SparkContext._active_spark_context
        js = JavaWrapper._new_java_obj(
            "org.apache.spark.ml.stat.Summarizer.metrics",
            _to_seq(sc, metrics))
        return SummaryBuilder(js)
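A short usage sketch for the builder returned by Summarizer.metrics; this is illustrative and assumes an active SparkSession `spark`.

    # Illustrative only: summarize a vector column with the metrics builder
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.stat import Summarizer

    df = spark.createDataFrame(
        [(Vectors.dense(1.0, 2.0),), (Vectors.dense(3.0, 4.0),)], ["features"])
    summarizer = Summarizer.metrics("mean", "count")
    df.select(summarizer.summary(df.features)).show(truncate=False)
    # Single-metric shortcuts such as Summarizer.mean(df.features) are also available.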
Example #9
def to_java_params(sc, model, pyParamMap):
    paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
    for param, value in pyParamMap.items():
        java_param = model._java_obj.getParam(param.name)
        java_value = _py2java(sc, value)
        paramMap.put([java_param.w(java_value)])
    return paramMap
Example #10
 def _get_single_metric(col: Column, weightCol: Optional[Column],
                        metric: str) -> Column:
     col, weightCol = Summarizer._check_param(col, weightCol)
     return Column(
         JavaWrapper._new_java_obj(
             "org.apache.spark.ml.stat.Summarizer." + metric, col._jc,
             weightCol._jc))
Example #11
def _stages_java2py(java_stages):
    """
    Transforms a list of Java stages into a list of Python stages.
    :param java_stages: An array of Java stages.
    :return: An array of Python stages.
    """

    return [JavaWrapper._transfer_stage_from_java(stage) for stage in java_stages]
Example #12
def _stages_java2py(java_stages):
    """
    Transforms a list of Java stages into a list of Python stages.
    :param java_stages: An array of Java stages.
    :return: An array of Python stages.
    """

    return [
        JavaWrapper._transfer_stage_from_java(stage) for stage in java_stages
    ]
Example #13
 def _transfer_param_map_to_java(self, pyParamMap):
     """
     Transforms a Python ParamMap into a Java ParamMap.
     """
     paramMap = JavaWrapper._new_java_obj("org.apache.spark.ml.param.ParamMap")
     for param in self.params:
         if param in pyParamMap:
             pair = self._make_java_param_pair(param, pyParamMap[param])
             paramMap.put([pair])
     return paramMap
Example #14
 def _from_java(cls, java_stage):
     """
     Given a Java PipelineModel, create and return a Python wrapper of it.
     Used for ML persistence.
     """
     # Load information from java_stage to the instance.
     py_stages = [JavaWrapper._from_java(s) for s in java_stage.stages()]
     # Create a new instance of this stage.
     py_stage = cls(py_stages)
     py_stage._resetUid(java_stage.uid())
     return py_stage
Example #15
 def _py2j(self, arg):
     if isinstance(arg, dict):
         return ScalaMap(arg)._to_java()
     elif isinstance(arg, StructType):
         return JavaWrapper._new_java_obj(
             "org.apache.spark.sql.types.DataType.fromJson",
             json.dumps(arg.jsonValue()))
     elif isinstance(arg, SageMakerJavaWrapper):
         return arg._to_java()
     else:
         return arg
Example #16
 def _from_java(cls, java_stage):
     """
     Given a Java PipelineModel, create and return a Python wrapper of it.
     Used for ML persistence.
     """
     # Load information from java_stage to the instance.
     py_stages = [JavaWrapper._from_java(s) for s in java_stage.stages()]
     # Create a new instance of this stage.
     py_stage = cls(py_stages)
     py_stage._resetUid(java_stage.uid())
     return py_stage
Example #17
 def test_new_java_array(self):
     # test array of strings
     str_list = ["a", "b", "c"]
     java_class = self.sc._gateway.jvm.java.lang.String
     java_array = JavaWrapper._new_java_array(str_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), str_list)
     # test array of integers
     int_list = [1, 2, 3]
     java_class = self.sc._gateway.jvm.java.lang.Integer
     java_array = JavaWrapper._new_java_array(int_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), int_list)
     # test array of floats
     float_list = [0.1, 0.2, 0.3]
     java_class = self.sc._gateway.jvm.java.lang.Double
     java_array = JavaWrapper._new_java_array(float_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), float_list)
     # test array of bools
     bool_list = [False, True, True]
     java_class = self.sc._gateway.jvm.java.lang.Boolean
     java_array = JavaWrapper._new_java_array(bool_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), bool_list)
     # test array of Java DenseVectors
     v1 = DenseVector([0.0, 1.0])
     v2 = DenseVector([1.0, 0.0])
     vec_java_list = [_py2java(self.sc, v1), _py2java(self.sc, v2)]
     java_class = self.sc._gateway.jvm.org.apache.spark.ml.linalg.DenseVector
     java_array = JavaWrapper._new_java_array(vec_java_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), [v1, v2])
     # test empty array
     java_class = self.sc._gateway.jvm.java.lang.Integer
     java_array = JavaWrapper._new_java_array([], java_class)
     self.assertEqual(_java2py(self.sc, java_array), [])
Example #18
 def test_new_java_array(self):
     # test array of strings
     str_list = ["a", "b", "c"]
     java_class = self.sc._gateway.jvm.java.lang.String
     java_array = JavaWrapper._new_java_array(str_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), str_list)
     # test array of integers
     int_list = [1, 2, 3]
     java_class = self.sc._gateway.jvm.java.lang.Integer
     java_array = JavaWrapper._new_java_array(int_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), int_list)
     # test array of floats
     float_list = [0.1, 0.2, 0.3]
     java_class = self.sc._gateway.jvm.java.lang.Double
     java_array = JavaWrapper._new_java_array(float_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), float_list)
     # test array of bools
     bool_list = [False, True, True]
     java_class = self.sc._gateway.jvm.java.lang.Boolean
     java_array = JavaWrapper._new_java_array(bool_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), bool_list)
     # test array of Java DenseVectors
     v1 = DenseVector([0.0, 1.0])
     v2 = DenseVector([1.0, 0.0])
     vec_java_list = [_py2java(self.sc, v1), _py2java(self.sc, v2)]
     java_class = self.sc._gateway.jvm.org.apache.spark.ml.linalg.DenseVector
     java_array = JavaWrapper._new_java_array(vec_java_list, java_class)
     self.assertEqual(_java2py(self.sc, java_array), [v1, v2])
     # test empty array
     java_class = self.sc._gateway.jvm.java.lang.Integer
     java_array = JavaWrapper._new_java_array([], java_class)
     self.assertEqual(_java2py(self.sc, java_array), [])
Example #19
    def _from_java(cls, java_stage):
        """
        Given a Java CrossValidatorModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """

        # Load information from java_stage to the instance.
        bestModel = JavaWrapper._from_java(java_stage.bestModel())
        estimator, epms, evaluator = super(CrossValidatorModel, cls)._from_java_impl(java_stage)
        # Create a new instance of this stage.
        py_stage = cls(bestModel=bestModel)\
            .setEstimator(estimator).setEstimatorParamMaps(epms).setEvaluator(evaluator)
        py_stage._resetUid(java_stage.uid())
        return py_stage
Example #20
    def _new_java_obj(self, java_class, *args):
        """
        Creates a java object. We convert SageMakerJavaClass arguments
        to their java versions and then hand over to JavaWrapper

        :param java_class: Java ClassName
        :param args: constructor arguments
        :return: Java Instance
        """

        java_args = []
        for arg in args:
            java_args.append(self._py2j(arg))

        return JavaWrapper._new_java_obj(java_class, *java_args)
Example #21
    def _from_java(cls, java_stage):
        """
        Given a Java TrainValidationSplitModel, create and return a Python wrapper of it.
        Used for ML persistence.
        """

        # Load information from java_stage to the instance.
        bestModel = JavaWrapper._from_java(java_stage.bestModel())
        estimator, epms, evaluator = \
            super(TrainValidationSplitModel, cls)._from_java_impl(java_stage)
        # Create a new instance of this stage.
        py_stage = cls(bestModel=bestModel)\
            .setEstimator(estimator).setEstimatorParamMaps(epms).setEvaluator(evaluator)
        py_stage._resetUid(java_stage.uid())
        return py_stage
Example #22
    def _to_java(self):
        """
        Transfer this instance to a Java PipelineModel.  Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        gateway = SparkContext._gateway
        cls = SparkContext._jvm.org.apache.spark.ml.Transformer
        java_stages = gateway.new_array(cls, len(self.stages))
        for idx, stage in enumerate(self.stages):
            java_stages[idx] = stage._to_java()

        _java_obj =\
            JavaWrapper._new_java_obj("org.apache.spark.ml.PipelineModel", self.uid, java_stages)

        return _java_obj
Example #23
    def _to_java(self):
        """
        Transfer this instance to a Java PipelineModel.  Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        gateway = SparkContext._gateway
        cls = SparkContext._jvm.org.apache.spark.ml.Transformer
        java_stages = gateway.new_array(cls, len(self.stages))
        for idx, stage in enumerate(self.stages):
            java_stages[idx] = stage._to_java()

        _java_obj =\
            JavaWrapper._new_java_obj("org.apache.spark.ml.PipelineModel", self.uid, java_stages)

        return _java_obj
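For context, the public persistence entry points that rely on PipelineModel's _to_java/_from_java (a hedged sketch; assumes a fitted `pipeline_model` and a writable `path`):

    # Illustrative only: save() serializes via _to_java(), load() rebuilds via _from_java()
    pipeline_model.save(path)
    from pyspark.ml import PipelineModel
    loaded = PipelineModel.load(path)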
Example #24
    def _to_java(self):
        """
        Transfer this instance to a Java CrossValidator. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        estimator, epms, evaluator = super(CrossValidator, self)._to_java_impl()

        _java_obj = JavaWrapper._new_java_obj("org.apache.spark.ml.tuning.CrossValidator", self.uid)
        _java_obj.setEstimatorParamMaps(epms)
        _java_obj.setEvaluator(evaluator)
        _java_obj.setEstimator(estimator)
        _java_obj.setSeed(self.getSeed())
        _java_obj.setNumFolds(self.getNumFolds())

        return _java_obj
Example #25
    def _to_java(self):
        """
        Transfer this instance to a Java TrainValidationSplitModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context

        _java_obj = JavaWrapper._new_java_obj(
            "org.apache.spark.ml.tuning.TrainValidationSplitModel", self.uid,
            self.bestModel._to_java(), _py2java(sc, []))
        estimator, epms, evaluator = super(TrainValidationSplitModel,
                                           self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)
        return _java_obj
Example #26
    def _to_java(self):
        """
        Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context

        _java_obj = JavaWrapper._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                              self.uid,
                                              self.bestModel._to_java(),
                                              _py2java(sc, []))
        estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)
        return _java_obj
Example #27
    def _to_java(self):
        """
        Transfer this instance to a Java TrainValidationSplit. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        estimator, epms, evaluator = super(TrainValidationSplit,
                                           self)._to_java_impl()

        _java_obj = JavaWrapper._new_java_obj(
            "org.apache.spark.ml.tuning.TrainValidationSplit", self.uid)
        _java_obj.setEstimatorParamMaps(epms)
        _java_obj.setEvaluator(evaluator)
        _java_obj.setEstimator(estimator)
        _java_obj.setTrainRatio(self.getTrainRatio())
        _java_obj.setSeed(self.getSeed())

        return _java_obj
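Similarly, a hedged round-trip sketch for the tuning classes above (assumes a configured CrossValidator `cv` and a writable `path`; in PySpark versions that expose ML persistence for tuning, save/load delegate to the _to_java/_from_java methods shown here):

    # Illustrative only
    cv.save(path)
    from pyspark.ml.tuning import CrossValidator
    cv_loaded = CrossValidator.load(path)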
Example #28
def main():

    try:

        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_train.csv')
        test = spark.read.schema(get_btrain_schema()).option('header', True).csv(
            DATASET_PATH + '/emp_test.csv')

        # preprocess
        LABEL = 'Attrition'
        FEATURES = 'features'
        features = [c for c in train.columns if c != LABEL]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        train = assembler.transform(train).select(FEATURES, LABEL)
        test = assembler.transform(test).select(FEATURES, LABEL)

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False, "missing": np.nan,
            "eval_metric": "logloss", "min_child_weight": 1, "train_test_ratio": 0.8,
            "objective": "binary:logistic"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability')) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exists, removing old model')
        jw = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel.XGBoostClassificationModelWriter", jmodel)
        jw.saveImpl(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.format_exc())

    finally:
        # stop spark
        spark.stop()
Example #29
 def fscore(self, fmap=""):
     jxgb = JavaWrapper(self.getBooster())
     return jxgb._call_java("getFeatureScore", fmap)
Example #30
 def empty(cls):
     return JavaWrapper._new_java_obj("scala.Option.empty")
Example #31
    def _handleOverwrite(self, path):
        from pyspark.ml.wrapper import JavaWrapper

        _java_obj = JavaWrapper._new_java_obj("org.apache.spark.ml.util.FileSystemOverwrite")
        wrapper = JavaWrapper(_java_obj)
        wrapper._call_java("handleOverwrite", path, True, self.sc._jsc.sc())
Example #32
 def _get_single_metric(col, weightCol, metric):
     col, weightCol = Summarizer._check_param(col, weightCol)
     return Column(JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer." + metric,
                                             col._jc, weightCol._jc))
Example #33
 def get_dump(self, fmap="", with_stats=True, format="text"):
     jxgb = JavaWrapper(self.getBooster())
     return jxgb._call_java("getModelDump", fmap, with_stats, format)
Example #34
def main():

    try:

        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        train = spark.read.csv(DATASET_PATH + "/iris_train.csv",
                    get_mtrain_schema(),
                    header=True)
        test = spark.read.csv(DATASET_PATH + "/iris_test.csv",
                    get_mtrain_schema(),
                    header=True)

        # preprocess
        # get label encode result from csv, since StringIndexer will get different result
        STR_LABEL = 'class'
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in train.columns if c not in [STR_LABEL, LABEL]]
        assembler = VectorAssembler(inputCols=features, outputCol=FEATURES)
        pipeline = Pipeline(stages=[assembler])
        preprocess = pipeline.fit(train)
        train = preprocess.transform(train).select(FEATURES, LABEL)
        test = preprocess.transform(test).select(FEATURES, LABEL)

        # set param map
        xgb_params = {
            "eta": 0.1, "eval_metric": "mlogloss",
            "gamma": 0, "max_depth": 5, "min_child_weight": 1.0,
            "objective": "multi:softprob", "seed": 0,
            "num_class": N_CLASS,
            # xgboost4j only
            "num_round": 100, "num_early_stopping_rounds": 10,
            "maximize_evaluation_metrics": False,
            "num_workers": 1, "use_external_memory": False,
            "missing": np.nan,
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)

        # set evaluation set
        eval_set = {'eval': test._jdf}
        scala_eval_set = spark._jvm.PythonUtils.toScalaMap(eval_set)

        logger.info('training')
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL) \
            .setEvalSets(scala_eval_set)
        jmodel = j.fit(train._jdf)
        print_summary(jmodel)

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('[xgboost4j] valid logloss: {}'.format(slogloss))

        # save model - using native booster for single node library to read
        model_path = MODEL_PATH + '/model.bin'
        logger.info('save model to {}'.format(model_path))
        jbooster = jmodel.nativeBooster()
        jbooster.saveModel(model_path)

        # get feature score
        imp_type = "gain"
        feature_map_path = MODEL_PATH + '/feature.map'
        create_feature_map(feature_map_path, features)
        jfeatureMap = jbooster.getScore(feature_map_path, imp_type)
        f_imp = dict()
        for feature in features:
            if not jfeatureMap.get(feature).isEmpty():
                f_imp[feature] = jfeatureMap.get(feature).get()
        feature_imp_path = MODEL_PATH + '/feature.imp'
        create_feature_imp(feature_imp_path, f_imp)

        # [Optional] load model training by xgboost, predict and get validation metric
        local_model_path = LOCAL_MODEL_PATH + '/model.bin'
        if os.path.exists(local_model_path):
            logger.info('load model from {}'.format(local_model_path))
            scala_xgb = spark.sparkContext._jvm.ml.dmlc.xgboost4j.scala.XGBoost
            jbooster = scala_xgb.loadModel(local_model_path)

            # uid, num_class, booster
            xgb_cls_model = JavaWrapper._new_java_obj(
                "ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel",
                "xgbc", N_CLASS, jbooster)

            jpred = xgb_cls_model.transform(test._jdf)
            pred = DataFrame(jpred, spark)
            slogloss = pred.withColumn('log_loss', udf_logloss(LABEL, 'probability', N_CLASS)) \
                .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
            logger.info('[xgboost] valid logloss: {}'.format(slogloss))
        else:
            logger.info(
                "local model is not exist, call python_xgb/train_multi.py to get the model "
                "and compare logloss between xgboost and xgboost4j"
            )

    except Exception:
        logger.error(traceback.format_exc())

    finally:
        # stop spark
        spark.stop()
Example #35
 def saveBooster(self, save_path):
     jxgb = JavaWrapper(self.getBooster())
     jxgb._call_java("saveModel", save_path)
Example #36
def main():

    try:

        # init spark
        spark = get_spark(app_name="pyspark-xgb")

        # get logger
        logger = get_logger(spark, "app")

        # load data
        df = spark.read.csv(DATASET_PATH + "/iris.data", get_mtrain_schema())

        # preprocess
        LABEL = 'label'
        FEATURES = 'features'
        N_CLASS = 3
        features = [c for c in df.columns if c != "class"]
        assembler = VectorAssembler(inputCols=features, outputCol='features')
        strIdxer = StringIndexer(inputCol="class", outputCol=LABEL)
        pipeline = Pipeline(stages=[assembler, strIdxer])
        df = pipeline.fit(df).transform(df).select(FEATURES, LABEL)
        train, test = df.randomSplit([0.8, 0.2])

        # training
        logger.info('training')
        xgb_params = {
            "eta": 0.1, "gamma": 0, "max_depth": 4,
            "num_round": 100, "num_early_stopping_rounds": 10,
            "num_workers": 1, "use_external_memory": False, "missing": np.nan,
            "num_class": 3, "eval_metric": "mlogloss",
            "min_child_weight": 1, "train_test_ratio": 0.8,
            "objective": "multi:softprob"
        }
        scala_map = spark._jvm.PythonUtils.toScalaMap(xgb_params)
        j = JavaWrapper._new_java_obj(
            "ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier", scala_map) \
            .setFeaturesCol(FEATURES).setLabelCol(LABEL)
        jmodel = j.train(train._jdf)
        logger.info(jmodel.summary().toString())

        # get validation metric
        preds = jmodel.transform(test._jdf)
        pred = DataFrame(preds, spark)
        slogloss = pred.withColumn('log_loss', udf_logloss('label', 'probability', N_CLASS)) \
            .agg({"log_loss": "mean"}).collect()[0]['avg(log_loss)']
        logger.info('valid logloss: {}'.format(slogloss))

        # save or update model
        model_path = MODEL_PATH + '/model.bin'
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
            logger.info('model exists, removing old model')
        jmodel.save(model_path)
        logger.info('save model to {}'.format(model_path))

    except Exception:
        logger.error(traceback.format_exc())

    finally:
        # stop spark
        spark.stop()
Example #37
 def _getCreds(cls):
     return JavaWrapper._new_java_obj(
         "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")