Example #1
    def test_element_wise_product(self):
        data = self.spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]), )],
                                          ["features"])
        model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                                   inputCol="features",
                                   outputCol="eprod")
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ElementwiseProduct',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().eprod.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        ]
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlElementwiseProduct")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['eprod'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
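A note on the shape passed to FloatTensorType: [1, feature_count] pins the batch dimension to a single row, so the exported graph can only score one row per call. A minimal standalone sketch (it only builds the initial_types lists; the names are illustrative) of leaving the batch dimension dynamic instead:

    from onnxmltools.convert.common.data_types import FloatTensorType

    feature_count = 3
    # A leading None (or a string symbol) leaves the batch size dynamic, so
    # the same exported model can score any number of rows per call.
    fixed_batch = [('features', FloatTensorType([1, feature_count]))]
    dynamic_batch = [('features', FloatTensorType([None, feature_count]))]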
Example #2
    def test_xgboost_booster_classifier_reg(self):
        x, y = make_classification(n_classes=2,
                                   n_features=5,
                                   n_samples=100,
                                   random_state=42,
                                   n_informative=3)
        y = y.astype(np.float32) + 0.567
        x_train, x_test, y_train, _ = train_test_split(x,
                                                       y,
                                                       test_size=0.5,
                                                       random_state=42)

        data = DMatrix(x_train, label=y_train)
        model = train(
            {
                'objective': 'reg:squarederror',
                'n_estimators': 3,
                'min_child_samples': 1
            }, data)
        model_onnx = convert_xgboost(
            model, 'tree-based classifier',
            [('input', FloatTensorType([None, x.shape[1]]))])
        dump_data_and_model(
            x_test.astype(np.float32),
            model,
            model_onnx,
            allow_failure=
            "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
            basename="XGBBoosterReg")
Example #3
 def test_gbt_classifier(self):
     raw_data = self.spark.createDataFrame(
         [(1.0, Vectors.dense(1.0)),
          (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
     string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
     si_model = string_indexer.fit(raw_data)
     data = si_model.transform(raw_data)
     gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml GBT Classifier',
         [('features', FloatTensorType([1, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(
             lambda x: pandas.Series(x.toArray())).values.astype(
                 numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlGBTClassifier")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                            data_np, onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #4
    def test_xgboost_example_mnist(self):
        """
        Train a simple xgboost model and store associated artefacts.
        """
        X, y = load_digits(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        X_train = X_train.reshape((X_train.shape[0], -1))
        X_test = X_test.reshape((X_test.shape[0], -1))

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        clf = XGBClassifier(objective="multi:softprob", n_jobs=-1)
        clf.fit(X_train, y_train)

        sh = [None, X_train.shape[1]]
        onnx_model = convert_xgboost(clf,
                                     initial_types=[('input',
                                                     FloatTensorType(sh))])

        dump_data_and_model(
            X_test.astype(np.float32),
            clf,
            onnx_model,
            allow_failure=
            "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
            basename="XGBoostExample")
Example #5
 def _test_one_class_classification_core(self, model):
     X = [[0., 1.], [1., 1.], [2., 0.]]
     y = [1, 1, 1]
     model.fit(X, y)
     model_onnx = convert_sklearn(model, 'tree-based classifier',
                                  [('input', FloatTensorType([1, 2]))])
     self.assertTrue(model_onnx is not None)
Example #6
 def test_xgb_regressor(self):
     diabetes = load_diabetes()
     x = diabetes.data
     y = diabetes.target
     x_train, x_test, y_train, _ = train_test_split(x,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=42)
     xgb = XGBRegressor()
     xgb.fit(x_train, y_train)
     conv_model = convert_xgboost(xgb,
                                  initial_types=[
                                      ('input',
                                       FloatTensorType(shape=[None, None]))
                                  ])
     self.assertTrue(conv_model is not None)
     dump_data_and_model(
         x_test.astype("float32"),
         xgb,
         conv_model,
         basename="SklearnXGBRegressor-Dec3",
         allow_failure="StrictVersion("
         "onnx.__version__)"
         "< StrictVersion('1.3.0')",
     )
Example #7
 def test_aft_regression_survival(self):
     data = self.spark.createDataFrame(
         [(1.0, Vectors.dense(1.0), 1.0),
          (1e-40, Vectors.sparse(1, [], []), 0.0)],
         ["label", "features", "censor"])
     gbt = AFTSurvivalRegression()
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml AFTSurvivalRegression',
         [('features', FloatTensorType([1, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlAFTSurvivalRegression")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #8
    def test_maxabs_scaler(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.dense([1.0, 0.1, -1.0]),
        ), (
            1,
            Vectors.dense([2.0, 1.1, 1.0]),
        ), (
            2,
            Vectors.dense([3.0, 10.1, 3.0]),
        )], ["id", "features"])
        scaler = MaxAbsScaler(inputCol='features', outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(
            model, 'Sparkml MaxAbsScaler',
            [('features', FloatTensorType([None, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlMaxAbsScaler")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #9
    def test_convert_nusvmc(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[y == 2] = 1

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = NuSVC
        param.kernel_type = svmutil.RBF
        param.eps = 1
        param.probability = 1
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        node = convert(libsvm_model, "LibSvmNuSvmc",
                       [('input', FloatTensorType(shape=['None', 'None']))])
        self.assertTrue(node is not None)
        dump_data_and_model(
            X[:5].astype(numpy.float32),
            SkAPIClProba2(libsvm_model),
            node,
            basename="LibSvmNuSvmc-Dec2",
            allow_failure=
            "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3')")
Example #10
    def test_convert_svmc_linear_raw_multi(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[-5:] = 3

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = SVC
        param.kernel_type = svmutil.LINEAR
        param.eps = 1
        param.probability = 0
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        node = convert(libsvm_model, "LibSvmNuSvmcMultiRaw",
                       [('input', FloatTensorType(shape=['None', 2]))])
        self.assertTrue(node is not None)
        X2 = numpy.vstack([X[:2], X[60:62], X[110:112],
                           X[147:149]])  # two rows from each of the 4 classes
        dump_data_and_model(
            X2.astype(numpy.float32),
            SkAPICl(libsvm_model),
            node,
            basename="LibSvmSvmcRaw-Dec3",
            verbose=False,
            allow_failure=
            "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3')")
Example #11
    def test_chi_sq_selector(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
             (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
             (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
            ["features", "label"])
        selector = ChiSqSelector(numTopFeatures=1,
                                 outputCol="selectedFeatures")
        model = selector.fit(data)

        # the input name should match the selector's features column
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ChiSqSelector',
            [('features', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().selectedFeatures.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlChiSqSelector")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #12
    def test_one_vs_rest(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_multiclass_classification_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)
        lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(
            model,
            'Sparkml OneVsRest',
            [('features', FloatTensorType([None, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlOneVsRest")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['prediction'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #13
    def convert(self, model, data, args, model_name):
        from skl2onnx import convert_sklearn
        from onnxmltools.convert.common.data_types import FloatTensorType

        self.configure(data, model, args)

        with Timer() as t:
            batch = min(len(data.X_test), self.params["batch_size"])
            remainder = len(data.X_test) % batch
            initial_type = [("input", FloatTensorType([batch, self.params["input_size"]]))]

            self.model = convert_sklearn(model, initial_types=initial_type)
            if remainder > 0:
                initial_type = [("input", FloatTensorType([remainder, self.params["input_size"]]))]
                self.remainder_model = convert_sklearn(model, initial_types=initial_type, target_opset=11)
        return t.interval
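The two exports above exist because the batch dimension in FloatTensorType([batch, input_size]) is fixed: onnxruntime only accepts tensors whose first dimension matches it exactly, so the trailing partial batch needs its own remainder-sized model. A hedged sketch of routing rows at prediction time (the helper is illustrative and assumes the two models have been loaded into onnxruntime sessions whose input is named 'input', as in the initial_type above):

    # Illustrative helper: full-size batches go to the main session, the
    # trailing partial batch to the remainder-sized session.
    def predict_in_batches(sess, remainder_sess, X, batch):
        outputs = []
        n_full = (len(X) // batch) * batch
        for start in range(0, n_full, batch):
            outputs.append(sess.run(None, {'input': X[start:start + batch]})[0])
        if n_full < len(X):
            outputs.append(remainder_sess.run(None, {'input': X[n_full:]})[0])
        return outputs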
Example #14
    def test_model_pca(self):
        data = self.spark.createDataFrame(
            [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
             (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
             (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )], ["features"])
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        model = pca.fit(data)

        # the input name should match the inputCol above
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml PCA',
            [('features', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().pca_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPCA")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #15
 def test_tree_one_class_classification(self):
     features = [[0., 1.], [1., 1.], [2., 0.]]
     features = numpy.array(features, dtype=numpy.float32)
     labels = [1, 1, 1]
     dd = [(labels[i], Vectors.dense(features[i]))
           for i in range(len(labels))]
     data = self.spark.createDataFrame(
         self.spark.sparkContext.parallelize(dd),
         schema=["label", "features"])
     dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
     model = dt.fit(data)
     model_onnx = convert_sparkml(
         model,
         'Sparkml Decision Tree One Class',
         [('features', FloatTensorType([None, 2]))],
         spark_session=self.spark)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     predicted = model.transform(data)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(
             lambda x: pandas.Series(x.toArray())).values.astype(
                 numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlDecisionTreeBinaryClass")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                            data_np, onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #16
    def test_model_onehot_encoder(self):
        encoder = OneHotEncoderEstimator(inputCols=['index'],
                                         outputCols=['indexVec'])
        data = self.spark.createDataFrame([(0.0, ), (1.0, ), (2.0, ), (2.0, ),
                                           (0.0, ), (2.0, )], ['index'])
        model = encoder.fit(data)
        model_onnx = convert_sparkml(model, 'Sparkml OneHotEncoder',
                                     [('index', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.select("index").toPandas().values.astype(numpy.float32)
        predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
            lambda x: x.toArray().tolist()).values
        expected = numpy.asarray(
            [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])

        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlOneHotEncoder")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexVec'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #17
    def do_training(self, X, Y, create_onnx=True):
        print('Pre-processing ...')
        X = self.transform(X, do_fit=True)

        new_columns = {}
        for i, c in enumerate(X.columns):
            new_columns[c] = i
        X = X.rename(columns=new_columns)

        print('Training with ' + str(len(X.columns)) + ' columns ...')
        self.clfs = []
        clf = xgb.XGBClassifier(n_estimators=1700,
                                nthread=32,
                                max_depth=6,
                                learning_rate=0.024,
                                subsample=0.8,
                                colsample_bytree=0.65)
        this_model = clf.fit(X, Y)
        self.clfs.append(clf)

        if create_onnx:
            print('Converting models into ONNX ...')
            onnx_ml_models = []
            for i, clf in enumerate(self.clfs):
                initial_type = [
                    ('dense_input',
                     FloatTensorType([None,
                                      len(self.pipeline.output_columns)]))
                ]
                onnx_ml_models.append(
                    convert_xgboost(clf, initial_types=initial_type))

            self.create_onnx('mental_health', onnx_ml_models)
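The name passed in initial_type ('dense_input' here) becomes the graph input name of the converted model, so it is also the key onnxruntime expects when the model is scored later. A small hedged sketch (the score helper is hypothetical, not part of the class above):

    import numpy as np
    from onnxruntime import InferenceSession

    # Hypothetical scoring helper: the feed-dictionary key must match the
    # name declared in initial_type at conversion time.
    def score(onnx_ml_model, X):
        sess = InferenceSession(onnx_ml_model.SerializeToString())
        return sess.run(None, {'dense_input': X.astype(np.float32)})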
Example #18
    def test_vector_slicer(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
             (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
             (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
        model = VectorSlicer(inputCol="features",
                             outputCol="sliced",
                             indices=[1, 4])

        feature_count = data.first()[0].array.size
        model_onnx = convert_sparkml(
            model, 'Sparkml VectorSlicer',
            [('features', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().sliced.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlVectorSlicer")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['sliced'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #19
 def test_model_vector_indexer_single(self):
     vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
     data = self.spark.createDataFrame([(Vectors.dense([-1.0]), ),
                                        (Vectors.dense([0.0]), ),
                                        (Vectors.dense([0.0]), )], ["a"])
     model = vi.fit(data)
     model_onnx = convert_sparkml(
         model,
         'Sparkml VectorIndexer Single',
         [('a', FloatTensorType([None, model.numFeatures]))],
         target_opset=9)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     expected = predicted.toPandas().indexed.apply(
         lambda x: pandas.Series(x.toArray())).values
     data_np = data.toPandas().a.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlVectorIndexerSingle")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['indexed'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #20
    def test_convert_svmc(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[y == 2] = 1

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = SVC
        param.kernel_type = svmutil.RBF
        param.eps = 1
        param.probability = 1
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        node = convert(libsvm_model, "LibSvmSvmc",
                       [('input', FloatTensorType())])
        self.assertTrue(node is not None)
        dump_data_and_model(X[:5].astype(numpy.float32),
                            SkAPIClProba2(libsvm_model),
                            node,
                            basename="LibSvmSvmc-Dec2")
Example #21
    def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([1.2, 3.2, 1.3, -5.6]), ),
             (Vectors.dense([4.3, -3.2, 5.7, 1.0]), ),
             (Vectors.dense([0, 3.2, 4.7, -8.9]), )], ["dense"])
        model = PolynomialExpansion(degree=2,
                                    inputCol="dense",
                                    outputCol="expanded")

        # the input name should match the inputCol above
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml PolynomialExpansion',
            [('dense', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().expanded.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().dense.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPolynomialExpansion")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['expanded'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #22
    def test_convert_svmc_raw(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[y == 2] = 1

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = SVC
        param.kernel_type = svmutil.RBF
        param.eps = 1
        param.probability = 0
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        # known svm runtime dimension error in ONNX Runtime
        node = convert(libsvm_model, "LibSvmSvmcRaw",
                       [('input', FloatTensorType(shape=['None', 'None']))])
        self.assertTrue(node is not None)
        dump_data_and_model(
            X[:5].astype(numpy.float32),
            SkAPICl(libsvm_model),
            node,
            basename="LibSvmSvmcRaw",
            allow_failure=
            "StrictVersion(onnxruntime.__version__) < StrictVersion('0.5.0')")
Example #23
 def test_xgb_classifier_multi_discrete_int_labels(self):
     iris = load_iris()
     x = iris.data[:, :2]
     y = iris.target
     y[y == 0] = 10
     y[y == 1] = 20
     y[y == 2] = -30
     x_train, x_test, y_train, _ = train_test_split(x,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=42)
     xgb = XGBClassifier(n_estimators=3)
     xgb.fit(x_train, y_train)
     conv_model = convert_xgboost(xgb,
                                  initial_types=[
                                      ('input',
                                       FloatTensorType(shape=[None, None]))
                                  ])
     self.assertTrue(conv_model is not None)
     dump_data_and_model(
         x_test.astype("float32"),
         xgb,
         conv_model,
         basename="SklearnXGBClassifierMultiDiscreteIntLabels",
         allow_failure="StrictVersion("
         "onnx.__version__)"
         "< StrictVersion('1.3.0')",
     )
Example #24
    def test_model_binarizer(self):
        data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                          ["id", "feature"])
        model = Binarizer(inputCol='feature', outputCol='binarized')

        # the input name should match the inputCol above
        model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                     [('feature', FloatTensorType([None, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("binarized").toPandas().values.astype(
            numpy.float32)
        data_np = data.select('feature').toPandas().values.astype(
            numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBinarizer")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['binarized'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #25
def calculate_sparkml_scaler_output_shapes(operator):
    check_input_and_output_numbers(operator, output_count_range=1)
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])

    input_shape = copy.deepcopy(operator.inputs[0].type.shape)
    operator.outputs[0].type = FloatTensorType(input_shape)
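In other words, the calculator accepts either float or int64 input, keeps the [N, C] layout of the input, and always declares a float output. A standalone illustration of that rule (it only constructs the type objects):

    from onnxmltools.convert.common.data_types import (FloatTensorType,
                                                       Int64TensorType)

    # A scaler fed an int64 tensor of shape [None, 3] is declared to produce
    # a float tensor with the same [None, 3] shape.
    input_type = Int64TensorType([None, 3])
    output_type = FloatTensorType(list(input_type.shape))
    print(output_type.shape)  # [None, 3]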
Example #26
 def test_xgboost_classifier_i5450(self):
     iris = load_iris()
     X, y = iris.data, iris.target
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         random_state=10)
     clr = XGBClassifier(objective="multi:softmax",
                         max_depth=1,
                         n_estimators=2)
     clr.fit(X_train,
             y_train,
             eval_set=[(X_test, y_test)],
             early_stopping_rounds=40)
     initial_type = [('float_input', FloatTensorType([None, 4]))]
     onx = convert_xgboost(clr, initial_types=initial_type)
     sess = InferenceSession(onx.SerializeToString())
     input_name = sess.get_inputs()[0].name
     label_name = sess.get_outputs()[1].name
     predict_list = [1., 20., 466., 0.]
     predict_array = np.array(predict_list).reshape(
         (1, -1)).astype(np.float32)
     pred_onx = sess.run([label_name], {input_name: predict_array})[0]
      pred_xgboost = clr.predict_proba(predict_array)
     bst = clr.get_booster()
     bst.dump_model('dump.raw.txt')
     dump_data_and_model(
         X_test.astype(np.float32) + 1e-5,
         clr,
         onx,
         allow_failure=
         "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
         basename="XGBClassifierIris")
Example #27
    def test_random_forrest_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register(
            "truncateFeatures",
            lambda x: SparseVector(feature_count, range(0, feature_count),
                                   x.toArray()[125:130]), VectorUDT())
        data = original_data.selectExpr(
            "cast(label as string) as label",
            "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label",
                                      outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=10,
                                        handleInvalid='error')

        rf = RandomForestRegressor(labelCol="indexedLabel",
                                   featuresCol="indexedFeatures",
                                   numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(
            model,
            'Sparkml RandomForest Regressor',
            [('label', StringTensorType([1, 1])),
             ('features', FloatTensorType([1, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        data_np = {
            'label':
            data.limit(1).toPandas().label.values,
            'features':
            data.limit(1).toPandas().features.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlRandomForestRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                               data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #28
    def common_test_xgboost_10_skl(self, missing, replace=False):
        this = os.path.abspath(os.path.dirname(__file__))
        data = os.path.join(this, "data_fail.csv")
        data = pandas.read_csv(data)

        for col in data:
            dtype = data[col].dtype
            if dtype in ['float64', 'float32']:
                data[col].fillna(0., inplace=True)
            if dtype in ['int64']:
                data[col].fillna(0, inplace=True)
            elif dtype in ['O']:
                data[col].fillna('N/A', inplace=True)

        data['pclass'] = data['pclass'] * float(1)
        full_df = data.drop('survived', axis=1)
        full_labels = data['survived']

        train_df, test_df, train_labels, test_labels = train_test_split(
            full_df, full_labels, test_size=.2, random_state=11)

        col_transformer = self._column_tranformer_fitted_from_df(full_df)

        param_distributions = {
            "colsample_bytree": 0.5,
            "gamma": 0.2,
            'learning_rate': 0.3,
            'max_depth': 2,
            'min_child_weight': 1.,
            'n_estimators': 1,
            'missing': missing,
        }

        regressor = XGBRegressor(verbose=0,
                                 objective='reg:squarederror',
                                 **param_distributions)
        regressor.fit(col_transformer.transform(train_df), train_labels)
        model = Pipeline(steps=[('preprocessor',
                                 col_transformer), ('regressor', regressor)])

        update_registered_converter(XGBRegressor, 'XGBRegressor',
                                    calculate_linear_regressor_output_shapes,
                                    convert_xgb)

        # last step
        input_xgb = model.steps[0][-1].transform(test_df[:5]).astype(
            np.float32)
        if replace:
            input_xgb[input_xgb[:, :] == missing] = np.nan
        onnx_last = convert_sklearn(
            model.steps[1][-1],
            initial_types=[
                ('X', FloatTensorType(shape=[None, input_xgb.shape[1]]))
            ],
            target_opset=get_opset_number_from_onnx())
        session = rt.InferenceSession(onnx_last.SerializeToString())
        pred_skl = model.steps[1][-1].predict(input_xgb).ravel()
        pred_onx = session.run(None, {'X': input_xgb})[0].ravel()
        assert_almost_equal(pred_skl, pred_onx)
Example #29
 def test_max_abs_scaler(self):
     model = MaxAbsScaler()
     data = [[0., 0., 3.], [1., 1., 0.], [0., 2., 1.], [1., 0., 2.]]
     model.fit(data)
     model_onnx = convert_sklearn(model, 'scaler',
                                  [('input', FloatTensorType([1, 3]))])
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(numpy.array(data, dtype=numpy.float32),
                         model, model_onnx,
                         basename="SklearnMaxAbsScaler")
Example #30
    def test_xgboost_unpickle_06(self):
        # Unpickle a model trained with an old version of xgboost.
        this = os.path.dirname(__file__)
        with open(os.path.join(this, "xgboost10day.pickle.dat"), "rb") as f:
            xgb = pickle.load(f)

        conv_model = convert_xgboost(xgb, initial_types=[('features', FloatTensorType([1, 10000]))])
        assert conv_model is not None
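With the input pinned to [1, 10000], the converted graph accepts exactly one row of 10000 float32 features per call. A minimal hedged sketch of scoring it with onnxruntime, reusing conv_model from the test above:

    import numpy as np
    from onnxruntime import InferenceSession

    # conv_model comes from the test above; the fixed [1, 10000] input shape
    # means exactly one row of 10000 float32 features per run() call.
    sess = InferenceSession(conv_model.SerializeToString())
    row = np.zeros((1, 10000), dtype=np.float32)
    pred = sess.run(None, {'features': row})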