Code Example #1
File: test_imputer.py Project: xadupre/onnxmltools
    def _imputer_test_multi(self):
        data = self.spark.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])
        imputer = Imputer(inputCols=["a", "b"], outputCols=["out_a", "out_b"])
        model = imputer.fit(data)

        # the input name should match the inputCols above
        model_onnx = convert_sparkml(model, 'Sparkml Imputer Multi Input',
                                     [('a', FloatTensorType([None, 1])),
                                      ('b', FloatTensorType([None, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("out_a", "out_b").toPandas().values.astype(
            numpy.float32)
        data_np = data.toPandas().values.astype(numpy.float32)
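        # split the two feature columns into one (N, 1) array per declared ONNX input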
        data_np = {'a': data_np[:, :1], 'b': data_np[:, 1:]}
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlImputerMulti")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['out_a', 'out_b'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #2
 def test_model_vector_assembler(self):
     col_names = ["a", "b", "c"]
     model = VectorAssembler(inputCols=col_names, outputCol='features')
     data = self.spark.createDataFrame([(1., 0., 3.)], col_names)
     model_onnx = convert_sparkml(model, 'Sparkml VectorAssembler',
                                  [('a', FloatTensorType([None, 1])),
                                   ('b', FloatTensorType([None, 1])),
                                   ('c', FloatTensorType([None, 1]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     predicted = model.transform(data)
     expected = predicted.select("features").toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values
     data_np = {
         'a': data.select('a').toPandas().values.astype(numpy.float32),
         'b': data.select('b').toPandas().values.astype(numpy.float32),
         'c': data.select('c').toPandas().values.astype(numpy.float32)
     }
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlVectorAssembler")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['features'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Code Example #3
def calculate_sparkml_naive_bayes_output_shapes(operator):
    check_input_and_output_numbers(operator, output_count_range=2)
    check_input_and_output_types(operator,
                                 good_input_types=[FloatTensorType],
                                 good_output_types=[FloatTensorType, FloatTensorType])
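    # N is the (possibly symbolic) batch size and C the class count:
    # output 0 is the predicted label (N x 1), output 1 the per-class scores (N x C)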
    N = operator.inputs[0].type.shape[0]
    C = operator.raw_operator.numClasses
    operator.outputs[0].type = FloatTensorType([N, 1])
    operator.outputs[1].type = FloatTensorType([N, C])
Code Example #4
    def test_combine_inputs(self):
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        scaler = StandardScaler()
        scaler.fit([[0., 0.], [0., 0.], [1., 1.], [1., 1.]])
        model = Pipeline([('scaler1', scaler), ('scaler2', scaler)])

        model_onnx = convert_sklearn(model, 'pipeline',
                                     [('input1', FloatTensorType([1, 1])),
                                      ('input2', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(len(model_onnx.graph.node[-1].output) == 1)
Code Example #5
    def test_xgboost_booster_classifier_reg(self):
        x, y = make_classification(n_classes=2,
                                   n_features=5,
                                   n_samples=100,
                                   random_state=42,
                                   n_informative=3)
        y = y.astype(np.float32) + 0.567
        x_train, x_test, y_train, _ = train_test_split(x,
                                                       y,
                                                       test_size=0.5,
                                                       random_state=42)

        data = DMatrix(x_train, label=y_train)
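        # train a native xgboost Booster (not the sklearn wrapper) on the DMatrix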
        model = train(
            {
                'objective': 'reg:squarederror',
                'n_estimators': 3,
                'min_child_samples': 1
            }, data)
        model_onnx = convert_xgboost(
            model, 'tree-based classifier',
            [('input', FloatTensorType([None, x.shape[1]]))])
        dump_data_and_model(
            x_test.astype(np.float32),
            model,
            model_onnx,
            allow_failure=
            "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
            basename="XGBBoosterReg")
Code Example #6
    def test_one_vs_rest(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_multiclass_classification_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)
        lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlOneVsRest")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #7
 def test_xgb_classifier_multi_discrete_int_labels(self):
     iris = load_iris()
     x = iris.data[:, :2]
     y = iris.target
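     # remap the targets to non-contiguous, partly negative integer labels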
     y[y == 0] = 10
     y[y == 1] = 20
     y[y == 2] = -30
     x_train, x_test, y_train, _ = train_test_split(x,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=42)
     xgb = XGBClassifier(n_estimators=3)
     xgb.fit(x_train, y_train)
     conv_model = convert_xgboost(xgb,
                                  initial_types=[
                                      ('input',
                                       FloatTensorType(shape=[None, None]))
                                  ])
     self.assertTrue(conv_model is not None)
     dump_data_and_model(
         x_test.astype("float32"),
         xgb,
         conv_model,
         basename="SklearnXGBClassifierMultiDiscreteIntLabels",
         allow_failure="StrictVersion("
         "onnx.__version__)"
         "< StrictVersion('1.3.0')",
     )
Code Example #8
    def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame(
            [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
             (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
             (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )], ["features"])
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        model = pca.fit(data)

        # the input name should match the inputCol of the PCA above
        feature_count = data.first()[0].size
        N = data.count()
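        # pin both input dimensions: N rows of feature_count features each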
        model_onnx = convert_sparkml(
            model, 'Sparkml PCA',
            [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().pca_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPCA")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #9
    def do_training(self, X, Y, create_onnx=True):
        print('Pre-processing ...')
        X = self.transform(X, do_fit=True)

        new_columns = {}
        for i, c in enumerate(X.columns):
            new_columns[c] = i
        X = X.rename(columns=new_columns)

        print('Training with ' + str(len(X.columns)) + ' columns ...')
        self.clfs = []
        clf = xgb.XGBClassifier(n_estimators=1700,
                                nthread=32,
                                max_depth=6,
                                learning_rate=0.024,
                                subsample=0.8,
                                colsample_bytree=0.65)
        xgb_model = clf.fit(X, Y, eval_metric="auc", verbose=True)
        self.clfs.append(clf)

        if create_onnx:
            print('Converting models into ONNX ...')
            onnx_ml_models = []
            for i, clf in enumerate(self.clfs):
                initial_type = [
                    ('dense_input',
                     FloatTensorType([None,
                                      len(self.pipeline.output_columns)]))
                ]
                onnx_ml_models.append(
                    convert_xgboost(clf, initial_types=initial_type))

            self.create_onnx('insurance', onnx_ml_models)
Code Example #10
    def test_xgboost_example_mnist(self):
        """
        Train a simple xgboost model and store associated artefacts.
        """
        X, y = load_digits(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        X_train = X_train.reshape((X_train.shape[0], -1))
        X_test = X_test.reshape((X_test.shape[0], -1))

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        clf = XGBClassifier(objective="multi:softprob", n_jobs=-1)
        clf.fit(X_train, y_train)

        sh = [None, X_train.shape[1]]
        onnx_model = convert_xgboost(clf,
                                     initial_types=[('input',
                                                     FloatTensorType(sh))])

        dump_data_and_model(
            X_test.astype(np.float32),
            clf,
            onnx_model,
            allow_failure=
            "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
            basename="XGBoostExample")
Code Example #11
    def test_xgb_regressor(self):
        this = os.path.dirname(__file__)
        df = pandas.read_csv(os.path.join(this, "data_fail_empty.csv"))
        X, y = df.drop('y', axis=1), df['y']
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        clr = XGBClassifier(max_delta_step=0,
                            tree_method='hist',
                            n_estimators=100,
                            booster='gbtree',
                            objective='binary:logistic',
                            eval_metric='logloss',
                            learning_rate=0.1,
                            gamma=10,
                            max_depth=7,
                            min_child_weight=50,
                            subsample=0.75,
                            colsample_bytree=0.75,
                            random_state=42,
                            verbosity=0)

        clr.fit(X_train,
                y_train,
                eval_set=[(X_test, y_test)],
                early_stopping_rounds=40)

        initial_type = [('float_input', FloatTensorType([None, 797]))]
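        # the second dimension must match the number of feature columns in the CSV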
        onx = convert_xgboost(clr, initial_types=initial_type)
        expected = clr.predict(X_test), clr.predict_proba(X_test)
        sess = InferenceSession(onx.SerializeToString())
        X_test = X_test.values.astype(np.float32)
        got = sess.run(None, {'float_input': X_test})
        assert_almost_equal(expected[1], got[1])
        assert_almost_equal(expected[0], got[0])
Code Example #12
    def test_convert_svmc_linear_raw_multi(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[-5:] = 3

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = SVC
        param.kernel_type = svmutil.LINEAR
        param.eps = 1
        param.probability = 0
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        node = convert(libsvm_model, "LibSvmNuSvmcMultiRaw",
                       [('input', FloatTensorType(shape=['None', 2]))])
        self.assertTrue(node is not None)
        X2 = numpy.vstack([X[:2], X[60:62], X[110:112],
                           X[147:149]])  # 5x0, 5x1
        dump_data_and_model(
            X2.astype(numpy.float32),
            SkAPICl(libsvm_model),
            node,
            basename="LibSvmSvmcRaw-Dec3",
            verbose=False,
            allow_failure=
            "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3')")
Code Example #13
    def test_model_binarizer(self):
        import numpy
        data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                          ["id", "feature"])
        model = Binarizer(inputCol='feature', outputCol='binarized')

        # the input name should match the Binarizer's inputCol above
        model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                     [('feature', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("binarized").toPandas().values.astype(
            numpy.float32)
        data_np = data.select('feature').toPandas().values.astype(
            numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBinarizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['binarized'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #14
    def test_convert_nusvmc(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[y == 2] = 1

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = NuSVC
        param.kernel_type = svmutil.RBF
        param.eps = 1
        param.probability = 1
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        node = convert(libsvm_model, "LibSvmNuSvmc",
                       [('input', FloatTensorType(shape=['None', 'None']))])
        self.assertTrue(node is not None)
        dump_data_and_model(
            X[:5].astype(numpy.float32),
            SkAPIClProba2(libsvm_model),
            node,
            basename="LibSvmNuSvmc-Dec2",
            allow_failure=
            "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3')")
Code Example #15
    def test_convert_svmc_raw(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[y == 2] = 1

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = SVC
        param.kernel_type = svmutil.RBF
        param.eps = 1
        param.probability = 0
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        # known svm runtime dimension error in ONNX Runtime
        node = convert(libsvm_model, "LibSvmSvmcRaw",
                       [('input', FloatTensorType(shape=['None', 'None']))])
        self.assertTrue(node is not None)
        dump_data_and_model(
            X[:5].astype(numpy.float32),
            SkAPICl(libsvm_model),
            node,
            basename="LibSvmSvmcRaw",
            allow_failure=
            "StrictVersion(onnxruntime.__version__) < StrictVersion('0.5.0')")
Code Example #16
    def test_standard_scaler(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.dense([1.0, 0.1, -1.0]),
        ), (
            1,
            Vectors.dense([2.0, 1.1, 1.0]),
        ), (
            2,
            Vectors.dense([3.0, 10.1, 3.0]),
        )], ["id", "features"])
        scaler = StandardScaler(inputCol='features',
                                outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlStandardScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #17
    def test_element_wise_product(self):
        data = self.spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]), )],
                                          ["features"])
        model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                                   inputCol="features",
                                   outputCol="eprod")
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ElementwiseProduct',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().eprod.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        ]
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlElementwiseProduct")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['eprod'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #18
    def test_dct(self):
        data = self.spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]), )],
                                          ["vec"])
        model = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
        # the input name should match the inputCol above
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(
            model, 'Sparkml DCT',
            [('vec', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().resultVec.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().vec.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlDCT")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['resultVec'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #19
 def test_xgboost_classifier_i5450(self):
     iris = load_iris()
     X, y = iris.data, iris.target
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         random_state=10)
     clr = XGBClassifier(objective="multi:softmax",
                         max_depth=1,
                         n_estimators=2)
     clr.fit(X_train,
             y_train,
             eval_set=[(X_test, y_test)],
             early_stopping_rounds=40)
     initial_type = [('float_input', FloatTensorType([None, 4]))]
     onx = convert_xgboost(clr, initial_types=initial_type)
     sess = InferenceSession(onx.SerializeToString())
     input_name = sess.get_inputs()[0].name
     label_name = sess.get_outputs()[1].name
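     # output 0 of the converted classifier is the label, output 1 the probabilities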
     predict_list = [1., 20., 466., 0.]
     predict_array = np.array(predict_list).reshape(
         (1, -1)).astype(np.float32)
     pred_onx = sess.run([label_name], {input_name: predict_array})[0]
     pred_xgboost = clr.predict_proba(predict_array)
     bst = clr.get_booster()
     bst.dump_model('dump.raw.txt')
     dump_data_and_model(
         X_test.astype(np.float32) + 1e-5,
         clr,
         onx,
         allow_failure=
         "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
         basename="XGBClassifierIris")
Code Example #20
 def test_lightgbm_booster_multi_classifier(self):
     X = [[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]]
     X = numpy.array(X, dtype=numpy.float32)
     y = [0, 1, 0, 1, 2, 2]
     data = lightgbm.Dataset(X, label=y)
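     # train a native LightGBM Booster via the low-level train() API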
     model = lightgbm.train(
         {
             'boosting_type': 'gbdt',
             'objective': 'multiclass',
             'n_estimators': 3,
             'min_child_samples': 1,
             'num_class': 3
         }, data)
     model_onnx, prefix = convert_model(
         model, 'tree-based classifier',
         [('input', FloatTensorType([None, 2]))])
     dump_data_and_model(
         X,
         model,
         model_onnx,
         allow_failure=
         "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
         basename=prefix + "BoosterBin" + model.__class__.__name__)
     try:
         from onnxruntime import InferenceSession
     except ImportError:
         # onnxruntime not installed (python 2.7)
         return
     sess = InferenceSession(model_onnx.SerializeToString())
     out = sess.get_outputs()
     names = [o.name for o in out]
     assert names == ['label', 'probabilities']
Code Example #21
 def test_xgb_regressor(self):
     diabetes = load_diabetes()
     x = diabetes.data
     y = diabetes.target
     x_train, x_test, y_train, _ = train_test_split(x,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=42)
     xgb = XGBRegressor()
     xgb.fit(x_train, y_train)
     conv_model = convert_xgboost(xgb,
                                  initial_types=[
                                      ('input',
                                       FloatTensorType(shape=[None, None]))
                                  ])
     self.assertTrue(conv_model is not None)
     dump_data_and_model(
         x_test.astype("float32"),
         xgb,
         conv_model,
         basename="SklearnXGBRegressor-Dec3",
         allow_failure="StrictVersion("
         "onnx.__version__)"
         "< StrictVersion('1.3.0')",
     )
Code Example #22
 def test_model_linear_regression_basic(self):
     data = self.spark.createDataFrame(
         [(1.0, 2.0, Vectors.dense(1.0)),
          (0.0, 2.0, Vectors.sparse(1, [], []))],
         ["label", "weight", "features"])
     lr = LinearRegression(maxIter=5,
                           regParam=0.0,
                           solver="normal",
                           weightCol="weight")
     model = lr.fit(data)
     # the name of the input is 'features'
     C = model.numFeatures
     model_onnx = convert_sparkml(
         model, 'sparkml LinearRegressorBasic',
         [('features', FloatTensorType([None, C]))])
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlLinearRegressor_Basic")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Code Example #23
    def _test_scaler_converter(self, model):
        warnings.filterwarnings("ignore")
        X = np.array([[0.0, 0.0, 3.0], [1.0, -1.0, 0.0], [0.0, 2.0, 1.0],
                      [1.0, 0.0, -2.0]],
                     dtype=np.float32)
        model.fit(X)

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("float_input", FloatTensorType([None,
                                                            X.shape[1]]))])

        # Create ONNX model by calling converter
        onnx_model = convert(onnx_ml_model, "onnx", X)
        # Get the predictions for the ONNX-ML model
        session = ort.InferenceSession(onnx_ml_model.SerializeToString())
        output_names = [
            session.get_outputs()[i].name
            for i in range(len(session.get_outputs()))
        ]
        inputs = {session.get_inputs()[0].name: X}
        onnx_ml_pred = session.run(output_names, inputs)[0]

        # Get the predictions for the ONNX model
        onnx_pred = onnx_model.transform(X)

        return onnx_ml_pred, onnx_pred
Code Example #24
 def test_gbt_classifier(self):
     raw_data = self.spark.createDataFrame(
         [(1.0, Vectors.dense(1.0)),
          (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
     string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
     si_model = string_indexer.fit(raw_data)
     data = si_model.transform(raw_data)
     gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml GBT Classifier',
         [('features', FloatTensorType([None, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(
             lambda x: pandas.Series(x.toArray())).values.astype(
                 numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlGBTClassifier")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                            data_np, onnx_model_path)
     compare_results(expected, output, decimal=5)
Code Example #25
 def test_decision_tree_regressor(self):
     features = [[0, 1], [1, 1], [2, 0]]
     features = numpy.array(features, dtype=numpy.float32)
     labels = [100, -10, 50]
     dd = [(labels[i], Vectors.dense(features[i]))
           for i in range(len(labels))]
     data = self.spark.createDataFrame(
         self.spark.sparkContext.parallelize(dd),
         schema=["label", "features"])
     dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
     model = dt.fit(data)
     feature_count = data.select('features').first()[0].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml Decision Tree Regressor',
         [('features', FloatTensorType([None, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     predicted = model.transform(data)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlDecisionTreeRegressor")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Code Example #26
    def test_convert_svmc(self):
        iris = load_iris()

        X = iris.data[:, :2]
        y = iris.target
        y[y == 2] = 1

        prob = svmutil.svm_problem(y, X.tolist())

        param = svmutil.svm_parameter()
        param.svm_type = SVC
        param.kernel_type = svmutil.RBF
        param.eps = 1
        param.probability = 1
        if noprint:
            param.print_func = noprint

        libsvm_model = svmutil.svm_train(prob, param)

        node = convert(libsvm_model, "LibSvmSvmc",
                       [('input', FloatTensorType())])
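        # note: FloatTensorType() without an explicit shape leaves the input dimensions unspecified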
        self.assertTrue(node is not None)
        dump_data_and_model(X[:5].astype(numpy.float32),
                            SkAPIClProba2(libsvm_model),
                            node,
                            basename="LibSvmSvmc-Dec2")
Code Example #27
    def do_training(self, X, Y, create_onnx=True):
        print('Pre-processing ...')
        X = self.transform(X, do_fit=True)

        print('Training ...')
        C = 0.12
        self.clfs = []
        clf = LogisticRegression(C=C,
                                 solver='lbfgs',
                                 max_iter=1000,
                                 verbose=1,
                                 n_jobs=32)
        clf.fit(X, Y)

        self.clfs.append(clf)

        if create_onnx:
            print('Converting models into ONNX ...')
            onnx_ml_models = []
            for i, clf in enumerate(self.clfs):
                initial_type = [
                    ('dense_input',
                     FloatTensorType([None,
                                      len(self.pipeline.output_columns)]))
                ]
                onnx_ml_models.append(
                    convert_sklearn(clf,
                                    initial_types=initial_type,
                                    options={type(clf): {
                                                 'zipmap': False
                                             }}))

            self.create_onnx('categorical-encoding', onnx_ml_models)
Code Example #28
 def test_gbt_regressor(self):
     data = self.spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                        (0.0, Vectors.sparse(1, [], []))],
                                       ["label", "features"])
     gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml GBTRegressor',
         [('features', FloatTensorType([1, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlGBTRegressor")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Code Example #29
    def test_model_generalized_linear_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_linear_regression_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)

        lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        model = lr.fit(data)
        # the name of the input is 'features'
        C = model.numFeatures
        model_onnx = convert_sparkml(
            model, 'sparkml GeneralizedLinearRegression',
            [('features', FloatTensorType([None, C]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlGeneralizedLinearRegression")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['prediction'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code Example #30
    def test_bucketizer(self):
        values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
        data = self.spark.createDataFrame(values, ["features"])
        model = Bucketizer(splits=[-float("inf"), 0.5, 1.4,
                                   float("inf")],
                           inputCol="features",
                           outputCol="buckets")

        feature_count = len(data.select('features').first())
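        # the selected Row has a single 'features' column, so feature_count is 1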
        model_onnx = convert_sparkml(
            model, 'Sparkml Bucketizer',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.setHandleInvalid("error").transform(data)
        expected = predicted.select("buckets").toPandas().values.astype(
            numpy.float32)
        data_np = [data.toPandas().values.astype(numpy.float32)]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBucketizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['buckets'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
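All of the examples above follow the same basic pattern: declare each model input as a (name, FloatTensorType(shape)) pair, convert the fitted model, then feed float32 arrays under those same names to an ONNX runtime. Below is a minimal, self-contained sketch of that pattern; it assumes skl2onnx and onnxruntime are installed, and the LogisticRegression model and the 'input' name are illustrative only, not taken from the examples above.

import numpy as np
from sklearn.linear_model import LogisticRegression
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession

# fit a small classifier on float32 data
X = np.array([[0., 1.], [1., 0.], [2., 2.], [3., 1.]], dtype=np.float32)
y = np.array([0, 0, 1, 1])
clf = LogisticRegression().fit(X, y)

# one named float input; None keeps the batch dimension symbolic
initial_types = [('input', FloatTensorType([None, X.shape[1]]))]
onx = convert_sklearn(clf, initial_types=initial_types)

# the feed name must match the name declared in initial_types
sess = InferenceSession(onx.SerializeToString())
print(sess.run(None, {'input': X})[0])  # predicted labels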