Beispiel #1
0
 def _test_single_output_core(self, model):
     """Fit ``model`` on a tiny regression set and check that it converts.

     The model is trained on three two-feature integer rows with numeric
     targets, then converted with a single ``Int64TensorType([1, 2])``
     input named ``input``.

     :param model: unfitted estimator exposing ``fit(X, y)``
     """
     features = [[0, 1], [1, 1], [2, 0]]
     targets = [100, -10, 50]
     model.fit(features, targets)
     model_onnx = convert_sklearn(
         model, 'tree-based regressor',
         [('input', Int64TensorType([1, 2]))])
     # assertIsNotNone gives a descriptive failure message, unlike
     # assertTrue(x is not None), which only reports "False is not true".
     self.assertIsNotNone(model_onnx)
Beispiel #2
0
    def test_index_to_string(self):
        """Convert an ``IndexToString`` stage and compare ONNX output to Spark.

        A ``StringIndexer`` produces the ``categoryIndex`` column, which an
        ``IndexToString`` with explicit labels maps back to strings. The
        converted ONNX model is run on the same indices and its output is
        compared with Spark's own prediction.
        """
        original_data = self.spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])
        string_indexer = StringIndexer(inputCol="category",
                                       outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        data = string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory",
                              labels=['A', 'B', 'C'])
        # The ONNX input name must match IndexToString.inputCol.
        model_onnx = convert_sparkml(
            model, 'Sparkml IndexToString',
            [('categoryIndex', Int64TensorType([1, 1]))])
        self.assertIsNotNone(model_onnx)

        # Reference result computed by Spark itself.
        predicted = model.transform(data)
        expected = predicted.select("originalCategory").toPandas().values
        data_np = data.select('categoryIndex').toPandas().values.astype(
            numpy.int64)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlIndexToString")
        # NOTE(review): index 3 is presumably the saved ONNX file path;
        # confirm against save_data_models.
        onnx_model_path = paths[3]
        # Only the output values are compared; the shapes are discarded.
        output, _ = run_onnx_model(['originalCategory'], data_np,
                                   onnx_model_path)
        compare_results(expected, output, decimal=5)
Beispiel #3
0
 def test_model_one_hot_encoder(self):
     """Check that a fitted default ``OneHotEncoder`` converts to ONNX.

     The encoder is fitted on four three-column integer rows and converted
     with a single ``Int64TensorType([1, 3])`` input named ``input``.
     """
     # categorical_features will be removed in 0.22 (this test will fail by then).
     model = OneHotEncoder()
     model.fit([[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]])
     model_onnx = convert_sklearn(
         model, 'scikit-learn one-hot encoder',
         [('input', Int64TensorType([1, 3]))])
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
Beispiel #4
0
 def _test_binary_classification_core(self, model):
     """Fit ``model`` on a two-class toy set and check that it converts.

     The model is trained on three two-feature integer rows labelled
     ``'A'`` / ``'B'``, then converted with a single
     ``Int64TensorType([1, 2])`` input named ``input``.

     :param model: unfitted classifier exposing ``fit(X, y)``
     """
     features = [[0, 1], [1, 1], [2, 0]]
     labels = ['A', 'B', 'A']
     model.fit(features, labels)
     model_onnx = convert_sklearn(
         model, 'tree-based binary classifier',
         [('input', Int64TensorType([1, 2]))])
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
Beispiel #5
0
def calculate_one_vs_rest_output_shapes(operator):
    """Derive the output type of a one-vs-rest operator from its input.

    Input/output counts and input types are validated by the shared
    ``check_input_and_output_*`` helpers. The first input must be a
    rank-2 tensor; its leading dimension ``N`` becomes the only dimension
    of the first output, which is typed as ``Int64TensorType``.

    :param operator: operator whose ``outputs[0].type`` is assigned
    :raises RuntimeError: if the first input is not rank 2
    """
    check_input_and_output_numbers(
        operator, input_count_range=1, output_count_range=1)
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])

    input_shape = operator.inputs[0].type.shape
    if len(input_shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    batch_size = input_shape[0]
    operator.outputs[0].type = Int64TensorType(shape=[batch_size])
 def test_standard_scaler(self):
     """Convert a ``StandardScaler`` fitted on integer data and dump it.

     After conversion with an ``Int64TensorType([1, 3])`` input, the data
     (as an int64 array), the model and its ONNX counterpart are handed to
     ``dump_data_and_model``.
     """
     data = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
     model = StandardScaler()
     model.fit(data)
     model_onnx = convert_sklearn(
         model, 'scaler', [('input', Int64TensorType([1, 3]))])
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         numpy.array(data, dtype=numpy.int64),
         model, model_onnx, basename="SklearnStandardScalerInt64")
Beispiel #7
0
 def test_one_hot_encoder_mixed_float_int(self):
     """Check conversion of a ``OneHotEncoder`` with float and int inputs.

     The encoder is fitted on rows with two float columns and one integer
     column; the conversion declares them as two separate inputs,
     ``input1`` (float, width 2) and ``input2`` (int64, width 1).
     """
     # categorical_features will be removed in 0.22 (this test will fail by then).
     model = OneHotEncoder()
     model.fit([[0.4, 0.2, 3], [1.4, 1.2, 0], [0.2, 2.2, 1]])
     initial_types = [('input1', FloatTensorType([1, 2])),
                      ('input2', Int64TensorType([1, 1]))]
     model_onnx = convert_sklearn(
         model, 'one-hot encoder mixed-type inputs', initial_types)
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
Beispiel #8
0
 def test_one_hot_encoder_mixed_string_int(self):
     """Check conversion of a ``OneHotEncoder`` with string and int inputs.

     The encoder (``categories='auto'``) is fitted on rows with two string
     columns and one integer column, converted with two separate inputs,
     and dumped with ``allow_failure=True``.
     """
     # categorical_features will be removed in 0.22 (this test will fail by then).
     data = [["0.4", "0.2", 3], ["1.4", "1.2", 0], ["0.2", "2.2", 1]]
     model = OneHotEncoder(categories='auto')
     model.fit(data)
     inputs = [('input1', StringTensorType([1, 2])),
               ('input2', Int64TensorType([1, 1]))]
     model_onnx = convert_sklearn(
         model, 'one-hot encoder mixed-type inputs', inputs)
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         data, model, model_onnx,
         basename="SklearnOneHotEncoderStringInt64",
         allow_failure=True)
Beispiel #9
0
 def test_model_imputer(self):
     """Check that a mean ``Imputer`` converts with an int64 input."""
     model = Imputer(missing_values='NaN', strategy='mean', axis=0)
     data = [[1, 2], [np.nan, 3], [7, 6]]
     model.fit(data)
     # The conversion works but internally scikit-learn converts
     # everything into float before looking into missing values.
     # There is no nan integer. The runtime is not tested
     # in this case.
     model_onnx = convert_sklearn(
         model, 'scikit-learn imputer',
         [('input', Int64TensorType([1, 2]))])
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
Beispiel #10
0
    def test_imputer_int_inputs(self):
        """Check the graph produced for a mean ``Imputer`` with int64 input.

        The converted graph is expected to hold two nodes and expose a
        single output whose last dimension is 2.
        """
        imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        training_rows = [[1, 2], [np.nan, 3], [7, 6]]
        imputer.fit(training_rows)
        model_onnx = convert_sklearn(
            imputer, 'scikit-learn imputer',
            [('input', Int64TensorType([1, 2]))])
        graph = model_onnx.graph
        self.assertEqual(len(graph.node), 2)

        # The graph must expose exactly one output, two columns wide.
        graph_outputs = graph.output
        self.assertEqual(len(graph_outputs), 1)
        last_dim = graph_outputs[0].type.tensor_type.shape.dim[-1]
        self.assertEqual(last_dim.dim_value, 2)
Beispiel #11
0
    def test_combine_inputs_floats_ints(self):
        """Convert a two-scaler pipeline fed by one int and one float input.

        The same fitted ``StandardScaler`` is used for both pipeline steps.
        The last node of the converted graph must have exactly one output.
        """
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        scaler = StandardScaler()
        scaler.fit([[0, 0.], [0, 0.], [1, 1.], [1, 1.]])
        model = Pipeline([('scaler1', scaler), ('scaler2', scaler)])

        model_onnx = convert_sklearn(
            model, 'pipeline',
            [('input1', Int64TensorType([1, 1])),
             ('input2', FloatTensorType([1, 1]))])
        # Check for None before dereferencing the model; the original order
        # made the None check unreachable on failure.
        self.assertIsNotNone(model_onnx)
        self.assertEqual(len(model_onnx.graph.node[-1].output), 1)
def calculate_decision_tree_classifier_output_shapes(operator):
    """Assign output types for a decision-tree classifier operator.

    The operator must have one rank-2 input of float or int64 type and one
    or two outputs. With ``N`` the leading input dimension:

    * ``outputs[0]`` becomes ``Int64TensorType([N])``;
    * ``outputs[1]``, when present, becomes
      ``FloatTensorType([N, numClasses])`` where ``numClasses`` is read
      from ``operator.raw_operator``.

    :param operator: operator whose output types are assigned in place
    :raises RuntimeError: if the first input is not rank 2
    """
    check_input_and_output_numbers(operator,
                                   input_count_range=1,
                                   output_count_range=[1, 2])
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])
    if len(operator.inputs[0].type.shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    N = operator.inputs[0].type.shape[0]

    operator.outputs[0].type = Int64TensorType(shape=[N])
    # A single output is accepted by the count check above, so the second
    # output must only be typed when it actually exists.
    if len(operator.outputs) > 1:
        class_count = operator.raw_operator.numClasses
        operator.outputs[1].type = FloatTensorType([N, class_count])
Beispiel #13
0
 def test_model_one_hot_encoder(self):
     """Convert a default ``OneHotEncoder`` fitted on int64 data and dump it."""
     # categorical_features will be removed in 0.22 (this test will fail by then).
     # FutureWarning: The handling of integer data will change in version 0.22.
     # Currently, the categories are determined based on the range [0, max(values)],
     # while in the future they will be determined based on the unique values.
     # If you want the future behaviour and silence this warning,
     # you can specify "categories='auto'".
     model = OneHotEncoder()
     data = numpy.array(
         [[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]], dtype=numpy.int64)
     model.fit(data)
     model_onnx = convert_sklearn(
         model, 'scikit-learn one-hot encoder',
         [('input', Int64TensorType([1, 3]))])
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         data, model, model_onnx,
         basename="SklearnOneHotEncoderInt64-SkipDim1")
Beispiel #14
0
def calculate_gbt_classifier_output_shapes(operator):
    """Assign output types for a gradient-boosted-tree classifier operator.

    The operator must have one rank-2 input of float or int64 type and one
    or two outputs. With ``N`` the leading input dimension:

    * ``outputs[0]`` becomes ``Int64TensorType([N])``;
    * ``outputs[1]``, when present and the wrapped model is a
      ``GBTClassificationModel``, becomes ``FloatTensorType([N, 2])``.

    :param operator: operator whose output types are assigned in place
    :raises RuntimeError: if the first input is not rank 2
    """
    check_input_and_output_numbers(operator,
                                   input_count_range=1,
                                   output_count_range=[1, 2])
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])
    if len(operator.inputs[0].type.shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    N = operator.inputs[0].type.shape[0]
    operator.outputs[0].type = Int64TensorType(shape=[N])
    # A single output is accepted by the count check above, so the second
    # output must only be typed when it actually exists.
    has_second_output = len(operator.outputs) > 1
    if has_second_output and isinstance(operator.raw_operator,
                                        GBTClassificationModel):
        # The code hard-codes two classes for GBTClassificationModel.
        class_count = 2
        operator.outputs[1].type = FloatTensorType([N, class_count])
Beispiel #15
0
    def test_index_to_string_throws(self):
        """Converting ``IndexToString`` without labels must raise.

        The stage is built without the ``labels`` argument, and
        ``convert_sparkml`` is expected to raise ``SparkMlConversionError``.
        """
        original_data = self.spark.createDataFrame(
            [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
            ["id", "category"])
        string_indexer = StringIndexer(inputCol="category",
                                       outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        # Kept from the original setup; the transformed frame is not used.
        string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
        # The ONNX input name must match IndexToString.inputCol.
        # The result is not bound: the call is expected to raise.
        with pytest.raises(SparkMlConversionError):
            convert_sparkml(
                model, 'Sparkml IndexToString',
                [('categoryIndex', Int64TensorType([1, 1]))])
Beispiel #16
0
def calculate_logistic_regression_output_shapes(operator):
    """Assign output types for a logistic-regression operator.

    The operator must have one rank-2 ``[N, C]`` input of float or int64
    type. With ``N`` the leading input dimension and ``class_count`` read
    from ``operator.raw_operator.numClasses``:

    * ``outputs[0]`` becomes ``Int64TensorType([N])``;
    * ``outputs[1]``, when present, becomes
      ``FloatTensorType([N, class_count])``.

    :param operator: operator whose output types are assigned in place
    :raises RuntimeError: if the first input is not rank 2
    """
    class_count = operator.raw_operator.numClasses
    # NOTE(review): the upper bound is class_count rather than 2, although
    # only the first two outputs are typed below; confirm this is intended.
    check_input_and_output_numbers(operator,
                                   input_count_range=1,
                                   output_count_range=[1, class_count])
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])
    if len(operator.inputs[0].type.shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    N = operator.inputs[0].type.shape[0]

    operator.outputs[0].type = Int64TensorType(shape=[N])
    # A single output is accepted by the count check above, so the second
    # output must only be typed when it actually exists.
    if len(operator.outputs) > 1:
        operator.outputs[1].type = FloatTensorType([N, class_count])
    def test_combine_inputs_floats_ints(self):
        """Convert and dump a two-scaler pipeline with int and float inputs.

        The same fitted ``StandardScaler`` is used for both pipeline steps.
        After conversion, the two columns of the training data are passed
        separately (``input1`` as int64, ``input2`` as float32) to
        ``dump_data_and_model``.
        """
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        rows = [[0, 0.], [0, 0.], [1, 1.], [1, 1.]]
        scaler = StandardScaler()
        scaler.fit(rows)
        model = Pipeline([('scaler1', scaler), ('scaler2', scaler)])

        model_onnx = convert_sklearn(
            model, 'pipeline',
            [('input1', Int64TensorType([1, 1])),
             ('input2', FloatTensorType([1, 1]))])
        # Check for None before dereferencing the model; the original order
        # made the None check unreachable on failure.
        self.assertIsNotNone(model_onnx)
        self.assertEqual(len(model_onnx.graph.node[-1].output), 1)

        # One named feed per declared ONNX input, split column-wise.
        columns = numpy.array(rows)
        feeds = {
            'input1': columns[:, 0].astype(numpy.int64),
            'input2': columns[:, 1].astype(numpy.float32),
        }
        dump_data_and_model(feeds,
                            PipeConcatenateInput(model),
                            model_onnx,
                            basename="SklearnPipelineScalerMixed-OneOff")
 def test_model_normalizer(self):
     """Check that an L2 ``Normalizer`` converts to a single-node graph."""
     model = Normalizer(norm='l2')
     model_onnx = convert_sklearn(
         model, 'scikit-learn normalizer',
         [('input', Int64TensorType([1, 1]))])
     self.assertIsNotNone(model_onnx)
     # assertEqual reports both values on mismatch, unlike assertTrue.
     self.assertEqual(len(model_onnx.graph.node), 1)
####################################
# Conversion to ONNX format
# +++++++++++++++++++++++++
#
# We use the
# `onnxmltools <https://github.com/onnx/onnxmltools>`_
# module to convert the model into ONNX format.

from onnxmltools import convert_sklearn
from onnxmltools.utils import save_model
from onnxmltools.convert.common.data_types import FloatTensorType, Int64TensorType, DictionaryType, SequenceType

# The single model input, named 'float_input', is declared as a
# DictionaryType built from Int64TensorType([1]) and FloatTensorType([]).
initial_type = [('float_input',
                 DictionaryType(Int64TensorType([1]), FloatTensorType([])))]
# NOTE(review): `pipe` is not defined in this section; it is presumably the
# pipeline built earlier in the script.
onx = convert_sklearn(pipe, initial_types=initial_type)
save_model(onx, "pipeline_vectorize.onnx")

##################################
# We load the saved model with ONNX Runtime and look at
# its first input and first output.
import onnxruntime as rt
sess = rt.InferenceSession("pipeline_vectorize.onnx")

import numpy
# Only the first declared input and the first declared output are inspected.
inp, out = sess.get_inputs()[0], sess.get_outputs()[0]
print("input name='{}' and shape={} and type={}".format(
    inp.name, inp.shape, inp.type))
print("output name='{}' and shape={} and type={}".format(
    out.name, out.shape, out.type))
 def test_model_imputer(self):
     """Check that a mean ``Imputer`` converts with an int64 input."""
     model = Imputer(missing_values='NaN', strategy='mean', axis=0)
     model.fit([[1, 2], [np.nan, 3], [7, 6]])
     model_onnx = convert_sklearn(
         model, 'scikit-learn imputer',
         [('input', Int64TensorType([1, 2]))])
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)
 def test_standard_scaler(self):
     """Check that a fitted ``StandardScaler`` converts with an int64 input."""
     model = StandardScaler()
     model.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
     model_onnx = convert_sklearn(
         model, 'scaler',
         [('input', Int64TensorType([1, 3]))])
     # assertIsNotNone gives a descriptive message when conversion fails.
     self.assertIsNotNone(model_onnx)