def _test_single_output_core(self, model):
    """Fit ``model`` on a tiny integer regression set and check it converts.

    The model is trained on three two-feature rows, converted with
    ``convert_sklearn`` using one ``Int64TensorType([1, 2])`` input named
    ``'input'``, and the conversion result is asserted to be non-None.
    """
    features = [[0, 1], [1, 1], [2, 0]]
    targets = [100, -10, 50]
    model.fit(features, targets)

    initial_types = [('input', Int64TensorType([1, 2]))]
    model_onnx = convert_sklearn(model, 'tree-based regressor', initial_types)
    self.assertTrue(model_onnx is not None)
def test_index_to_string(self):
    """Convert an ``IndexToString`` model and compare its ONNX output with Spark's.

    A ``StringIndexer`` produces the ``categoryIndex`` column that feeds
    ``IndexToString``; the converted model is saved, run through
    ``run_onnx_model`` and checked against the Spark prediction.
    """
    rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
    original_data = self.spark.createDataFrame(rows, ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    indexer_model = indexer.fit(original_data)
    data = indexer_model.transform(original_data)

    model = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory",
                          labels=['A', 'B', 'C'])

    # The ONNX input name must match IndexToString.inputCol.
    initial_types = [('categoryIndex', Int64TensorType([1, 1]))]
    model_onnx = convert_sparkml(model, 'Sparkml IndexToString', initial_types)
    self.assertTrue(model_onnx is not None)

    # Run the Spark model to obtain the reference output.
    predicted = model.transform(data)
    expected = predicted.select("originalCategory").toPandas().values
    index_column = data.select('categoryIndex').toPandas().values
    data_np = index_column.astype(numpy.int64)

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlIndexToString")
    onnx_model_path = paths[3]
    # Only the first returned value (the outputs) is compared.
    output, _ = run_onnx_model(['originalCategory'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_one_hot_encoder(self):
    """Check that a fitted ``OneHotEncoder`` converts with a ``[1, 3]`` int64 input."""
    # categorical_features will be removed in 0.22 (this test will fail by then).
    training_rows = [[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]]
    model = OneHotEncoder()
    model.fit(training_rows)

    initial_types = [('input', Int64TensorType([1, 3]))]
    model_onnx = convert_sklearn(model, 'scikit-learn one-hot encoder',
                                 initial_types)
    self.assertTrue(model_onnx is not None)
def _test_binary_classification_core(self, model):
    """Fit ``model`` on a tiny two-label set and check it converts.

    The model is trained on three two-feature rows labelled ``'A'``/``'B'``,
    converted with ``convert_sklearn`` using one ``Int64TensorType([1, 2])``
    input named ``'input'``, and the result is asserted to be non-None.
    """
    features = [[0, 1], [1, 1], [2, 0]]
    labels = ['A', 'B', 'A']
    model.fit(features, labels)

    initial_types = [('input', Int64TensorType([1, 2]))]
    model_onnx = convert_sklearn(model, 'tree-based binary classifier',
                                 initial_types)
    self.assertTrue(model_onnx is not None)
def calculate_one_vs_rest_output_shapes(operator):
    """Set the single output of a one-vs-rest operator to an int64 ``[N]`` tensor.

    ``N`` is the first dimension of the operator's first input. The operator
    must have exactly one input and one output, and the input must be a
    ``FloatTensorType`` or ``Int64TensorType``.

    Raises:
        RuntimeError: if the first input's shape is not two-dimensional.
    """
    check_input_and_output_numbers(operator,
                                   input_count_range=1,
                                   output_count_range=1)
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])

    input_shape = operator.inputs[0].type.shape
    if len(input_shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    batch_dim = input_shape[0]
    operator.outputs[0].type = Int64TensorType(shape=[batch_dim])
def test_standard_scaler(self):
    """Convert a fitted ``StandardScaler`` with int64 input and dump data and model."""
    data = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
    model = StandardScaler()
    model.fit(data)

    initial_types = [('input', Int64TensorType([1, 3]))]
    model_onnx = convert_sklearn(model, 'scaler', initial_types)
    self.assertTrue(model_onnx is not None)

    int_data = numpy.array(data, dtype=numpy.int64)
    dump_data_and_model(int_data, model, model_onnx,
                        basename="SklearnStandardScalerInt64")
def test_one_hot_encoder_mixed_float_int(self):
    """Check a ``OneHotEncoder`` converts when declared with float and int64 inputs."""
    # categorical_features will be removed in 0.22 (this test will fail by then).
    training_rows = [[0.4, 0.2, 3], [1.4, 1.2, 0], [0.2, 2.2, 1]]
    model = OneHotEncoder()
    model.fit(training_rows)

    # Two declared inputs: a [1, 2] float tensor and a [1, 1] int64 tensor.
    initial_types = [
        ('input1', FloatTensorType([1, 2])),
        ('input2', Int64TensorType([1, 1])),
    ]
    model_onnx = convert_sklearn(model, 'one-hot encoder mixed-type inputs',
                                 initial_types)
    self.assertTrue(model_onnx is not None)
def test_one_hot_encoder_mixed_string_int(self):
    """Check a ``OneHotEncoder`` converts when declared with string and int64 inputs.

    The data and model are dumped with ``allow_failure=True``.
    """
    # categorical_features will be removed in 0.22 (this test will fail by then).
    data = [
        ["0.4", "0.2", 3],
        ["1.4", "1.2", 0],
        ["0.2", "2.2", 1],
    ]
    model = OneHotEncoder(categories='auto')
    model.fit(data)

    # Two declared inputs: a [1, 2] string tensor and a [1, 1] int64 tensor.
    inputs = [
        ('input1', StringTensorType([1, 2])),
        ('input2', Int64TensorType([1, 1])),
    ]
    model_onnx = convert_sklearn(model, 'one-hot encoder mixed-type inputs',
                                 inputs)
    self.assertTrue(model_onnx is not None)

    dump_data_and_model(data, model, model_onnx,
                        basename="SklearnOneHotEncoderStringInt64",
                        allow_failure=True)
def test_model_imputer(self):
    """Check that a mean ``Imputer`` converts with a ``[1, 2]`` int64 input."""
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model = Imputer(missing_values='NaN', strategy='mean', axis=0)
    model.fit(data)

    # The conversion works but internally scikit-learn converts
    # everything into float before looking into missing values.
    # There is no nan integer. The runtime is not tested
    # in this case.
    initial_types = [('input', Int64TensorType([1, 2]))]
    model_onnx = convert_sklearn(model, 'scikit-learn imputer', initial_types)
    self.assertTrue(model_onnx is not None)
def test_imputer_int_inputs(self):
    """Check the graph produced for a mean ``Imputer`` declared with int64 input.

    The converted graph is expected to contain two nodes and a single
    output whose last dimension is 2.
    """
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model = Imputer(missing_values='NaN', strategy='mean', axis=0)
    model.fit(data)

    initial_types = [('input', Int64TensorType([1, 2]))]
    model_onnx = convert_sklearn(model, 'scikit-learn imputer', initial_types)

    graph = model_onnx.graph
    self.assertEqual(len(graph.node), 2)

    # Last node should be Imputer
    outputs = graph.output
    self.assertEqual(len(outputs), 1)
    last_dim = outputs[0].type.tensor_type.shape.dim[-1]
    self.assertEqual(last_dim.dim_value, 2)
def test_combine_inputs_floats_ints(self):
    """Convert a two-scaler pipeline declared with one int64 and one float input.

    The same fitted ``StandardScaler`` is used for both pipeline steps. The
    test checks that conversion returns a model and that the last graph
    node has exactly one output.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    scaler = StandardScaler()
    scaler.fit([[0, 0.], [0, 0.], [1, 1.], [1, 1.]])
    model = Pipeline([('scaler1', scaler), ('scaler2', scaler)])

    initial_types = [
        ('input1', Int64TensorType([1, 1])),
        ('input2', FloatTensorType([1, 1])),
    ]
    model_onnx = convert_sklearn(model, 'pipeline', initial_types)

    # Check for None before touching model_onnx.graph; in the reverse order
    # the None check could never fail (an AttributeError would fire first).
    self.assertIsNotNone(model_onnx)
    self.assertEqual(len(model_onnx.graph.node[-1].output), 1)
def calculate_decision_tree_classifier_output_shapes(operator):
    """Set the output types of a decision-tree classifier operator.

    The first output becomes an int64 tensor of shape ``[N]`` and, when a
    second output is present, it becomes a float tensor of shape
    ``[N, class_count]``. ``N`` is the first dimension of the first input and
    ``class_count`` is read from ``operator.raw_operator.numClasses``.

    Raises:
        RuntimeError: if the first input's shape is not two-dimensional.
    """
    check_input_and_output_numbers(operator,
                                   input_count_range=1,
                                   output_count_range=[1, 2])
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])

    if len(operator.inputs[0].type.shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    N = operator.inputs[0].type.shape[0]
    operator.outputs[0].type = Int64TensorType(shape=[N])

    # The check above accepts a single output, so only type the second
    # output when it actually exists instead of raising IndexError.
    if len(operator.outputs) > 1:
        class_count = operator.raw_operator.numClasses
        operator.outputs[1].type = FloatTensorType([N, class_count])
def test_model_one_hot_encoder(self):
    """Convert a fitted ``OneHotEncoder`` with int64 input and dump data and model."""
    # categorical_features will be removed in 0.22 (this test will fail by then).
    # FutureWarning: The handling of integer data will change in version 0.22.
    # Currently, the categories are determined based on the range [0, max(values)],
    # while in the future they will be determined based on the unique values.
    # If you want the future behaviour and silence this warning,
    # you can specify "categories='auto'".
    rows = [[1, 2, 3], [4, 3, 0], [0, 1, 4], [0, 5, 6]]
    data = numpy.array(rows, dtype=numpy.int64)
    model = OneHotEncoder()
    model.fit(data)

    initial_types = [('input', Int64TensorType([1, 3]))]
    model_onnx = convert_sklearn(model, 'scikit-learn one-hot encoder',
                                 initial_types)
    self.assertTrue(model_onnx is not None)

    dump_data_and_model(data, model, model_onnx,
                        basename="SklearnOneHotEncoderInt64-SkipDim1")
def calculate_gbt_classifier_output_shapes(operator):
    """Set the output types of a gradient-boosted-tree classifier operator.

    The first output becomes an int64 tensor of shape ``[N]``, where ``N`` is
    the first dimension of the first input. When a second output is present
    and the wrapped model is a ``GBTClassificationModel``, it becomes a float
    tensor of shape ``[N, 2]``.

    Raises:
        RuntimeError: if the first input's shape is not two-dimensional, or
            if a second output is requested for a model that is not a
            ``GBTClassificationModel``.
    """
    check_input_and_output_numbers(operator,
                                   input_count_range=1,
                                   output_count_range=[1, 2])
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])

    if len(operator.inputs[0].type.shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    N = operator.inputs[0].type.shape[0]
    operator.outputs[0].type = Int64TensorType(shape=[N])

    # The check above accepts a single output, so only type the second
    # output when it actually exists instead of raising IndexError.
    if len(operator.outputs) > 1:
        if not isinstance(operator.raw_operator, GBTClassificationModel):
            # Previously this path failed with UnboundLocalError because
            # class_count was only bound for GBTClassificationModel.
            raise RuntimeError(
                'Cannot infer the class count: expected a '
                'GBTClassificationModel but got %s'
                % type(operator.raw_operator).__name__)
        # Class count is fixed at 2 for GBTClassificationModel.
        class_count = 2
        operator.outputs[1].type = FloatTensorType([N, class_count])
def test_index_to_string_throws(self):
    """Converting an ``IndexToString`` built without ``labels`` must raise.

    The conversion call is expected to raise ``SparkMlConversionError``.
    """
    rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
    original_data = self.spark.createDataFrame(rows, ["id", "category"])

    string_indexer = StringIndexer(inputCol="category",
                                   outputCol="categoryIndex")
    string_indexer_model = string_indexer.fit(original_data)
    data = string_indexer_model.transform(original_data)

    model = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory")

    # The ONNX input name must match IndexToString.inputCol.
    initial_types = [('categoryIndex', Int64TensorType([1, 1]))]
    model_onnx = None
    with pytest.raises(SparkMlConversionError):
        model_onnx = convert_sparkml(model, 'Sparkml IndexToString',
                                     initial_types)
def calculate_logistic_regression_output_shapes(operator):
    '''
    Set the output types of a logistic-regression operator.

    The first output becomes an int64 tensor of shape [N]. If a second
    output appears in this operator's output list, it becomes a float tensor
    of shape [N, class_count]. N is the first dimension of the first input
    and class_count is read from operator.raw_operator.numClasses.

    Raises RuntimeError if the first input's shape is not two-dimensional.
    '''
    class_count = operator.raw_operator.numClasses
    # NOTE(review): the upper bound on the output count is class_count, while
    # only two outputs are ever typed below -- confirm whether [1, 2] was
    # intended. Left unchanged to stay compatible with existing callers.
    check_input_and_output_numbers(operator,
                                   input_count_range=1,
                                   output_count_range=[1, class_count])
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])

    if len(operator.inputs[0].type.shape) != 2:
        raise RuntimeError('Input must be a [N, C]-tensor')

    N = operator.inputs[0].type.shape[0]
    operator.outputs[0].type = Int64TensorType(shape=[N])

    # The check above accepts a single output, so only type the second
    # output when it actually exists instead of raising IndexError.
    if len(operator.outputs) > 1:
        operator.outputs[1].type = FloatTensorType([N, class_count])
def test_combine_inputs_floats_ints(self):
    """Convert a two-scaler pipeline with mixed inputs and dump data and model.

    The same fitted ``StandardScaler`` is used for both pipeline steps. The
    test checks that conversion returns a model and that the last graph
    node has exactly one output, then dumps per-input data keyed by the
    declared input names.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    data = [[0, 0.], [0, 0.], [1, 1.], [1, 1.]]
    scaler = StandardScaler()
    scaler.fit(data)
    model = Pipeline([('scaler1', scaler), ('scaler2', scaler)])

    initial_types = [
        ('input1', Int64TensorType([1, 1])),
        ('input2', FloatTensorType([1, 1])),
    ]
    model_onnx = convert_sklearn(model, 'pipeline', initial_types)

    # Check for None before touching model_onnx.graph; in the reverse order
    # the None check could never fail (an AttributeError would fire first).
    self.assertIsNotNone(model_onnx)
    self.assertEqual(len(model_onnx.graph.node[-1].output), 1)

    # Split the columns into one array per declared input, cast to the
    # dtype matching that input's tensor type.
    data_array = numpy.array(data)
    named_inputs = {
        'input1': data_array[:, 0].astype(numpy.int64),
        'input2': data_array[:, 1].astype(numpy.float32),
    }
    dump_data_and_model(named_inputs, PipeConcatenateInput(model), model_onnx,
                        basename="SklearnPipelineScalerMixed-OneOff")
def test_model_normalizer(self):
    """Check that an L2 ``Normalizer`` converts to a graph with a single node."""
    model = Normalizer(norm='l2')

    initial_types = [('input', Int64TensorType([1, 1]))]
    model_onnx = convert_sklearn(model, 'scikit-learn normalizer',
                                 initial_types)
    self.assertTrue(model_onnx is not None)

    node_count = len(model_onnx.graph.node)
    self.assertTrue(node_count == 1)
####################################
# Conversion to ONNX format
# +++++++++++++++++++++++++
#
# We use module
# `onnxmltools <https://github.com/onnx/onnxmltools>`_
# to convert the model into ONNX format.

from onnxmltools import convert_sklearn
from onnxmltools.utils import save_model
from onnxmltools.convert.common.data_types import (
    FloatTensorType,
    Int64TensorType,
    DictionaryType,
    SequenceType,
)

# The single declared input, 'float_input', is a DictionaryType built from
# an Int64TensorType([1]) and a FloatTensorType([]).
initial_type = [
    ('float_input',
     DictionaryType(Int64TensorType([1]), FloatTensorType([]))),
]
onx = convert_sklearn(pipe, initial_types=initial_type)
save_model(onx, "pipeline_vectorize.onnx")

##################################
# We load the model with ONNX Runtime and look at
# its input and output.

import onnxruntime as rt
sess = rt.InferenceSession("pipeline_vectorize.onnx")

import numpy
inp = sess.get_inputs()[0]
out = sess.get_outputs()[0]
print("input name='{}' and shape={} and type={}".format(
    inp.name, inp.shape, inp.type))
print("output name='{}' and shape={} and type={}".format(
    out.name, out.shape, out.type))
def test_model_imputer(self):
    """Check that a mean ``Imputer`` converts with a ``[1, 2]`` int64 input."""
    training_rows = [[1, 2], [np.nan, 3], [7, 6]]
    model = Imputer(missing_values='NaN', strategy='mean', axis=0)
    model.fit(training_rows)

    initial_types = [('input', Int64TensorType([1, 2]))]
    model_onnx = convert_sklearn(model, 'scikit-learn imputer', initial_types)
    self.assertTrue(model_onnx is not None)
def test_standard_scaler(self):
    """Check that a fitted ``StandardScaler`` converts with a ``[1, 3]`` int64 input."""
    training_rows = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
    model = StandardScaler()
    model.fit(training_rows)

    initial_types = [('input', Int64TensorType([1, 3]))]
    model_onnx = convert_sklearn(model, 'scaler', initial_types)
    self.assertTrue(model_onnx is not None)