def test_sklearn_knn_imputer(self):
    """Convert a fitted ``KNNImputer`` with target opsets 9, 10 and 11.

    The imputer is fitted on a small float32 matrix containing NaNs. For
    each opset the converted model must not be None and is then passed,
    with the NaN-containing test matrix, to ``dump_data_and_model``.
    """
    nan = numpy.nan
    train_rows = [
        [1, 2, nan, 12],
        [3, nan, 3, 13],
        [1, 4, nan, 1],
        [nan, 4, 3, 12],
    ]
    test_rows = [
        [1.3, 2.4, nan, 1],
        [-1.3, nan, 3.1, nan],
    ]
    x_train = numpy.array(train_rows, dtype=numpy.float32)
    x_test = numpy.array(test_rows, dtype=numpy.float32)

    imputer = KNNImputer(n_neighbors=3, metric='nan_euclidean')
    model = imputer.fit(x_train)

    for opset in (9, 10, 11):
        initial_types = [
            ("input", FloatTensorType((None, x_test.shape[1]))),
        ]
        model_onnx = convert_sklearn(
            model, "KNN imputer", initial_types, target_opset=opset)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            x_test, model, model_onnx, basename="SklearnKNNImputer")
def test_model_knn_regressor_double(self):
    """Convert a 2-neighbour ``KNeighborsRegressor`` with a double input.

    The conversion uses the ``'optim': 'cdist'`` option. If onnxruntime
    cannot load the model because it lacks ``TopK(11)``, the test returns
    early; any other ``OrtImpl`` error is re-raised.
    """
    model, X = self._fit_model(KNeighborsRegressor(n_neighbors=2))
    converter_options = {id(model): {'optim': 'cdist'}}
    model_onnx = convert_sklearn(
        model,
        "KNN regressor",
        [("input", DoubleTensorType([None, 4]))],
        target_opset=TARGET_OPSET,
        options=converter_options,
    )
    self.assertIsNotNone(model_onnx)

    missing_topk = ("Could not find an implementation for the node "
                    "To_TopK:TopK(11)")
    try:
        InferenceSession(model_onnx.SerializeToString())
    except OrtImpl as e:
        if missing_topk not in str(e):
            raise e
        # onnxruntime does not declare TopK(11) for double
        return

    dump_data_and_model(
        X.astype(numpy.float64)[:7],
        model,
        model_onnx,
        basename="SklearnKNeighborsRegressor64",
    )
def test_model_bayesian_mixture_binary_classification(self):
    """Convert ``BayesianGaussianMixture`` for every covariance type.

    Each covariance type runs in its own ``subTest``: the model is fitted
    on iris through ``_fit_model_binary_classification``, converted,
    dumped, and finally passed to ``_test_score``.
    """
    covariance_types = ("full", "tied", "diag", "spherical")
    for cov in covariance_types:
        with self.subTest(cov=cov):
            model, X = self._fit_model_binary_classification(
                BayesianGaussianMixture(),
                load_iris(),
                covariance_type=cov,
            )
            initial_types = [
                ("input", FloatTensorType([None, X.shape[1]])),
            ]
            model_onnx = convert_sklearn(
                model,
                "gaussian_mixture",
                initial_types,
                target_opset=TARGET_OPSET,
            )
            self.assertIsNotNone(model_onnx)
            failure_condition = (
                "StrictVersion(onnxruntime.__version__)"
                "<= StrictVersion('0.2.1')"
            )
            dump_data_and_model(
                X,
                model,
                model_onnx,
                basename="SklearnBinBayesianGaussianMixture",
                allow_failure=failure_condition,
            )
            self._test_score(model, X, TARGET_OPSET)
def test_remainder_passthrough(self):
    """Convert a pipeline whose ``ColumnTransformer`` passes columns through.

    The pipeline applies a ``FunctionTransformer`` to column ``X1``, keeps
    the remainder, and feeds a ``LogisticRegression``. The ONNX inputs are
    derived one per DataFrame column from the column dtypes.
    """

    def convert_dataframe_schema(df, drop=None):
        # One (name, tensor type) pair per kept column; anything that is
        # neither int64 nor float64 is declared as a string tensor.
        schema = []
        for name, dtype in zip(df.columns, df.dtypes):
            if drop is not None and name in drop:
                continue
            if dtype == "int64":
                tensor_cls = Int64TensorType
            elif dtype == "float64":
                tensor_cls = FloatTensorType
            else:
                tensor_cls = StringTensorType
            schema.append((name, tensor_cls([None, 1])))
        return schema

    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    frame = pandas.DataFrame(X, columns=["X1", "X2"])

    selector = ColumnTransformer(
        [("id", FunctionTransformer(), ["X1"])],
        remainder="passthrough",
    )
    pipe = Pipeline(steps=[
        ("select", selector),
        ("logreg", LogisticRegression()),
    ])
    pipe.fit(frame[["X1", "X2"]], y)

    inputs = convert_dataframe_schema(frame)
    model_onnx = convert_sklearn(
        pipe,
        "scikit-learn function_transformer",
        inputs,
        target_opset=TARGET_OPSET,
    )
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.3') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        frame[:5],
        pipe,
        model_onnx,
        basename="SklearnFunctionTransformer-DF",
        allow_failure=failure_condition,
    )
def test_feature_union_transformer_weights_0(self):
    """Convert a weighted ``FeatureUnion`` of two scalers fitted on iris.

    ``StandardScaler`` is weighted by 2 and ``MinMaxScaler`` by 4. The
    union is fitted on one half of the float32 data and the other half is
    passed to ``dump_data_and_model``.
    """
    iris = load_iris()
    features = iris.data.astype(np.float32)
    target = iris.target
    X_train, X_test, *_ = train_test_split(
        features, target, test_size=0.5, random_state=42)

    union = FeatureUnion(
        [('standard', StandardScaler()), ('minmax', MinMaxScaler())],
        transformer_weights={'standard': 2, 'minmax': 4},
    )
    model = union.fit(X_train)

    initial_types = [('input', FloatTensorType([None, X_test.shape[1]]))]
    model_onnx = convert_sklearn(model, 'feature union', initial_types)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnFeatureUnionTransformerWeights0",
    )
def test_model_tfidf_vectorizer11_empty_string(self):
    """Convert a unigram ``TfidfVectorizer`` whose corpus has an empty string.

    The corpus is reshaped to a (4, 1) column, the vectorizer is fitted on
    its flattened view, and conversion uses ``self.get_options()``.
    """
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        '',
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect,
        'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options=self.get_options(),
    )
    self.assertTrue(model_onnx is not None)

    # TfidfVectorizer in onnxruntime fails with empty strings
    failure_condition = (
        "StrictVersion(onnxruntime.__version__) "
        "<= StrictVersion('0.4.0')"
    )
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer11EmptyStringRegex-OneOff-SklCol",
        allow_failure=failure_condition,
    )
def test_model_logistic_regression_binary_class_decision_function(self):
    """Convert a binary ``LogisticRegression`` with ``raw_scores`` enabled.

    The option is keyed by the estimator class. The first five rows are
    compared on ``predict`` and ``decision_function_binary``.
    """
    estimator = linear_model.LogisticRegression(max_iter=10000)
    model, X = fit_classification_model(estimator, 2)

    class_options = {
        linear_model.LogisticRegression: {'raw_scores': True},
    }
    model_onnx = convert_sklearn(
        model,
        "logistic regression",
        [("input", FloatTensorType([None, X.shape[1]]))],
        options=class_options,
    )
    self.assertIsNotNone(model_onnx)

    # Operator cast-1 is not implemented in onnxruntime
    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.3') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X[:5],
        model,
        model_onnx,
        basename="SklearnLogitisticRegressionBinaryRawScore",
        allow_failure=failure_condition,
        methods=['predict', 'decision_function_binary'],
    )
def test_way3_mixin(self):
    """Wrap an already fitted ``KMeans`` with ``wrap_as_onnx_mixin``.

    A ``KeyError`` naming one of the Gaussian process converters ends the
    test early. Calling ``to_onnx()`` without data may raise a
    ``RuntimeError`` mentioning ``enumerate_initial_types``; the conversion
    is then performed with float32 data.
    """
    X = np.arange(20).reshape(10, 2)
    tr = KMeans(n_clusters=2)
    tr.fit(X)

    try:
        tr_mixin = wrap_as_onnx_mixin(tr, target_opset=TARGET_OPSET)
    except KeyError as e:
        message = str(e)
        assert ("SklearnGaussianProcessRegressor" in message
                or "SklearnGaussianProcessClassifier" in message)
        return

    try:
        # The result is discarded: the model is converted again below.
        tr_mixin.to_onnx()
    except RuntimeError as e:
        assert "Method enumerate_initial_types" in str(e)

    onx = tr_mixin.to_onnx(X.astype(np.float32))
    dump_data_and_model(
        X.astype(np.float32),
        tr,
        onx,
        basename="MixinWay3OnnxMixin",
    )
def test_model_count_vectorizer12(self):
    """Convert a ``CountVectorizer`` using unigrams and bigrams.

    The four-document corpus is reshaped to a (4, 1) column and the
    vectorizer is fitted on its flattened view.
    """
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = CountVectorizer(ngram_range=(1, 2))
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect,
        "CountVectorizer",
        [("input", StringTensorType([1]))],
        target_opset=TARGET_OPSET,
    )
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.3.0')"
    )
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnCountVectorizer12-OneOff-SklCol",
        allow_failure=failure_condition,
    )
def test_pca_default_int_randomised(self):
    """Convert a randomized-solver ``PCA`` fitted on digits, with int64 input.

    The declared input shape is the full shape of the test split, and the
    test split is cast to int64 before being dumped.
    """
    digits = load_digits()
    X_train, X_test, *_ = train_test_split(
        digits.data, digits.target, test_size=0.2, random_state=42)

    pca = PCA(random_state=42, svd_solver='randomized', iterated_power=3)
    model = pca.fit(X_train)

    input_spec = ("input", Int64TensorType(shape=X_test.shape))
    model_onnx = convert_sklearn(model, initial_types=[input_spec])
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion("
        "onnxruntime.__version__)"
        "<= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X_test.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnPCADefaultIntRandomised",
        allow_failure=failure_condition,
    )
def test_way4_mixin_fit(self):
    """Wrap an unfitted ``KMeans`` with ``wrap_as_onnx_mixin``, then fit it.

    A ``KeyError`` naming one of the Gaussian process converters ends the
    test early. When ``TARGET_OPSET`` is 11, the textual form of the model
    must contain both ``version: 11`` and ``ir_version: 6``.
    """
    X = np.arange(20).reshape(10, 2)
    try:
        tr = wrap_as_onnx_mixin(
            KMeans(n_clusters=2), target_opset=TARGET_OPSET)
    except KeyError as e:
        message = str(e)
        assert ("SklearnGaussianProcessRegressor" in message
                or "SklearnGaussianProcessClassifier" in message)
        return

    tr.fit(X)
    onx = tr.to_onnx(X.astype(np.float32))

    if TARGET_OPSET == 11:
        sonx = str(onx)
        versions_ok = "version: 11" in sonx and "ir_version: 6" in sonx
        if not versions_ok:
            raise AssertionError(
                "Issue with TARGET_OPSET: {}\n{}".format(TARGET_OPSET, sonx))

    dump_data_and_model(
        X.astype(np.float32),
        tr,
        onx,
        basename="MixinWay4OnnxMixin2",
    )
def test_model_sgd_multi_class_log_decision_function(self):
    """Convert a 3-class log-loss ``SGDClassifier`` with ``raw_scores``.

    The option is keyed by ``id(model)``. The float32 data is compared on
    ``predict`` and ``decision_function``.
    """
    estimator = SGDClassifier(loss='log', random_state=42)
    model, X = fit_classification_model(estimator, 3)
    options = {id(model): {'raw_scores': True}}

    initial_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "scikit-learn SGD multi-class classifier",
        initial_types,
        options=options,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)

    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSGDClassifierMultiLogDecisionFunction-Dec3",
        allow_failure=failure_condition,
        methods=['predict', 'decision_function'],
    )
def common_test_cast_regressor(self, dtype, input_type):
    """Fit, convert and dump a ``CastRegressor`` around a depth-2 tree.

    :param dtype: passed to ``CastRegressor`` and compared with the
        ``dtype`` attribute of the fitted wrapper
    :param input_type: only used to build the dump basename, through
        ``input_type.__class__.__name__``
    """
    model = CastRegressor(DecisionTreeRegressor(max_depth=2), dtype=dtype)
    rows = [
        [0.1, 0.2, 3.1],
        [1, 1, 0],
        [0, 2, 1],
        [1, 0, 2],
        [0.1, 2.1, 1.1],
        [1.1, 0.1, 2.2],
        [-0.1, -2.1, -1.1],
        [-1.1, -0.1, -2.2],
        [0.2, 2.2, 1.2],
        [1.2, 0.2, 2.2],
    ]
    data = numpy.array(rows, dtype=numpy.float32)
    # Target: row sums plus random noise (no seed is set here).
    noise = numpy.random.randn(data.shape[0])
    y = numpy.sum(data, axis=1, keepdims=0) + noise
    model.fit(data, y)

    # NOTE(review): this inspects the wrapper's own ``dtype`` attribute,
    # not the dtype of a prediction -- confirm that this is intended.
    pred = model
    assert pred.dtype == dtype

    model_onnx = convert_sklearn(
        model,
        "cast",
        [("input", FloatTensorType([None, 3]))],
        target_opset=TARGET_OPSET,
    )
    self.assertTrue(model_onnx is not None)

    basename = "SklearnCastRegressor{}".format(
        input_type.__class__.__name__)
    dump_data_and_model(data, model, model_onnx, basename=basename)
def test_bagging_classifier_sgd_binary_decision_function(self):
    """Convert a binary ``BaggingClassifier`` of SGD models with raw scores.

    The first five rows are compared on ``predict`` and
    ``decision_function_binary``.
    """
    bagging = BaggingClassifier(
        SGDClassifier(random_state=42), random_state=42)
    model, X = fit_classification_model(bagging, 2)
    options = {id(model): {'raw_scores': True}}

    initial_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "bagging classifier",
        initial_types,
        dtype=np.float32,
        options=options,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)

    failure_condition = (
        "StrictVersion(onnxruntime.__version__)"
        "<= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X[:5],
        model,
        model_onnx,
        basename="SklearnBaggingClassifierSGDBinaryDecisionFunction-Dec3",
        allow_failure=failure_condition,
        methods=['predict', 'decision_function_binary'],
    )
def test_model_sgd_multi_class_elasticnet_power_t(self):
    """Convert a 5-class elastic-net ``SGDClassifier`` with ``power_t=2``.

    No target opset is passed to the converter; the data is cast to
    float32 before being dumped.
    """
    estimator = SGDClassifier(
        penalty='elasticnet', l1_ratio=0.3, power_t=2, random_state=42)
    model, X = fit_classification_model(estimator, 5)

    initial_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model, "scikit-learn SGD multi-class classifier", initial_types)
    self.assertIsNotNone(model_onnx)

    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSGDClassifierMultiElasticnetPowerT-Out0",
        allow_failure=failure_condition,
    )
def test_model_logistic_regression_multi_class_nocl(self):
    """Convert a 4-class string-label ``LogisticRegression`` with ``nocl``.

    The textual form of the converted model must contain neither
    ``classlabels_strings`` nor ``cl0``; the fitted classes are passed
    explicitly to ``dump_data_and_model``.
    """
    estimator = linear_model.LogisticRegression(max_iter=10000)
    model, X = fit_classification_model(estimator, 4, label_string=True)

    model_options = {id(model): {'nocl': True}}
    model_onnx = convert_sklearn(
        model,
        "multi-class logistic regression",
        [("input", FloatTensorType([None, X.shape[1]]))],
        options=model_options,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)

    sonx = str(model_onnx)
    assert 'classlabels_strings' not in sonx
    assert 'cl0' not in sonx

    dump_data_and_model(
        X,
        model,
        model_onnx,
        classes=model.classes_,
        basename="SklearnLogitisticRegressionMultiNoCl",
    )
def test_model_sgd_multi_class_log_int(self):
    """Convert a 5-class log-loss ``SGDClassifier`` with an int64 input.

    The model is fitted with ``is_int=True`` and only rows 6 and 7 of the
    returned data are dumped.
    """
    estimator = SGDClassifier(loss='log', random_state=42)
    model, X = fit_classification_model(estimator, 5, is_int=True)

    initial_types = [("input", Int64TensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model, "scikit-learn SGD multi-class classifier", initial_types)

    X = X[6:8]
    self.assertIsNotNone(model_onnx)

    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnSGDClassifierMultiLogInt",
        allow_failure=failure_condition,
    )
def test_model_logistic_regression_multi_class_saga_elasticnet(self):
    """Convert a 3-class ``LogisticRegression`` trained with the saga solver.

    The elastic-net penalty (``l1_ratio=0.1``) is only requested when
    ``_sklearn_version()`` is at least 0.21.0; older versions use the
    solver's default penalty.
    """
    if _sklearn_version() < StrictVersion('0.21.0'):
        estimator = linear_model.LogisticRegression(
            solver='saga', max_iter=10000)
    else:
        estimator = linear_model.LogisticRegression(
            solver='saga',
            penalty='elasticnet',
            l1_ratio=0.1,
            max_iter=10000,
        )
    model, X = fit_classification_model(estimator, 3)

    initial_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "multi-class logistic regression",
        initial_types,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnLogitisticRegressionMultiSagaElasticnet",
    )
def test_model_tfidf_vectorizer11_word4(self):
    """Convert a unigram ``TfidfVectorizer`` with a 1-to-4 letter token pattern.

    Conversion uses ``self.get_options()`` and ``TARGET_OPSET``.
    """
    documents = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(
        ngram_range=(1, 1), norm=None, token_pattern="[a-zA-Z]{1,4}")
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect,
        'TfidfVectorizer',
        [('input', StringTensorType([1]))],
        options=self.get_options(),
        target_opset=TARGET_OPSET,
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer11Regex4-OneOff-SklCol",
    )
def test_model_logistic_regression_binary_class_string(self):
    """Convert a binary ``LogisticRegression`` trained on string labels.

    With onnxruntime 1.0.0 or later, the first output of the loaded model
    must be declared as ``tensor(string)`` with shape ``[None]``.
    """
    estimator = linear_model.LogisticRegression(max_iter=100)
    model, X = fit_classification_model(estimator, 2, label_string=True)

    initial_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "logistic regression",
        initial_types,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        X, model, model_onnx, basename="SklearnLogitisticRegressionBinary")

    # The output metadata is only checked for onnxruntime >= 1.0.0.
    if StrictVersion(ort_version) < StrictVersion("1.0.0"):
        return

    sess = InferenceSession(model_onnx.SerializeToString())
    first_output = sess.get_outputs()[0]
    self.assertEqual(str(first_output.type), "tensor(string)")
    self.assertEqual(first_output.shape, [None])
def test_model_tfidf_vectorizer11_short_word_spaces(self):
    """Convert a word-analyzer ``TfidfVectorizer`` using pattern ``.{1,3}``.

    The token pattern matches any one to three characters, so tokens may
    contain spaces. No options or target opset are passed to the converter.
    """
    documents = [
        'This is the first document.',
        'This document is the second document.',
    ]
    corpus = numpy.array(documents).reshape((2, 1))

    vect = TfidfVectorizer(
        ngram_range=(1, 1),
        norm=None,
        analyzer='word',
        token_pattern=".{1,3}",
    )
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect, 'TfidfVectorizer', [('input', StringTensorType([1]))])
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnxruntime.__version__) <= "
        "StrictVersion('0.3.0')"
    )
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer11CharW2-OneOff-SklCol",
        allow_failure=failure_condition,
        verbose=False,
    )
def test_rfecv_int(self):
    """Convert an ``RFECV`` around a linear ``SVR`` fitted on int64 data.

    The declared input has a first dimension of 1, and only the
    ``transform`` method is compared.
    """
    rows = [
        [1, 2, 3, 1],
        [0, 3, 1, 4],
        [3, 5, 6, 1],
        [1, 2, 1, 5],
    ]
    X = np.array(rows, dtype=np.int64)
    y = np.array([0, 1, 0, 1])

    model = RFECV(estimator=SVR(kernel="linear"), cv=3)
    model.fit(X, y)

    initial_types = [("input", Int64TensorType([1, X.shape[1]]))]
    model_onnx = convert_sklearn(model, "rfecv", initial_types)
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnRFECV",
        methods=["transform"],
        allow_failure=failure_condition,
    )
def test_model_tfidf_vectorizer13(self):
    """Convert a ``TfidfVectorizer`` using 1- to 3-grams.

    The input is declared as a ``[1, 1]`` string tensor and conversion
    uses ``self.get_options()``.
    """
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
    corpus = numpy.array(documents).reshape((4, 1))

    vect = TfidfVectorizer(ngram_range=(1, 3), norm=None)
    vect.fit(corpus.ravel())

    model_onnx = convert_sklearn(
        vect,
        "TfidfVectorizer",
        [("input", StringTensorType([1, 1]))],
        options=self.get_options(),
    )
    self.assertTrue(model_onnx is not None)

    failure_condition = (
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.3.0')"
    )
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer13-OneOff-SklCol",
        allow_failure=failure_condition,
    )
def test_grid_search_gaussian_regressor_double(self):
    """Convert a ``GridSearchCV`` over ``GaussianProcessRegressor`` in double.

    The grid searches four log-spaced ``alpha`` values with 3-fold
    cross-validation. Both the declared input and the dumped data are
    float64.
    """
    tuned_parameters = [{'alpha': np.logspace(-4, -0.5, 4)}]
    search = GridSearchCV(GaussianProcessRegressor(), tuned_parameters, cv=3)
    model, X = fit_regression_model(search)

    initial_types = [("input", DoubleTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "GridSearchCV",
        initial_types,
        dtype=np.float64,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)

    basename = ("SklearnGridSearchGaussianRegressionDouble"
                "-OneOffArray-Dec4")
    failure_condition = (
        "StrictVersion("
        "onnxruntime.__version__) "
        "<= StrictVersion('0.4.0') or "
        "StrictVersion(onnx.__version__) "
        "== StrictVersion('1.4.1')"
    )
    dump_data_and_model(
        X.astype(np.float64),
        model,
        model_onnx,
        basename=basename,
        allow_failure=failure_condition,
    )
def test_model_tfidf_vectorizer11_custom_vocabulary(self):
    """Convert a unigram ``TfidfVectorizer`` built on a fixed vocabulary.

    After fitting, the vectorizer is expected to have no ``stop_words_``
    attribute. The input is declared as a ``StringTensorType`` without a
    shape.
    """
    documents = [
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ]
    corpus = numpy.array(documents).reshape((4, 1))
    vocabulary = ["first", "second", "third", "document", "this"]

    vect = TfidfVectorizer(
        ngram_range=(1, 1), norm=None, vocabulary=vocabulary)
    vect.fit(corpus.ravel())
    self.assertFalse(hasattr(vect, "stop_words_"))

    model_onnx = convert_sklearn(
        vect,
        "TfidfVectorizer",
        [("input", StringTensorType())],
        options=self.get_options(),
        target_opset=TARGET_OPSET,
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        corpus,
        vect,
        model_onnx,
        basename="SklearnTfidfVectorizer11CustomVocab-OneOff-SklCol",
    )
def test_grid_search_binary_int(self):
    """Convert a ``GridSearchCV`` over ``LogisticRegression`` with int64 input.

    The grid searches four log-spaced ``C`` values with 5-fold
    cross-validation on a binary problem fitted with ``is_int=True``.
    """
    tuned_parameters = [{'C': np.logspace(-1, 0, 4)}]
    base_estimator = LogisticRegression(
        random_state=42,
        max_iter=100,
        solver='lbfgs',
        multi_class='ovr',
    )
    search = GridSearchCV(base_estimator, tuned_parameters, cv=5)
    model, X = fit_classification_model(search, n_classes=2, is_int=True)

    initial_types = [("input", Int64TensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "GridSearchCV",
        initial_types,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)

    failure_condition = (
        "StrictVersion("
        "onnxruntime.__version__)"
        "<= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnGridSearchBinaryInt-Dec4",
        allow_failure=failure_condition,
    )
def test_model_knn_multi_class_nocl(self):
    """Convert a string-label ``KNeighborsClassifier`` with ``nocl``.

    The model is fitted on 2 classes. The textual form of the converted
    model must contain neither ``classlabels_strings`` nor ``cl0``; the
    fitted classes are passed explicitly to ``dump_data_and_model``.
    """
    model, X = fit_classification_model(
        KNeighborsClassifier(), 2, label_string=True)

    model_options = {id(model): {'nocl': True}}
    model_onnx = convert_sklearn(
        model,
        "KNN multi-class nocl",
        [("input", FloatTensorType([None, X.shape[1]]))],
        options=model_options,
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)

    sonx = str(model_onnx)
    assert 'classlabels_strings' not in sonx
    assert 'cl0' not in sonx

    dump_data_and_model(
        X,
        model,
        model_onnx,
        classes=model.classes_,
        basename="SklearnKNNMultiNoCl",
        verbose=False,
    )
def test_model_sgd_binary_class_log_l1_no_intercept(self):
    """Convert a binary log-loss ``SGDClassifier`` with L1 and no intercept.

    No target opset is passed to the converter; the data is cast to
    float32 before being dumped.
    """
    estimator = SGDClassifier(
        loss='log',
        penalty='l1',
        fit_intercept=False,
        random_state=42,
    )
    model, X = fit_classification_model(estimator, 2)

    initial_types = [("input", FloatTensorType([None, X.shape[1]]))]
    model_onnx = convert_sklearn(
        model, "scikit-learn SGD binary classifier", initial_types)
    self.assertIsNotNone(model_onnx)

    failure_condition = (
        "StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSGDClassifierBinaryLogL1NoIntercept-Dec4",
        allow_failure=failure_condition,
    )
def test_model_knn_classifier_multilabel(self):
    """Convert a multilabel ``KNeighborsClassifier`` with ``zipmap`` disabled.

    The textual form of the converted model must not mention ``zipmap``
    (case-insensitive).
    """
    model, X_test = fit_multilabel_classification_model(
        KNeighborsClassifier(),
        n_classes=7,
        n_labels=3,
        n_samples=100,
        n_features=10,
    )
    options = {id(model): {'zipmap': False}}

    initial_types = [("input", FloatTensorType([None, X_test.shape[1]]))]
    model_onnx = convert_sklearn(
        model,
        "scikit-learn KNN Classifier",
        initial_types,
        options=options,
        target_opset=TARGET_OPSET,
    )
    self.assertTrue(model_onnx is not None)
    assert 'zipmap' not in str(model_onnx).lower()

    failure_condition = (
        "StrictVersion("
        "onnxruntime.__version__) <= StrictVersion('0.2.1')"
    )
    dump_data_and_model(
        X_test,
        model,
        model_onnx,
        basename="SklearnKNNClassifierMultiLabel-Out0",
        allow_failure=failure_condition,
    )
def test_combine_inputs_union_in_pipeline(self):
    """Convert a scaler + ``FeatureUnion`` pipeline fed by two ONNX inputs.

    The pipeline is fitted on a two-column float32 matrix, while the ONNX
    graph declares one single-column input per feature. The last node of
    the converted graph must produce exactly one output. The runtime
    inputs are passed as a dict with one column per input name, and the
    scikit-learn side is wrapped in ``PipeConcatenateInput``.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    data = numpy.array(
        [[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]],
        dtype=numpy.float32,
    )
    union = FeatureUnion([
        ("scaler2", StandardScaler()),
        ("scaler3", MinMaxScaler()),
    ])
    model = Pipeline([
        ("scaler1", StandardScaler()),
        ("union", union),
    ])
    model.fit(data)

    model_onnx = convert_sklearn(
        model,
        "pipeline",
        [
            ("input1", FloatTensorType([None, 1])),
            ("input2", FloatTensorType([None, 1])),
        ],
    )
    # Check for None before dereferencing the model; the original order
    # made this assertion unreachable when the conversion returned None.
    self.assertIsNotNone(model_onnx)
    # assertEqual reports the actual output count on failure.
    self.assertEqual(len(model_onnx.graph.node[-1].output), 1)

    inputs = {
        "input1": data[:, 0].reshape((-1, 1)),
        "input2": data[:, 1].reshape((-1, 1)),
    }
    dump_data_and_model(
        inputs,
        PipeConcatenateInput(model),
        model_onnx,
        basename="SklearnPipelineScaler11Union",
    )