コード例 #1
0
    def test_pipeline_voting_tfidf_svc(self):
        # Fits a soft VotingClassifier over three text pipelines (SVC, SGD
        # and multinomial naive Bayes, each behind its own TfidfVectorizer),
        # converts it to ONNX with the 'zipmap' option turned off, and checks
        # that the ONNX labels and probabilities match scikit-learn's.
        pipe1 = Pipeline([
            ('tfidf1', TfidfVectorizer()),
            ('svc', SVC(probability=True, kernel='linear'))])
        pipe2 = Pipeline([
            ('tfidf2', TfidfVectorizer(norm='l2', use_idf=False)),
            ('sgd', SGDClassifier(alpha=0.0001, penalty='l2',
                                  loss='modified_huber'))])
        pipe3 = Pipeline([
            ('tfidf3', TfidfVectorizer()),
            ('mnb', MultinomialNB())])
        voting = VotingClassifier(
            [('p1', pipe1), ('p2', pipe2), ('p3', pipe3)],
            voting='soft', flatten_transform=False)
        data = numpy.array(["first sentance", "second sentence",
                            "many sentances", "dummy sentance",
                            "no sentance at all"])
        y = numpy.array([0, 0, 1, 0, 1])
        voting.fit(data, y)
        expected_label = voting.predict(data)
        expected_proba = voting.predict_proba(data)

        model_onnx = convert_sklearn(
            voting, initial_types=[('text', StringTensorType([None, 1]))],
            target_opset=TARGET_OPSET,
            options={id(voting): {'zipmap': False}})
        sess = InferenceSession(model_onnx.SerializeToString())
        # The input is declared as [None, 1], so the flat array of
        # sentences is fed as a column vector.
        got = sess.run(None, {'text': data.reshape((-1, 1))})
        # Output 0 is compared with the labels, output 1 with the
        # probabilities (to 5 decimals).
        assert_almost_equal(expected_proba, got[1], decimal=5)
        assert_almost_equal(expected_label, got[0])
コード例 #2
0
 def test_pipeline_tfidf_pipeline_minmax(self):
     # Fits a one-step pipeline whose ColumnTransformer applies a separate
     # TfidfVectorizer to column 0 ("subject", weighted 0.8) and column 1
     # ("body") of the extracted 20 newsgroups data, converts it to ONNX
     # with a custom separator list, and hands two rows to
     # dump_data_and_model.
     newsgroups = fetch_20newsgroups(
         random_state=1,
         subset="train",
         categories=["alt.atheism", "talk.religion.misc"],
     )
     subject_body = SubjectBodyExtractor().fit_transform(newsgroups.data)
     union = ColumnTransformer(
         [
             ("subject", TfidfVectorizer(min_df=50), 0),
             ("body", TfidfVectorizer(min_df=40), 1),
         ],
         transformer_weights={"subject": 0.8},
     )
     pipeline = Pipeline([("union", union)])
     pipeline.fit(subject_body[:300])
     # Separators passed to the converter through the "sep" option.
     separators = [
         " ", ".", "?", ",", ";", ":", "!", "(", ")",
         "\n", '"', "'", "-", "[", "]", "@",
     ]
     model_onnx = convert_sklearn(
         pipeline,
         "tfidf",
         initial_types=[("input", StringTensorType([1, 2]))],
         options={TfidfVectorizer: {"sep": separators}},
     )
     test_data = np.array([
         ["Albert Einstein", "Not relatively."],
         ["Alan turing", "Not automatically."],
     ])
     dump_data_and_model(
         test_data,
         pipeline,
         model_onnx,
         verbose=False,
         basename="SklearnDocumentationTfIdfUnion1-OneOff-Dec2",
         allow_failure=(
             "StrictVersion(onnx.__version__) <= StrictVersion('1.3') or "
             "StrictVersion(onnxruntime.__version__)"
             " <= StrictVersion('0.2.1')"
         ),
     )
コード例 #3
0
 def test_model_tfidf_vectorizer11_out_vocabulary(self):
     # Fits a unigram, un-normalised TfidfVectorizer on four sentences,
     # converts it to ONNX, then hands dump_data_and_model the same
     # sentences prefixed with tokens that were not in the fitting corpus.
     corpus = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the first document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(corpus.ravel())
     model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                  [('input', StringTensorType([1, 1]))])
     self.assertIsNotNone(model_onnx)
     # Evaluation corpus: every document starts with two unseen tokens.
     corpus = numpy.array([
         'AZZ ZZ This is the first document.',
         'BZZ ZZ This document is the second document.',
         'ZZZ ZZ And this is the third one.',
         'WZZ ZZ Is this the first document?',
     ]).reshape((4, 1))
     dump_data_and_model(
         corpus,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol",
         allow_failure=
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.2.1')")
コード例 #4
0
 def test_ordinal_encoder_mixed_string_int_drop(self):
     # Fits an OrdinalEncoder on rows made of two string columns and one
     # integer column, converts it with one ONNX input per dtype, and
     # hands a single row to dump_data_and_model.
     train_rows = [
         ["c0.4", "c0.2", 3],
         ["c1.4", "c1.2", 0],
     ] + [["c0.2", "c2.2", 1] for _ in range(4)]
     probe_rows = [["c0.2", "c2.2", 1]]
     encoder = OrdinalEncoder(categories="auto")
     encoder.fit(train_rows)
     # The two string columns and the integer column are declared as
     # separate inputs.
     input_types = [
         ("input1", StringTensorType([None, 2])),
         ("input2", Int64TensorType([None, 1])),
     ]
     model_onnx = convert_sklearn(
         encoder,
         "ordinal encoder",
         input_types,
         target_opset=TARGET_OPSET,
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         probe_rows,
         encoder,
         model_onnx,
         basename="SklearnOrdinalEncoderMixedStringIntDrop",
         allow_failure=(
             "pv.Version(onnxruntime.__version__)"
             "<= pv.Version('0.5.0')"
         ),
     )
 def test_model_tfidf_vectorizer11_out_vocabulary(self):
     # Fits a unigram, un-normalised TfidfVectorizer, converts it with the
     # options returned by self.get_options(), then hands
     # dump_data_and_model the same sentences prefixed with tokens that
     # were not in the fitting corpus.
     sentences = [
         "This is the first document.",
         "This document is the second document.",
         "And this is the third one.",
         "Is this the first document?",
     ]
     fit_corpus = numpy.array(sentences).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(fit_corpus.ravel())
     model_onnx = convert_sklearn(
         vect,
         "TfidfVectorizer",
         [("input", StringTensorType([1, 1]))],
         options=self.get_options(),
     )
     self.assertTrue(model_onnx is not None)
     # Each evaluation document is a fitted sentence behind two unseen
     # tokens.
     prefixes = ["AZZ ZZ ", "BZZ ZZ ", "ZZZ ZZ ", "WZZ ZZ "]
     unseen_corpus = numpy.array(
         [prefix + text for prefix, text in zip(prefixes, sentences)]
     ).reshape((4, 1))
     dump_data_and_model(
         unseen_corpus,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol",
         allow_failure=(
             "StrictVersion(onnxruntime.__version__) <= "
             "StrictVersion('0.4.0')"
         ),
     )
 def test_model_tfidf_vectorizer11parenthesis_class(self):
     # Fits a unigram TfidfVectorizer on a corpus containing a
     # parenthesised word and converts it with an escaped "separators"
     # list registered for the TfidfVectorizer class.
     documents = numpy.array([
         "This is the first document.",
         "This document is the second document.",
         "And this is the third one.",
         "Is this the (first) document?",
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(documents.ravel())
     # Punctuation entries are written with a leading backslash.
     separators = [" ", "\\.", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"]
     options = {TfidfVectorizer: {"separators": separators}}
     model_onnx = convert_sklearn(
         vect,
         "TfidfVectorizer",
         [("input", StringTensorType([1, 1]))],
         options=options,
     )
     self.assertTrue(model_onnx is not None)
     # This test depends on this issue:
     # https://github.com/Microsoft/onnxruntime/issues/957.
     dump_data_and_model(
         documents,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11ParenthesisClass-OneOff-SklCol",
         allow_failure=(
             "StrictVersion(onnxruntime.__version__) <= "
             "StrictVersion('0.4.0')"
         ),
     )
コード例 #7
0
    def test_model_count_vectorizer_wrong_ngram(self):
        # Fits a TfidfVectorizer producing uni- and bi-grams with an
        # explicit token pattern, converts it to ONNX and hands the
        # fitting corpus to dump_data_and_model.
        documents = numpy.array(
            ['A AABBB0', 'AAABB B1', 'AA ABBB2', 'AAAB BB3', 'AAA BBB4']
        ).reshape((5, 1))
        vect = TfidfVectorizer(
            ngram_range=(1, 2),
            token_pattern=r"(?u)\b\w\w+\b",
        )
        vect.fit(documents.ravel())

        input_types = [('input', StringTensorType([1]))]
        model_onnx = convert_sklearn(
            vect, 'TfidfVectorizer', input_types,
            target_opset=TARGET_OPSET)

        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            documents,
            vect,
            model_onnx,
            basename="SklearnTfidfVectorizer12Wngram-OneOff-SklCol",
            allow_failure=(
                "pv.Version(onnxruntime.__version__)"
                " <= pv.Version('0.3.0')"
            ))
コード例 #8
0
    def test_pipeline_column_transformer(self):
        # Builds a frame from the first three iris features plus two
        # derived string columns, fits a ColumnTransformer + logistic
        # regression pipeline on it, converts the pipeline to ONNX and
        # hands the first 11 rows to dump_data_and_model.
        iris = datasets.load_iris()
        frame = pandas.DataFrame(iris.data[:, :3], columns=["vA", "vB", "vC"])
        # Two categorical columns obtained by thresholding vA and vB at 0.5.
        frame["vcat"] = frame["vA"].apply(
            lambda v: "cat1" if v > 0.5 else "cat2")
        frame["vcat2"] = frame["vB"].apply(
            lambda v: "cat3" if v > 0.5 else "cat4")
        # Binary target: parity of the iris class index.
        target = iris.target % 2

        # Columns are addressed by position.
        numeric_columns = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_columns = [3, 4]  # ["vcat", "vcat2"]

        numeric_branch = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])
        categorical_branch = Pipeline(steps=[
            ("onehot", OneHotEncoder(sparse=True, handle_unknown="ignore")),
            ("tsvd",
             TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4)),
        ])
        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_branch, numeric_columns),
            ("cat", categorical_branch, categorical_columns),
        ])

        classifier = LogisticRegression(
            C=0.01,
            class_weight={False: 0.2, True: 0.8},
            n_jobs=1,
            max_iter=10,
            solver="lbfgs",
            tol=1e-3,
        )

        model = Pipeline(steps=[
            ("precprocessor", preprocessor),
            ("classifier", classifier),
        ])
        model.fit(frame, target)

        # One ONNX input for the three float columns, one for the two
        # string columns.
        initial_type = [
            ("numfeat", FloatTensorType([None, 3])),
            ("strfeat", StringTensorType([None, 2])),
        ]

        sample = frame[:11]
        model_onnx = convert_sklearn(model, initial_types=initial_type)

        dump_data_and_model(
            sample,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipeliner")
コード例 #9
0
 def test_model_tfidf_vectorizer11parenthesis_class(self):
     # Fits a unigram TfidfVectorizer on a corpus containing a
     # parenthesised word and converts it with a "sep" list registered
     # for the TfidfVectorizer class, then hands the corpus to
     # dump_data_and_model.
     corpus = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the (first) document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(corpus.ravel())
     extra = {
         TfidfVectorizer: {
             'sep': [' ', '.', '?', ',', ';', ':', '!', '(', ')']
         }
     }
     model_onnx = convert_sklearn(vect,
                                  'TfidfVectorizer',
                                  [('input', StringTensorType([1, 1]))],
                                  options=extra)
     self.assertIsNotNone(model_onnx)
     dump_data_and_model(
         corpus,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11ParenthesisClass-OneOff-SklCol",
         allow_failure=
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.2.0') or "
         "StrictVersion(onnx.__version__) <= StrictVersion('1.3')")
コード例 #10
0
    def test_model_count_vectorizer_custom_tokenizer(self):
        # Fits a CountVectorizer whose tokenizer returns each document as
        # a single token, converts it with the separator "ZZZZ", and hands
        # the corpus to dump_data_and_model.
        documents = numpy.array(
            ['9999', '999 99', '1234', '1 2 3 4', '1 2 3 4+']
        ).reshape((5, 1))
        vect = CountVectorizer(ngram_range=(1, 1), tokenizer=lambda s: [s])
        vect.fit(documents.ravel())

        options = {CountVectorizer: {"separators": ["ZZZZ"]}}

        # The tokenizer is set to None only for the conversion call and
        # restored afterwards.
        # NOTE(review): presumably the converter rejects custom
        # tokenizers - confirm.
        custom_tokenizer, vect.tokenizer = vect.tokenizer, None
        model_onnx = convert_sklearn(
            vect,
            'CountVectorizer',
            [('input', StringTensorType([1, 1]))],
            options=options,
        )
        vect.tokenizer = custom_tokenizer

        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            documents,
            vect,
            model_onnx,
            basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol",
            allow_failure=(
                "StrictVersion(onnxruntime.__version__)"
                " <= StrictVersion('0.4.0')"
            ))
コード例 #11
0
 def test_one_hot_encoder_string_drop_first(self):
     # Fits a OneHotEncoder with drop='first' on two string columns,
     # converts it with one ONNX string input per column, and hands one
     # row to dump_data_and_model.
     train_rows = [
         ['Male', 'First'],
         ['Female', 'First'],
         ['Female', 'Second'],
     ]
     probe_rows = [['Male', 'Second']]
     encoder = OneHotEncoder(drop='first', categories='auto')
     encoder.fit(train_rows)
     input_types = [
         (name, StringTensorType([None, 1]))
         for name in ("input1", "input2")
     ]
     model_onnx = convert_sklearn(
         encoder,
         "one-hot encoder",
         input_types,
         target_opset=TARGET_OPSET,
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         probe_rows,
         encoder,
         model_onnx,
         basename="SklearnOneHotEncoderStringDropFirst",
     )
コード例 #12
0
    def test_onnxrt_tokenizer_word_stop(self):
        # Runs an OnnxTokenizer configured with three separators, no
        # marks and the stopword 'd', and checks the token matrix.
        texts = numpy.array(['abc ef zoo', 'abc,d', 'ab/e'])
        # Rows in the expected matrix are filled with '#'; 'd' does not
        # appear in it.
        expected = numpy.array([
            ['abc', 'ef', 'zoo'],
            ['abc', '#', '#'],
            ['ab', 'e', '#'],
        ])

        tokenizer = OnnxTokenizer(
            'text',
            op_version=TARGET_OPSET,
            output_names=['out'],
            separators=[' ', ',', '/'],
            mark=0,
            stopwords=['d'],
        )
        onx = tokenizer.to_onnx(
            inputs=[('text', StringTensorType())],
            outputs=[('out', StringTensorType())],
        )
        outputs = OnnxInference(onx).run({'text': texts})
        self.assertEqual(outputs['out'].tolist(), expected.tolist())
コード例 #13
0
    def test_onnxrt_tokenizer_word_regex_mark_findall(self):
        # Runs an OnnxTokenizer driven by the expression '[a-c]+' with
        # mark=1 and tokenexpsplit=0, and checks the token matrix.
        texts = numpy.array(['abc ef zoo', 'abc,d', 'ab/e'])
        # Every expected row is one matched token between two '#' cells.
        expected = numpy.array([
            ['#', 'abc', '#'],
            ['#', 'abc', '#'],
            ['#', 'ab', '#'],
        ])

        tokenizer = OnnxTokenizer(
            'text',
            op_version=TARGET_OPSET,
            output_names=['out'],
            mark=1,
            tokenexp='[a-c]+',
            tokenexpsplit=0,
        )
        onx = tokenizer.to_onnx(
            inputs=[('text', StringTensorType())],
            outputs=[('out', StringTensorType())],
        )
        outputs = OnnxInference(onx).run({'text': texts})
        self.assertEqual(outputs['out'].tolist(), expected.tolist())
コード例 #14
0
 def test_model_tfidf_vectorizer11parenthesis_class(self):
     # Fits a unigram TfidfVectorizer on a corpus containing a
     # parenthesised word, converts it with a "sep" list registered for
     # the TfidfVectorizer class, and hands the corpus to
     # dump_data_and_model.
     documents = numpy.array([
         "This is the first document.",
         "This document is the second document.",
         "And this is the third one.",
         "Is this the (first) document?",
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(documents.ravel())
     separators = [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
     model_onnx = convert_sklearn(
         vect,
         "TfidfVectorizer",
         [("input", StringTensorType([1, 1]))],
         options={TfidfVectorizer: {"sep": separators}},
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         documents,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11ParenthesisClass-OneOff-SklCol",
         allow_failure=(
             "StrictVersion(onnxruntime.__version__) <= "
             "StrictVersion('0.2.0') or "
             "StrictVersion(onnx.__version__) <= StrictVersion('1.3')"
         ),
     )
コード例 #15
0
 def test_model_tfidf_vectorizer11parenthesis_class(self):
     # Fits a unigram TfidfVectorizer on a corpus containing a
     # parenthesised word and converts it with an escaped "separators"
     # list and 'tokenexp' set to None, then hands the corpus to
     # dump_data_and_model.
     documents = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the (first) document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(documents.ravel())
     # Punctuation entries are escaped with a backslash or a character
     # class.
     separators = [' ', '[.]', '\\?', ',', ';', ':', '\\!', '\\(', '\\)']
     options = {
         TfidfVectorizer: {'separators': separators, 'tokenexp': None},
     }
     model_onnx = convert_sklearn(
         vect,
         'TfidfVectorizer',
         [('input', StringTensorType([1]))],
         options=options,
         target_opset=TARGET_OPSET,
     )
     self.assertTrue(model_onnx is not None)
     # This test depends on this issue:
     # https://github.com/Microsoft/onnxruntime/issues/957.
     dump_data_and_model(
         documents,
         vect,
         model_onnx,
         basename=(
             "SklearnTfidfVectorizer11ParenthesisClassRegex-OneOff-SklCol"
         ),
     )
コード例 #16
0
 def test_model_tfidf_vectorizer11_out_vocabulary(self):
     # Fits a unigram, un-normalised TfidfVectorizer, converts it with
     # the options returned by self.get_options(), then hands
     # dump_data_and_model the same sentences prefixed with tokens that
     # were not in the fitting corpus.
     sentences = [
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the first document?',
     ]
     fit_corpus = numpy.array(sentences).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(fit_corpus.ravel())
     model_onnx = convert_sklearn(
         vect,
         'TfidfVectorizer',
         [('input', StringTensorType([1]))],
         options=self.get_options(),
         target_opset=TARGET_OPSET,
     )
     self.assertTrue(model_onnx is not None)
     # Each evaluation document is a fitted sentence behind two unseen
     # tokens.
     prefixes = ['AZZ ZZ ', 'BZZ ZZ ', 'ZZZ ZZ ', 'WZZ ZZ ']
     unseen_corpus = numpy.array(
         [prefix + text for prefix, text in zip(prefixes, sentences)]
     ).reshape((4, 1))
     dump_data_and_model(
         unseen_corpus,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11OutVocabRegex-OneOff-SklCol",
     )
コード例 #17
0
 def test_model_tfidf_vectorizer11_custom_vocabulary(self):
     # Fits a TfidfVectorizer restricted to a fixed five-word vocabulary,
     # asserts that the fitted object has no `stop_words_` attribute,
     # converts it to ONNX and hands the corpus to dump_data_and_model.
     documents = numpy.array([
         "This is the first document.",
         "This document is the second document.",
         "And this is the third one.",
         "Is this the first document?",
     ]).reshape((4, 1))
     vocabulary = ["first", "second", "third", "document", "this"]
     vect = TfidfVectorizer(
         ngram_range=(1, 1), norm=None, vocabulary=vocabulary)
     vect.fit(documents.ravel())
     self.assertFalse(hasattr(vect, "stop_words_"))
     model_onnx = convert_sklearn(
         vect,
         "TfidfVectorizer",
         [("input", StringTensorType())],
         options=self.get_options(),
         target_opset=TARGET_OPSET,
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         documents,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11CustomVocab-OneOff-SklCol",
         allow_failure=(
             "StrictVersion(onnxruntime.__version__) <= "
             "StrictVersion('0.4.0')"
         ),
     )
コード例 #18
0
 def test_one_hot_encoder_mixed_string_int_drop(self):
     # Fits a OneHotEncoder with one explicit `drop` value per column on
     # rows made of two string columns and one integer column, converts
     # it with one ONNX input per dtype, and hands a single row to
     # dump_data_and_model.
     train_rows = [
         ["c0.4", "c0.2", 3],
         ["c1.4", "c1.2", 0],
     ] + [["c0.2", "c2.2", 1] for _ in range(4)]
     probe_rows = [["c0.2", "c2.2", 1]]
     encoder = OneHotEncoder(categories="auto", drop=['c0.4', 'c0.2', 3])
     encoder.fit(train_rows)
     # The two string columns and the integer column are declared as
     # separate inputs.
     input_types = [
         ("input1", StringTensorType([None, 2])),
         ("input2", Int64TensorType([None, 1])),
     ]
     model_onnx = convert_sklearn(
         encoder,
         "one-hot encoder",
         input_types,
         target_opset=TARGET_OPSET,
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         probe_rows,
         encoder,
         model_onnx,
         verbose=False,
         basename="SklearnOneHotEncoderMixedStringIntDrop",
     )
コード例 #19
0
 def test_model_tfidf_vectorizer11parenthesis_class(self):
     # Fits a unigram TfidfVectorizer on a corpus containing a
     # parenthesised word and converts it with a "sep" list and 'regex'
     # set to None, then hands the corpus to dump_data_and_model.
     documents = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the (first) document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(documents.ravel())
     separators = [' ', '.', '?', ',', ';', ':', '!', '(', ')']
     options = {TfidfVectorizer: {'sep': separators, 'regex': None}}
     model_onnx = convert_sklearn(
         vect,
         'TfidfVectorizer',
         [('input', StringTensorType([1, 1]))],
         options=options,
     )
     self.assertTrue(model_onnx is not None)
     # This test depends on this issue:
     # https://github.com/Microsoft/onnxruntime/issues/957.
     dump_data_and_model(
         documents,
         vect,
         model_onnx,
         basename=(
             "SklearnTfidfVectorizer11ParenthesisClassRegex-OneOff-SklCol"
         ),
         allow_failure=(
             "StrictVersion(onnxruntime.__version__)"
             " <= StrictVersion('0.4.0')"
         ),
     )
コード例 #20
0
 def test_one_hot_encoder_mixed_string_int(self):
     # Fits a OneHotEncoder (categories="auto") on rows made of two
     # string columns and one integer column, converts it with one ONNX
     # input per dtype, and hands the training rows themselves to
     # dump_data_and_model.
     rows = [
         ["c0.4", "c0.2", 3],
         ["c1.4", "c1.2", 0],
     ] + [["c0.2", "c2.2", 1] for _ in range(4)]
     encoder = OneHotEncoder(categories="auto")
     encoder.fit(rows)
     # The two string columns and the integer column are declared as
     # separate inputs.
     input_types = [
         ("input1", StringTensorType([None, 2])),
         ("input2", Int64TensorType([None, 1])),
     ]
     model_onnx = convert_sklearn(
         encoder, "one-hot encoder mixed-type inputs", input_types)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         rows,
         encoder,
         model_onnx,
         basename="SklearnOneHotEncoderStringInt64",
         verbose=False,
     )
 def test_model_tfidf_transform_bug(self):
     # Fits a CountVectorizer + TfidfTransformer pipeline on four
     # 20 newsgroups categories after prepending a non-ASCII word to the
     # first document, converts it to ONNX and hands documents 5..9 to
     # dump_data_and_model.
     twenty_train = fetch_20newsgroups(
         subset="train",
         categories=[
             "alt.atheism",
             "soc.religion.christian",
             "comp.graphics",
             "sci.med",
         ],
         shuffle=True,
         random_state=0,
     )
     steps = [("vect", CountVectorizer()), ("tfidf", TfidfTransformer())]
     text_clf = Pipeline(steps)
     # Put an accented token into the training data.
     twenty_train.data[0] = "bruît " + twenty_train.data[0]
     text_clf.fit(twenty_train.data, twenty_train.target)
     model_onnx = convert_sklearn(
         text_clf,
         name="DocClassifierCV-Tfidf",
         initial_types=[("input", StringTensorType([5]))],
     )
     sample = twenty_train.data[5:10]
     dump_data_and_model(
         sample,
         text_clf,
         model_onnx,
         basename="SklearnPipelineTfidfTransformer",
         # Operator mul is not implemented in onnxruntime
         allow_failure=(
             "StrictVersion(onnx.__version__) <= StrictVersion('1.5')"
         ),
     )
コード例 #22
0
 def test_pipeline_tfidf(self):
     # Fits a TfidfVectorizer (min_df=30) on the first column of the
     # first 300 extracted newsgroup rows, converts it with a "sep" list
     # and hands the first five rows to dump_data_and_model.
     newsgroups = fetch_20newsgroups(
         random_state=1,
         subset="train",
         categories=["alt.atheism", "talk.religion.misc"],
     )
     subject_body = SubjectBodyExtractor().fit_transform(newsgroups.data)
     tfi = TfidfVectorizer(min_df=30)
     # Keep only the first column of the first 300 rows (still 2-D).
     tdata = subject_body[:300, :1]
     tfi.fit(tdata.ravel())
     separators = [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
     model_onnx = convert_sklearn(
         tfi,
         "tfidf",
         initial_types=[("input", StringTensorType([1, 1]))],
         options={TfidfVectorizer: {"sep": separators}},
     )
     dump_data_and_model(
         tdata[:5],
         tfi,
         model_onnx,
         basename="SklearnDocumentationTfIdf-OneOff-SklCol",
         allow_failure=(
             "StrictVersion(onnxruntime.__version__) <= "
             "StrictVersion('0.4.0')"
         ),
     )
コード例 #23
0
 def test_one_hot_encoder_one_string_one_int_cat(self):
     # Fits a OneHotEncoder ignoring unknown categories on one string
     # and one integer column, converts it to ONNX, and hands a row of
     # unseen values to dump_data_and_model.
     train_rows = [['Male', 1], ['Female', 3], ['Female', 2]]
     unseen_rows = [['Unknown', 4]]
     # Choose the constructor argument from the installed signature:
     # `categorical_features` where it exists (the original note says it
     # is removed in scikit-learn 0.22), `categories` otherwise.
     parameters = inspect.signature(OneHotEncoder).parameters
     if "categorical_features" in parameters:
         # scikit-learn < 0.21
         version_kwargs = {"categorical_features": 'all'}
     elif "categories" in parameters:
         # scikit-learn >= 0.22
         version_kwargs = {"categories": 'auto'}
     else:
         raise AssertionError("scikit-learn's API has changed.")
     encoder = OneHotEncoder(handle_unknown='ignore', **version_kwargs)
     encoder.fit(train_rows)
     input_types = [
         ("input1", StringTensorType([None, 1])),
         ("input2", Int64TensorType([None, 1])),
     ]
     model_onnx = convert_sklearn(
         encoder,
         "one-hot encoder one string and int categories",
         input_types,
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         unseen_rows,
         encoder,
         model_onnx,
         basename="SklearnOneHotEncoderOneStringOneIntCat",
     )
    def test_model_tfidf_vectorizer11_nolowercase(self):
        # Fits a case-sensitive (lowercase=False) unigram TfidfVectorizer,
        # converts it to ONNX, hands the corpus to dump_data_and_model,
        # then runs the model directly and checks the output shape.
        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "And this is the third one.",
            "Is this the first document?",
        ]).reshape((4, 1))
        vect = TfidfVectorizer(ngram_range=(1, 1), norm=None, lowercase=False)
        vect.fit(corpus.ravel())
        model_onnx = convert_sklearn(vect,
                                     "TfidfVectorizer",
                                     [("input", StringTensorType())],
                                     options=self.get_options(),
                                     target_opset=TARGET_OPSET)
        self.assertIsNotNone(model_onnx)
        dump_data_and_model(
            corpus,
            vect,
            model_onnx,
            basename="SklearnTfidfVectorizer11NoL-OneOff-SklCol",
            allow_failure="StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.4.0')",
        )

        sess = InferenceSession(model_onnx.SerializeToString())
        res = sess.run(None, {'input': corpus.ravel()})[0]
        # Four documents in, 11 columns out. A unittest assertion is used
        # so the check survives `python -O` and reports both values on
        # failure.
        self.assertEqual(res.shape, (4, 11))
    def test_model_tfidf_vectorizer11_empty_string_case1(self):
        # Fits a unigram TfidfVectorizer on three sentences, converts it
        # to ONNX, and hands dump_data_and_model the last sentence
        # together with a document made of a single space.
        documents = numpy.array([
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            ' ',
        ]).reshape((4, 1))
        vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
        # The blank document (last row) is excluded from fitting.
        vect.fit(documents[:3].ravel())
        model_onnx = convert_sklearn(
            vect,
            'TfidfVectorizer',
            [('input', StringTensorType([1]))],
            options=self.get_options(),
            target_opset=TARGET_OPSET,
        )
        self.assertTrue(model_onnx is not None)

        # TfidfVectorizer in onnxruntime fails with empty strings,
        # which was fixed in version 0.3.0 afterward
        dump_data_and_model(
            documents[2:],
            vect,
            model_onnx,
            basename=(
                "SklearnTfidfVectorizer11EmptyStringSepCase1-OneOff-SklCol"
            ),
            allow_failure=(
                "StrictVersion(onnxruntime.__version__) <= "
                "StrictVersion('0.4.0')"
            ))
 def test_model_tfidf_vectorizer11_opset(self):
     # Converts the same fitted TfidfVectorizer for every opset from 8 up
     # to TARGET_OPSET. Opsets the converter rejects with an
     # "only works for opset" RuntimeError are skipped; from opset 10
     # onwards the model is also handed to dump_data_and_model.
     corpus = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the first document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(corpus.ravel())
     for opset in range(8, TARGET_OPSET + 1):
         try:
             model_onnx = convert_sklearn(
                 vect,
                 'TfidfVectorizer', [('input', StringTensorType([1]))],
                 options=self.get_options(),
                 target_opset=opset)
         except RuntimeError as e:
             if "only works for opset" in str(e):
                 continue
             # Any other RuntimeError is a real failure: re-raise it with
             # its original traceback.
             raise
         self.assertIsNotNone(model_onnx)
         if opset >= 10:
             name = "SklearnTfidfVectorizer11Rx%d-OneOff-SklCol" % opset
             dump_data_and_model(
                 corpus,
                 vect,
                 model_onnx,
                 basename=name,
                 allow_failure="StrictVersion(onnxruntime.__version__) <= "
                 "StrictVersion('0.4.0')")
コード例 #27
0
 def test_pipeline_tfidf(self):
     # Fits a TfidfVectorizer (min_df=30) on the first column of the
     # first 300 extracted newsgroup rows, converts it with an escaped
     # "separators" list and hands the first five rows to
     # dump_data_and_model. Returns early with a warning when the
     # dataset download raises URLError.
     try:
         newsgroups = fetch_20newsgroups(
             random_state=1,
             subset="test",
             categories=["alt.atheism", "talk.religion.misc"],
         )
     except urllib.error.URLError:
         warnings.warn("Unit test may fail due to connectivity issue.")
         return
     subject_body = SubjectBodyExtractor().fit_transform(newsgroups.data)
     tfi = TfidfVectorizer(min_df=30)
     # Keep only the first column of the first 300 rows (still 2-D).
     tdata = subject_body[:300, :1]
     tfi.fit(tdata.ravel())
     # Punctuation entries are escaped with a backslash or a character
     # class.
     separators = [" ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"]
     model_onnx = convert_sklearn(
         tfi,
         "tfidf",
         initial_types=[("input", StringTensorType([1]))],
         options={TfidfVectorizer: {"separators": separators}},
         target_opset=TARGET_OPSET,
     )
     dump_data_and_model(
         tdata[:5],
         tfi,
         model_onnx,
         basename="SklearnDocumentationTfIdf-OneOff-SklCol",
         allow_failure=(
             "StrictVersion(onnxruntime.__version__) <= "
             "StrictVersion('0.4.0')"
         ),
     )
コード例 #28
0
    def test_pipeline_tfidf_svc(self):
        # Fits a TfidfVectorizer + linear SVC pipeline on five sentences
        # and converts it twice, once with a 2-D input declaration and
        # once with a 1-D one, checking labels and probabilities against
        # scikit-learn each time.
        pipe = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf_svc', SVC(probability=True, kernel='linear')),
        ])
        data = numpy.array([
            "first sentance",
            "second sentence",
            "many sentances",
            "dummy sentance",
            "no sentance at all",
        ])
        y = numpy.array([0, 0, 1, 0, 1])
        pipe.fit(data, y)
        expected_label = pipe.predict(data)
        expected_proba = pipe.predict_proba(data)
        df = pandas.DataFrame(data)
        df.columns = ['text']

        # (declared input shape, array fed to the session):
        # first conversion if shape=[None, 1], fed with a column vector;
        # second conversion with shape=[None], fed with the flat array.
        cases = [
            ([None, 1], data.reshape((-1, 1))),
            ([None], data),
        ]
        for shape, feed in cases:
            model_onnx = convert_sklearn(
                pipe,
                initial_types=[('text', StringTensorType(shape))],
                target_opset=TARGET_OPSET,
                options={id(pipe): {'zipmap': False}},
            )
            sess = InferenceSession(model_onnx.SerializeToString())
            got = sess.run(None, {'text': feed})
            assert_almost_equal(expected_proba, got[1])
            assert_almost_equal(expected_label, got[0])
            # sess.run(None, {'text': df}) --> failures
            # sess.run(None, {'text': df["text"]}) --> failures

        # `sess` is now the session built for the 1-D declaration; the
        # underlying numpy array of the DataFrame column is accepted.
        sess.run(None, {'text': df["text"].values})  # success
コード例 #29
0
 def test_model_dict_vectorizer_issue(self):
     # A DictVectorizer fitted on integer-keyed dictionaries with string
     # values is expected to make convert_sklearn raise RuntimeError.
     samples = [{1: 'A', 2: 'B'}, {1: 'C', 3: 'D'}, {1: 'C', 3: 'A'}]
     model = DictVectorizer(sparse=False)
     model.fit(samples)
     with self.assertRaises(RuntimeError):
         convert_sklearn(
             model,
             'dv',
             [(
                 "input",
                 DictionaryType(Int64TensorType([1]), StringTensorType([1])),
             )],
         )
コード例 #30
0
def _problem_for_tfidf_vectorizer(dtype=numpy.float32, n_features=None):
    """
    Builds a problem for the :epkg:`sklearn:feature_extraction:text:TfidfVectorizer`.

    The texts are the first item of every entry of ``text_alpha_num`` and
    the targets the second item, cast to *dtype*. *n_features* is accepted
    but not used here.

    Returns the tuple ``(X, y, initial_types, 'transform', 0, X)`` where
    *initial_types* declares a single string input named ``"X"`` of shape
    ``[None]``.
    """
    texts = numpy.array([entry[0] for entry in text_alpha_num])
    targets = numpy.array([entry[1] for entry in text_alpha_num],
                          dtype=dtype)
    initial_types = [("X", StringTensorType([None]))]
    return (texts, targets, initial_types, 'transform', 0, texts)