def test_model_count_vectorizer_wrong_ngram(self):
        corpus = numpy.array([
            'A AABBB0',
            'AAABB B1',
            'AA ABBB2',
            'AAAB BB3',
            'AAA BBB4',
        ]).reshape((5, 1))
        vect = TfidfVectorizer(ngram_range=(1, 2),
                               token_pattern=r"(?u)\b\w\w+\b")
        vect.fit(corpus.ravel())

        model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                     [('input', StringTensorType([1]))])

        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            corpus,
            vect,
            model_onnx,
            basename="SklearnTfidfVectorizer12Wngram-OneOff-SklCol",
            allow_failure="StrictVersion(onnxruntime.__version__) <= "
            "StrictVersion('0.3.0')")
    def test_model_tfidf_vectorizer11_empty_string_case1(self):
        corpus = numpy.array([
                'This is the first document.',
                'This document is the second document.',
                'And this is the third one.',
                ' ',
                ]).reshape((4, 1))
        vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
        vect.fit(corpus[:3].ravel())
        model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                     [('input', StringTensorType([1]))],
                                     options=self.get_options(),
                                     target_opset=TARGET_OPSET)
        self.assertTrue(model_onnx is not None)

        # TfidfVectorizer in onnxruntime fails on empty strings,
        # an issue fixed after version 0.4.0 (hence allow_failure below).
        dump_data_and_model(
            corpus[2:], vect, model_onnx,
            basename="SklearnTfidfVectorizer11EmptyStringSepCase1-"
                     "OneOff-SklCol",
            allow_failure="StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')")
 def test_one_hot_encoder_mixed_string_int_drop(self):
     data = [
         ["c0.4", "c0.2", 3],
         ["c1.4", "c1.2", 0],
         ["c0.2", "c2.2", 1],
         ["c0.2", "c2.2", 1],
         ["c0.2", "c2.2", 1],
         ["c0.2", "c2.2", 1],
     ]
     test = [["c0.2", "c2.2", 1]]
     model = OneHotEncoder(categories="auto", drop=['c0.4', 'c0.2', 3])
     model.fit(data)
     inputs = [
         ("input1", StringTensorType([None, 2])),
         ("input2", Int64TensorType([None, 1])),
     ]
     model_onnx = convert_sklearn(model, "one-hot encoder", inputs)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(test,
                         model,
                         model_onnx,
                         verbose=False,
                         basename="SklearnOneHotEncoderMixedStringIntDrop")
Example #4
 def test_model_tfidf_vectorizer11_compose(self):
     corpus = numpy.array([
         "This is the first document.",
         "This document is the second document.",
         "And this is the third one.",
         "Is this the first document?",
     ]).reshape((4, 1))
     corpus = numpy.hstack([corpus, corpus])
     y = numpy.array([0, 1, 0, 1])
     model = ColumnTransformer([
         ('a', TfidfVectorizer(), 0),
         ('b', TfidfVectorizer(), 1),
     ])
     model.fit(corpus, y)
     model_onnx = convert_sklearn(model,
                                  "TfIdfcomp",
                                  [("input", StringTensorType([4, 2]))],
                                  options=self.get_options(),
                                  target_opset=TARGET_OPSET)
     sess = InferenceSession(model_onnx.SerializeToString())
     res = sess.run(None, {'input': corpus})[0]
     exp = model.transform(corpus)
     assert_almost_equal(res, exp)
 def test_model_dict_vectorizer(self):
     model = DictVectorizer()
     data = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}]
     model.fit_transform(data)
     model_onnx = convert_sklearn(
         model,
         "dictionary vectorizer",
         [(
             "input",
             DictionaryType(StringTensorType([1]), FloatTensorType([1])),
         )],
     )
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         data,
         model,
         model_onnx,
         basename="SklearnDictVectorizer-OneOff-SkipDim1",
         allow_failure="StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.1.3') or "
         "StrictVersion(onnx.__version__)"
         " < StrictVersion('1.3.0')",
     )
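
     # Hedged sketch (not part of the original test): feature_names_ lists
     # the learned keys in column order; transform maps dicts onto them.
     print(model.feature_names_)            # e.g. ['amy', 'chin', 'nice']
     print(model.transform(data).toarray())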
 def test_model_tfidf_vectorizer11_empty_string_case2(self):
     corpus = numpy.array([
         "This is the first document.",
         "This document is the second document.",
         "And this is the third one.",
         "",
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(corpus.ravel())
     model_onnx = convert_sklearn(vect, "TfidfVectorizer",
                                  [("input", StringTensorType([1]))],
                                  options=self.get_options(),
                                  target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     # onnxruntime fails with empty strings
     dump_data_and_model(
         corpus,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11EmptyString-OneOff-SklCol",
         allow_failure="StrictVersion(onnxruntime.__version__)"
                       " <= StrictVersion('0.4.0')",
     )
Example #7
 def test_model_tfidf_vectorizer_binary(self):
     corpus = numpy.array([
         "This is the first document.",
         "This document is the second document.",
         "And this is the third one.",
         "Is this the first document?",
     ]).reshape((4, 1))
     vect = TfidfVectorizer(binary=True)
     vect.fit(corpus.ravel())
     model_onnx = convert_sklearn(vect,
                                  "TfidfVectorizer",
                                  [("input", StringTensorType([1]))],
                                  options=self.get_options(),
                                  target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         corpus,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizerBinary-OneOff-SklCol",
         allow_failure="StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.4.0')",
     )
Example #8
        def custom_parser(scope, model, inputs, custom_parsers=None):
            if custom_parsers is not None and model in custom_parsers:
                return custom_parsers[model](scope,
                                             model,
                                             inputs,
                                             custom_parsers=custom_parsers)
            if all(
                    isinstance(i, (numbers.Real, bool, np.bool_))
                    for i in model.classes_):
                label_type = Int64TensorType()
            else:
                label_type = StringTensorType()
            output_label = scope.declare_local_variable(
                'output_label', label_type)

            this_operator = scope.declare_local_operator(
                'LgbmClassifier', model)
            this_operator.inputs = inputs
            probability_map_variable = scope.declare_local_variable(
                'output_probability',
                SequenceType(DictionaryType(label_type, scope.tensor_type())))
            this_operator.outputs.append(output_label)
            this_operator.outputs.append(probability_map_variable)
            return this_operator.outputs
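
        # Hedged sketch (assumption, not shown in this snippet): a parser
        # like this is usually attached when registering the converter; the
        # lightgbm shape calculator and converter imported below are the
        # customary choices and are assumed available in this environment.
        from lightgbm import LGBMClassifier
        from skl2onnx import update_registered_converter
        from skl2onnx.common.shape_calculator import (
            calculate_linear_classifier_output_shapes)
        from onnxmltools.convert.lightgbm.operator_converters.LightGbm import (
            convert_lightgbm)
        update_registered_converter(
            LGBMClassifier, 'LgbmClassifier',
            calculate_linear_classifier_output_shapes, convert_lightgbm,
            parser=custom_parser)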
    def test_model_tfidf_vectorizer11_nolowercase(self):
        corpus = numpy.array([
            "This is the first document.",
            "This document is the second document.",
            "And this is the third one.",
            "Is this the first document?",
        ]).reshape((4, 1))
        vect = TfidfVectorizer(ngram_range=(1, 1), norm=None, lowercase=False)
        vect.fit(corpus.ravel())
        model_onnx = convert_sklearn(vect,
                                     "TfidfVectorizer",
                                     [("input", StringTensorType())],
                                     options=self.get_options(),
                                     target_opset=TARGET_OPSET)
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            corpus,
            vect,
            model_onnx,
            basename="SklearnTfidfVectorizer11NoL-OneOff-SklCol")

        sess = InferenceSession(model_onnx.SerializeToString())
        res = sess.run(None, {'input': corpus.ravel()})[0]
        assert res.shape == (4, 11)
 def test_model_tfidf_vectorizer11_out_vocabulary(self):
     corpus = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the first document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(corpus.ravel())
     model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                  [('input', StringTensorType([1]))],
                                  options=self.get_options())
     self.assertTrue(model_onnx is not None)
     corpus = numpy.array([
         'AZZ ZZ This is the first document.',
         'BZZ ZZ This document is the second document.',
         'ZZZ ZZ And this is the third one.',
         'WZZ ZZ Is this the first document?',
     ]).reshape((4, 1))
     dump_data_and_model(
         corpus, vect, model_onnx,
         basename="SklearnTfidfVectorizer11OutVocabRegex-OneOff-SklCol",
         allow_failure="StrictVersion(onnxruntime.__version__) <= "
                       "StrictVersion('0.4.0')")
Example #11
 def test_model_tfidf_vectorizer11_word4(self):
     corpus = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the first document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1),
                            norm=None,
                            token_pattern="[a-zA-Z]{1,4}")
     vect.fit(corpus.ravel())
     model_onnx = convert_sklearn(vect,
                                  'TfidfVectorizer',
                                  [('input', StringTensorType([1]))],
                                  options=self.get_options(),
                                  target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         corpus,
         vect,
         model_onnx,
         basename="SklearnTfidfVectorizer11Regex4-OneOff-SklCol",
         allow_failure="StrictVersion(onnxruntime.__version__) <= "
         "StrictVersion('0.4.0')")
 def test_model_tfidf_transform_bug(self):
     categories = [
         "alt.atheism",
         "soc.religion.christian",
         "comp.graphics",
         "sci.med",
     ]
     twenty_train = fetch_20newsgroups(subset="train",
                                       categories=categories,
                                       shuffle=True,
                                       random_state=0)
     text_clf = Pipeline([("vect", CountVectorizer()),
                          ("tfidf", TfidfTransformer())])
     twenty_train.data[0] = "bruît " + twenty_train.data[0]
     text_clf.fit(twenty_train.data, twenty_train.target)
     model_onnx = convert_sklearn(text_clf,
                                  name="DocClassifierCV-Tfidf",
                                  initial_types=[("input",
                                                  StringTensorType([5]))],
                                  target_opset=TARGET_OPSET)
     dump_data_and_model(twenty_train.data[5:10],
                         text_clf,
                         model_onnx,
                         basename="SklearnPipelineTfidfTransformer")
Example #13
 def test_model_tfidf_vectorizer11_opset(self):
     corpus = numpy.array([
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the first document?',
     ]).reshape((4, 1))
     vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
     vect.fit(corpus.ravel())
     for opset in range(8, TARGET_OPSET + 1):
         try:
             model_onnx = convert_sklearn(
                 vect,
                 'TfidfVectorizer', [('input', StringTensorType([1]))],
                 options=self.get_options(),
                 target_opset=opset)
         except RuntimeError as e:
             if "only works for opset" in str(e):
                 continue
             raise e
         self.assertTrue(model_onnx is not None)
         if opset >= 10:
             name = "SklearnTfidfVectorizer11Rx%d-OneOff-SklCol" % opset
             dump_data_and_model(corpus, vect, model_onnx, basename=name)
    def test_model_tfidf_vectorizer11_short_word_spaces(self):
        corpus = numpy.array([
            'This is  the  first document.',
            'This document is the second  document.',
        ]).reshape((2, 1))
        vect = TfidfVectorizer(ngram_range=(1, 1),
                               norm=None,
                               analyzer='word',
                               token_pattern=".{1,3}")
        vect.fit(corpus.ravel())
        model_onnx = convert_sklearn(vect,
                                     'TfidfVectorizer',
                                     [('input', StringTensorType([1]))],
                                     target_opset=TARGET_OPSET)
        self.assertTrue(model_onnx is not None)

        dump_data_and_model(
            corpus,
            vect,
            model_onnx,
            basename="SklearnTfidfVectorizer11CharW2-OneOff-SklCol",
            allow_failure="StrictVersion(onnxruntime.__version__) <= "
            "StrictVersion('0.3.0')",
            verbose=False)
Example #15
    def common_test_model_tfidf_vectorizer_pipeline_cls(
            self, kind=None, verbose=False):
        if kind == 'stop':
            if StrictVersion(ort_version) >= StrictVersion('1.4.0'):
                # regression with stopwords in onnxruntime 1.4+
                stopwords = ['theh']
            else:
                stopwords = ['the', 'and', 'is']
        else:
            stopwords = None
        X_train = numpy.array([
            "This is the first document",
            "This document is the second document.",
            "And this is the third one",
            "Is this the first document?",
        ]).reshape((4, 1))
        y_train = numpy.array([0, 1, 0, 1])

        if kind is None:
            model_pipeline = Pipeline([
                ('vectorizer',
                 TfidfVectorizer(stop_words=stopwords,
                                 lowercase=True,
                                 use_idf=True,
                                 ngram_range=(1, 3),
                                 max_features=30000)),
            ])
        elif kind == 'cls':
            model_pipeline = Pipeline([('vectorizer',
                                        TfidfVectorizer(stop_words=stopwords,
                                                        lowercase=True,
                                                        use_idf=True,
                                                        ngram_range=(1, 3),
                                                        max_features=30000)),
                                       ('feature_selector', SelectKBest(k=10)),
                                       ('classifier',
                                        SVC(class_weight='balanced',
                                            kernel='rbf',
                                            gamma='scale',
                                            probability=True))])
        elif kind == 'stop':
            model_pipeline = Pipeline([
                ('vectorizer',
                 CountVectorizer(stop_words=stopwords,
                                 lowercase=True,
                                 ngram_range=(1, 2),
                                 max_features=30000)),
            ])
        elif kind == 'reg':
            model_pipeline = Pipeline([('vectorizer',
                                        TfidfVectorizer(stop_words=stopwords,
                                                        lowercase=True,
                                                        use_idf=True,
                                                        ngram_range=(1, 3),
                                                        max_features=30000)),
                                       ('feature_selector', SelectKBest(k=10)),
                                       ('classifier',
                                        SVR(kernel='rbf', gamma='scale'))])
        else:
            raise AssertionError(kind)

        model_pipeline.fit(X_train.ravel(), y_train)
        initial_type = [('input', StringTensorType([None, 1]))]
        model_onnx = convert_sklearn(model_pipeline,
                                     "cv",
                                     initial_types=initial_type,
                                     options={SVC: {
                                         'zipmap': False
                                     }},
                                     target_opset=TARGET_OPSET)

        if kind in (None, 'stop'):
            exp = [model_pipeline.transform(X_train.ravel()).toarray()]
        elif kind == 'cls':
            exp = [
                model_pipeline.predict(X_train.ravel()),
                model_pipeline.predict_proba(X_train.ravel())
            ]
        elif kind == 'reg':
            exp = [model_pipeline.predict(X_train.ravel()).reshape((-1, 1))]

        sess = InferenceSession(model_onnx.SerializeToString())
        got = sess.run(None, {'input': X_train})
        if verbose:
            voc = model_pipeline.steps[0][-1].vocabulary_
            voc = list(sorted([(v, k) for k, v in voc.items()]))
            for kv in voc:
                print(kv)
        for a, b in zip(exp, got):
            if verbose:
                print(stopwords)
                print(a)
                print(b)
            assert_almost_equal(a, b)
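
    # Hedged sketch: hypothetical wrappers showing how the common test
    # above would be invoked per kind; these method names are assumptions,
    # not part of the original file.
    def test_tfidf_vectorizer_pipeline_transform(self):
        self.common_test_model_tfidf_vectorizer_pipeline_cls()

    def test_tfidf_vectorizer_pipeline_classifier(self):
        self.common_test_model_tfidf_vectorizer_pipeline_cls(kind='cls')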
Example #16
    def test_pipeline_column_transformer(self):

        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1"
                                              if x > 0.5 else "cat2")
        X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3"
                                               if x > 0.5 else "cat4")
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="lbfgs",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])

        categorical_transformer = Pipeline(steps=[
            (
                "onehot",
                OneHotEncoder(sparse=True, handle_unknown="ignore"),
            ),
            (
                "tsvd",
                TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
            ),
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)
        initial_type = [
            ("numfeat", FloatTensorType([None, 3])),
            ("strfeat", StringTensorType([None, 2])),
        ]

        X_train = X_train[:11]
        model_onnx = convert_sklearn(model, initial_types=initial_type)

        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipeliner",
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.3') or "
                          "StrictVersion(onnxruntime.__version__)"
                          " <= StrictVersion('0.4.0')",
        )

        if __name__ == "__main__":
            from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer

            pydot_graph = GetPydotGraph(
                model_onnx.graph,
                name=model_onnx.graph.name,
                rankdir="TP",
                node_producer=GetOpNodeProducer("docstring"),
            )
            pydot_graph.write_dot("graph.dot")

            import os

            os.system("dot -O -Gdpi=300 -Tpng graph.dot")
Example #17
# The default one used by *scikit-learn* uses regular expressions
# and is currently being implemented. The current implementation
# only considers a list of separators, which is defined
# in the variable *seps*.

seps = {
    TfidfVectorizer: {
        "sep": [
            ' ', '.', '\\?', ',', ';', ':', '!', '\\(', '\\)', '\n', '"', "'",
            "-", "\\[", "\\]", "@"
        ]
    }
}
model_onnx = convert_sklearn(pipeline,
                             "tfidf",
                             initial_types=[("input", StringTensorType([1,
                                                                        2]))],
                             options=seps)

#################################
# And save.
with open("pipeline_tfidf.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

##########################
# Predictions with onnxruntime.

sess = rt.InferenceSession("pipeline_tfidf.onnx")
print('---', train_data[0])
inputs = {'input': train_data[:1]}
pred_onx = sess.run(None, inputs)
print("predict", pred_onx[0])
 def test_pipeline_tfidf_pipeline_minmax(self):
     categories = ["alt.atheism", "talk.religion.misc"]
     try:
         train = fetch_20newsgroups(random_state=1,
                                    subset="train",
                                    categories=categories)
     except urllib.error.URLError:
         warnings.warn("Unit test may fail due to connectivity issue.")
         return
     train_data = SubjectBodyExtractor().fit_transform(train.data)
     pipeline = Pipeline([(
         "union",
         ColumnTransformer(
             [
                 ("subject", TfidfVectorizer(min_df=50), 0),
                 ("body", TfidfVectorizer(min_df=40), 1),
             ],
             transformer_weights={"subject": 0.8},
         ),
     )])
     pipeline.fit(train_data[:300])
     extra = {
         TfidfVectorizer: {
             "separators": [
                 " ",
                 "[.]",
                 "\\?",
                 ",",
                 ";",
                 ":",
                 "\\!",
                 "\\(",
                 "\\)",
                 "\n",
                 '"',
                 "'",
                 "-",
                 "\\[",
                 "\\]",
                 "@",
             ]
         }
     }
     model_onnx = convert_sklearn(pipeline,
                                  "tfidf",
                                  initial_types=[
                                      ("input", StringTensorType([None, 2]))
                                  ],
                                  options=extra,
                                  target_opset=TARGET_OPSET)
     test_data = np.array([
         ["Albert Einstein", "Not relatively."],
         ["Alan turing", "Not automatically."],
     ])
     dump_data_and_model(
         test_data,
         pipeline,
         model_onnx,
         verbose=False,
         basename="SklearnDocumentationTfIdfUnion1",
         allow_failure="StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.4.0')",
     )
Example #19
def make_pipelines(df_train,
                   y_train,
                   models=None,
                   sparse_threshold=1.,
                   replace_nan=False,
                   insert_replace=False,
                   verbose=False):

    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier
        ]
    models = [_ for _ in models if _ is not None]

    pipes = []
    for model in tqdm(models):

        if model == HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model == XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        if insert_replace:
            pipe = Pipeline([
                ('union',
                 ColumnTransformer([
                     ('scale1', StandardScaler(), [0, 1]),
                     ('subject',
                      Pipeline([
                          ('count', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('repl', ReplaceTransformer()),
                      ]), "text"),
                 ],
                                   sparse_threshold=sparse_threshold)),
                ('cast', CastTransformer()),
                ('cls', model(max_depth=3, **kwargs)),
            ])
        else:
            pipe = Pipeline([
                ('union',
                 ColumnTransformer([
                     ('scale1', StandardScaler(), [0, 1]),
                     ('subject',
                      Pipeline([('count', CountVectorizer()),
                                ('tfidf', TfidfTransformer())]), "text"),
                 ],
                                   sparse_threshold=sparse_threshold)),
                ('cast', CastTransformer()),
                ('cls', model(max_depth=3, **kwargs)),
            ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(pipe,
                                 initial_types=[
                                     ('input', FloatTensorType([None, 2])),
                                     ('text', StringTensorType([None, 1]))
                                 ],
                                 target_opset={
                                     '': 14,
                                     'ai.onnx.ml': 2
                                 },
                                 options=options)

        with open('model.onnx', 'wb') as f:
            f.write(model_onnx.SerializeToString())

        oinf = OnnxInference(model_onnx)
        inputs = {
            "input": df[[0, 1]].values.astype(numpy.float32),
            "text": df[["text"]].values
        }
        pred_onx = oinf.run(inputs)

        diff = numpy.abs(pred_onx['probabilities'].ravel() -
                         pipe.predict_proba(df).ravel()).sum()

        if verbose:

            def td(a):
                if hasattr(a, 'todense'):
                    b = a.todense()
                    ind = set(a.indices)
                    for i in range(b.shape[1]):
                        if i not in ind:
                            b[0, i] = numpy.nan
                    return b
                return a

            oinf = OnnxInference(model_onnx)
            pred_onx2 = oinf.run(inputs)
            diff2 = numpy.abs(pred_onx2['probabilities'].ravel() -
                              pipe.predict_proba(df).ravel()).sum()

        if diff > 0.1:
            for i, (l1, l2) in enumerate(
                    zip(pipe.predict_proba(df), pred_onx['probabilities'])):
                d = numpy.abs(l1 - l2).sum()
                if verbose and d > 0.1:
                    print("\nDISCREPENCY DETAILS")
                    print(d, i, l1, l2)
                    pre = pipe.steps[0][-1].transform(df)
                    print("idf", pre[i].dtype, td(pre[i]))
                    pre2 = pipe.steps[1][-1].transform(pre)
                    print("cas", pre2[i].dtype, td(pre2[i]))
                    inter = oinf.run(inputs, intermediate=True)
                    onx = inter['tfidftr_norm']
                    print("onx", onx.dtype, onx[i])
                    onx = inter['variable3']

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx,
                   pipe=pipe)
        if verbose:
            obs['discrepency2'] = diff2
        pipes.append(obs)

    return pipes
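
# Hedged usage sketch (not in the original): make_pipelines scores against a
# global `df`, so this toy frame deliberately uses that name; columns 0 and 1
# are numeric features, "text" holds documents, and every value is made up.
# pandas and RandomForestClassifier are assumed imported as elsewhere here.
import pandas
df = pandas.DataFrame({0: [0.1, 0.2, 0.3, 0.4],
                       1: [1.0, 0.0, 1.0, 0.0],
                       "text": ["aa bb", "bb cc", "aa cc", "cc dd"]})
y = numpy.array([0, 1, 0, 1])
pipes = make_pipelines(df, y, models=[RandomForestClassifier])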
Example #20
# and is currently being implemented. The current implementation
# only considers a list of separators, which is defined
# in the variable *seps*.

seps = {
    TfidfVectorizer: {
        "separators": [
            ' ', '.', '\\?', ',', ';', ':', '!', '\\(', '\\)', '\n', '"', "'",
            "-", "\\[", "\\]", "@"
        ]
    }
}
model_onnx = convert_sklearn(pipeline,
                             "tfidf",
                             initial_types=[("input",
                                             StringTensorType([None, 2]))],
                             options=seps,
                             target_opset=12)

#################################
# And save.
with open("pipeline_tfidf.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

##########################
# Predictions with onnxruntime.

sess = rt.InferenceSession("pipeline_tfidf.onnx")
print('---', train_data[0])
inputs = {'input': train_data[:1]}
pred_onx = sess.run(None, inputs)
    def test_pipeline_column_transformer(self):

        iris = load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1"
                                              if x > 0.5 else "cat2")
        X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3"
                                               if x > 0.5 else "cat4")
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(C=0.01,
                                        class_weight=dict(
                                            zip([False, True], [0.2, 0.8])),
                                        n_jobs=1,
                                        max_iter=10,
                                        solver="lbfgs",
                                        tol=1e-3)

        numeric_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(
                strategy="median")), ("scaler", StandardScaler())])

        categorical_transformer = Pipeline(steps=[(
            "onehot", OneHotEncoder(sparse=True, handle_unknown="ignore")
        ), ("tsvd",
            TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4))])

        preprocessor = ColumnTransformer(
            transformers=[("num", numeric_transformer, numeric_features),
                          ("cat", categorical_transformer,
                           categorical_features)])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)
        names = list(enumerate_model_names(model, short=False))
        simple = [_[0] for _ in names]
        assert len(set(simple)) == len(simple)
        names = list(enumerate_model_names(model))
        simple2 = [_[0] for _ in names]
        assert len(simple2) == len(simple)
        exp = [
            '', 'precprocessor', 'precprocessor__num',
            'precprocessor__num__imputer', 'precprocessor__num__scaler',
            'precprocessor__cat', 'precprocessor__cat__onehot',
            'precprocessor__cat__onehot__categories___0',
            'precprocessor__cat__onehot__categories___1',
            'precprocessor__cat__tsvd', 'classifier'
        ]
        self.assertEqual(simple2[:len(exp) - 2], exp[:-2])

        initial_type = [("numfeat", FloatTensorType([None, 3])),
                        ("strfeat", StringTensorType([None, 2]))]
        model_onnx = convert_sklearn(model,
                                     initial_types=initial_type,
                                     target_opset=TARGET_OPSET)
        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipelinerOptions1")

        options = {'classifier': {'zipmap': False}}
        new_options = _process_options(model, options)
        assert len(new_options) == 2

        model_onnx = convert_sklearn(model,
                                     initial_types=initial_type,
                                     options={'classifier': {
                                         'zipmap': False
                                     }},
                                     target_opset=TARGET_OPSET)
        assert 'zipmap' not in str(model_onnx).lower()
        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipelinerOptions2")

        options = {'classifier__zipmap': False}
        new_options = _process_options(model, options)
        assert len(new_options) == 2

        model_onnx = convert_sklearn(model,
                                     initial_types=initial_type,
                                     options=options,
                                     target_opset=TARGET_OPSET)
        assert 'zipmap' not in str(model_onnx).lower()
        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipelinerOptions2")

        options = {id(model): {'zipmap': False}}
        new_options = _process_pipeline_options(model, options)

        model_onnx = convert_sklearn(model,
                                     initial_types=initial_type,
                                     options={id(model): {
                                                  'zipmap': False
                                              }},
                                     target_opset=TARGET_OPSET)
        assert 'zipmap' not in str(model_onnx).lower()
        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipelinerOptions2")
def make_pipelines(df_train, y_train, models=None,
                   sparse_threshold=1., replace_nan=False,
                   insert_replace=False):

    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier]
    models = [_ for _ in models if _ is not None]

    pipes = []
    for model in tqdm(models):

        if model == HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model == XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        if insert_replace:
            pipe = Pipeline([
                ('union', ColumnTransformer([
                    ('scale1', StandardScaler(), [0, 1]),
                    ('subject',
                     Pipeline([
                         ('count', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('repl', ReplaceTransformer()),
                     ]), "text"),
                ], sparse_threshold=sparse_threshold)),
                ('cast', CastTransformer()),
                ('cls', model(max_depth=3, **kwargs)),
            ])
        else:
            pipe = Pipeline([
                ('union', ColumnTransformer([
                    ('scale1', StandardScaler(), [0, 1]),
                    ('subject',
                     Pipeline([
                         ('count', CountVectorizer()),
                         ('tfidf', TfidfTransformer())
                     ]), "text"),
                ], sparse_threshold=sparse_threshold)),
                ('cast', CastTransformer()),
                ('cls', model(max_depth=3, **kwargs)),
            ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[('input', FloatTensorType([None, 2])),
                               ('text', StringTensorType([None, 1]))],
                target_opset=12, options=options)

        with open('model.onnx', 'wb') as f:
            f.write(model_onnx.SerializeToString())

        sess = rt.InferenceSession(model_onnx.SerializeToString())
        inputs = {"input": df[[0, 1]].values.astype(numpy.float32),
                  "text": df[["text"]].values}
        pred_onx = sess.run(None, inputs)

        diff = numpy.abs(
            pred_onx[1].ravel() -
            pipe.predict_proba(df).ravel()).sum()

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx, pipe=pipe)
        pipes.append(obs)

    return pipes

corpus = numpy.array([
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    ' ',
]).reshape((4, 1))
vect = TfidfVectorizer(ngram_range=(1, 2), norm=None)
vect.fit(corpus.ravel())
pred = vect.transform(corpus.ravel())

###########################
# Convert a model into ONNX
# +++++++++++++++++++++++++

from skl2onnx import convert_sklearn  # noqa
from skl2onnx.common.data_types import StringTensorType  # noqa

model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                             [('input', StringTensorType([1, 1]))])

with open("TfidfVectorizer.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

###########################
# Visualize
# +++++++++

from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer  # noqa
pydot_graph = GetPydotGraph(model_onnx.graph,
                            name=model_onnx.graph.name,
                            rankdir="TB",
                            node_producer=GetOpNodeProducer("docstring",
                                                            color="yellow",
                                                            fillcolor="yellow",
Example #24
# only considers a list of separators, which is defined
# in the variable *seps*.


seps = {
    TfidfVectorizer: {
        "separators": [
            ' ', '.', '\\?', ',', ';', ':', '!',
            '\\(', '\\)', '\n', '"', "'",
            "-", "\\[", "\\]", "@"
        ]
    }
}
model_onnx = convert_sklearn(pipeline, "tfidf",
                             initial_types=[
                                 ("input", StringTensorType([None, 2]))],
                             options=seps)

#################################
# And save.
with open("pipeline_tfidf.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

##########################
# Predictions with onnxruntime.

sess = rt.InferenceSession("pipeline_tfidf.onnx")
print('---', train_data[0])
inputs = {'input': train_data[:1]}
pred_onx = sess.run(None, inputs)
print("predict", pred_onx[0])