Exemple #1
0
    def test_pipeline(self):
        # Compares the scikit-learn / onnxruntime prediction gap of two
        # pipelines trained on the same truncated regression data:
        # a plain scaler + tree, and a variant with explicit casts that is
        # converted with the StandardScaler option 'div': 'div_cast'.
        # The second gap must not exceed the first and must be zero.
        def maxdiff(a1, a2):
            # Largest absolute element-wise gap between the flattened arrays.
            return numpy.abs(a1.ravel() - a2.ravel()).max()

        def truncate_columns(matrix):
            # In place: multiply column k by pi and 2**k, then drop the
            # fractional part through an int64 round trip.
            for k in range(matrix.shape[1]):
                matrix[:, k] = (matrix[:, k] * math.pi * 2**k).astype(
                    numpy.int64)

        X, y = make_regression(10000, 10, random_state=3)
        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=3)
        Xi_train = X_train.copy()
        yi_train = y_train.copy()
        Xi_test = X_test.copy()
        truncate_columns(Xi_train)
        truncate_columns(Xi_test)
        Xi_test = Xi_test.astype(numpy.float32)
        max_depth = 10

        # model 1: scaler followed by the tree, no explicit cast
        plain_steps = [
            ('scaler', StandardScaler()),
            ('dt', DecisionTreeRegressor(max_depth=max_depth)),
        ]
        model1 = Pipeline(plain_steps)
        model1.fit(Xi_train, yi_train)
        exp1 = model1.predict(Xi_test)
        onx1 = to_onnx(
            model1, X_train[:1].astype(numpy.float32),
            target_opset=TARGET_OPSET)
        sess1 = InferenceSession(onx1.SerializeToString())
        got1 = sess1.run(None, {'X': Xi_test})[0]
        md1 = maxdiff(exp1, got1)

        # model 2: cast to float64, scale, cast again, then a tree wrapped
        # in CastRegressor with dtype float32
        cast_steps = [
            ('cast64', CastTransformer(dtype=numpy.float64)),
            ('scaler', StandardScaler()),
            ('cast', CastTransformer()),
            ('dt', CastRegressor(
                DecisionTreeRegressor(max_depth=max_depth),
                dtype=numpy.float32)),
        ]
        model2 = Pipeline(cast_steps)
        model2.fit(Xi_train, yi_train)
        exp2 = model2.predict(Xi_test)
        scaler_options = {StandardScaler: {'div': 'div_cast'}}
        onx = to_onnx(
            model2, X_train[:1].astype(numpy.float32),
            options=scaler_options, target_opset=TARGET_OPSET)
        sess2 = InferenceSession(onx.SerializeToString())
        got2 = sess2.run(None, {'X': Xi_test})[0]
        md2 = maxdiff(exp2, got2)

        assert md2 <= md1
        assert md2 <= 0.0
Exemple #2
0
    def test_onnx_no_test_data_double(self):
        # Converts a gradient boosting classifier trained on random double
        # data to ONNX-ML with a DoubleTensorType input, then checks that
        # hummingbird can convert that ONNX model without any test data.
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2

        classifier = GradientBoostingClassifier(n_estimators=10,
                                                max_depth=max_depth)
        if CastTransformer is not None:
            # newer version of sklearn-onnx: cast to float32 first
            model = make_pipeline(CastTransformer(dtype=np.float32),
                                  classifier)
        else:
            model = classifier

        np.random.seed(0)
        X = np.random.rand(100, 200)
        y = np.random.randint(num_classes, size=100)
        model.fit(X, y)

        # Create ONNX-ML model
        input_spec = [("input", DoubleTensorType([None, X.shape[1]]))]
        onnx_ml_model = convert_sklearn(
            model, initial_types=input_spec, target_opset=11)

        # Test onnx requires no test_data
        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx")
        assert hb_model
 def common_test_cast_transformer(self, dtype, input_type):
     # Shared check: a pipeline casting to `dtype` and back to float32 is
     # fitted, the first step's output dtype is verified, and the ONNX
     # conversion is dumped through dump_data_and_model.
     # NOTE(review): `input_type` only feeds the dump basename, through
     # `input_type.__class__.__name__`; the ONNX input is always declared
     # as FloatTensorType. If callers pass a class rather than an instance
     # the suffix is the metaclass name -- confirm this is intended.
     samples = numpy.array(
         [[0.1, 0.2, 3.1], [1, 1, 0], [0, 2, 1], [1, 0, 2]],
         dtype=numpy.float32)
     cast_steps = [
         ('cast', CastTransformer(dtype=dtype)),
         ('invcast', CastTransformer(dtype=numpy.float32)),
     ]
     model = Pipeline(cast_steps)
     model.fit(samples)

     first_cast = model.steps[0][1]
     pred = first_cast.transform(samples)
     assert pred.dtype == dtype

     input_spec = [("input", FloatTensorType([None, 3]))]
     model_onnx = convert_sklearn(
         model, "cast", input_spec, target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)

     suffix = input_type.__class__.__name__
     dump_data_and_model(
         samples, model, model_onnx,
         basename="SklearnCastTransformer" + suffix)
 def test_cast_transformer_dataframe(self):
     # A ColumnTransformer applies CastTransformer to columns [0, 1] and
     # [2] of a DataFrame, followed by a float32 cast; the fitted pipeline
     # is converted to ONNX and dumped with the raw array values.
     column_casts = ColumnTransformer([
         ('prep1', CastTransformer(), [0, 1]),
         ('prep2', CastTransformer(), [2]),
     ])
     model = Pipeline([
         ('prep', column_casts),
         ('invcast', CastTransformer(dtype=numpy.float32)),
     ])

     raw = numpy.array(
         [[0.1, 0.2, 3.4], [1, 1, 0], [0, 2, 1], [1, 0, 2]],
         dtype=numpy.float32)
     frame = DataFrame(raw)
     model.fit(frame)

     input_spec = [("input", FloatTensorType([None, 3]))]
     model_onnx = convert_sklearn(
         model, "cast", input_spec, target_opset=TARGET_OPSET)
     self.assertTrue(model_onnx is not None)

     dump_data_and_model(
         frame.values, model, model_onnx,
         basename="SklearnCastTransformerCT")
#
# We could try to use double everywhere. Unfortunately,
# :epkg:`ONNX ML Operators` only allows float coefficients
# for the operator *TreeEnsembleRegressor*. We may want
# to compromise by casting the output of the normalizer into
# float in the :epkg:`scikit-learn` pipeline.
#
# .. blockdiag::
#
#    diagram {
#      x_float32 -> normalizer -> y_double ->
#      cast -> y_float -> dtree -> z_float
#    }
#

# Pipeline with an explicit cast between the scaler and the tree:
# StandardScaler -> CastTransformer (default arguments) -> decision tree.
# `max_depth`, `Xi_train` and `yi_train` are defined outside this excerpt.
model2 = Pipeline([('scaler', StandardScaler()), ('cast', CastTransformer()),
                   ('dt', DecisionTreeRegressor(max_depth=max_depth))])

model2.fit(Xi_train, yi_train)

##########################################
# The discrepancies.

# Convert the fitted pipeline; a single training row cast to float32 is
# passed as the sample input.
onx2 = to_onnx(model2, Xi_train[:1].astype(numpy.float32))

# Run the serialized model with onnxruntime on the CPU provider.
sess2 = InferenceSession(onx2.SerializeToString(),
                         providers=['CPUExecutionProvider'])

# Predictions from scikit-learn and from onnxruntime on the same input.
# `X32` is defined outside this excerpt (presumably the float32 test
# set -- TODO confirm).
skl2 = model2.predict(X32)
ort2 = sess2.run(None, {'X': X32})[0]
    def test_pandas_batch_onnxml(self):
        # Trains a gradient boosting classifier on the first three iris
        # features held in a DataFrame, converts it to ONNX-ML with one
        # double input per column, batch-converts that model with
        # hummingbird and compares predicted probabilities.
        import pandas

        max_depth = 10
        batch_size = 10
        columns = ["vA", "vB", "vC"]

        iris = datasets.load_iris()
        X, y = iris.data[:, :3], iris.target
        X_train = pandas.DataFrame(X, columns=columns)

        steps = [
            ("preprocessor",
             ColumnTransformer(transformers=[], remainder="passthrough")),
        ]
        if CastTransformer is not None:
            # newer version of sklearn-onnx
            steps.append(('cast', CastTransformer(dtype=np.float32)))
        steps.append(
            ("classifier",
             GradientBoostingClassifier(n_estimators=10,
                                        max_depth=max_depth)))
        pipeline = Pipeline(steps=steps)
        pipeline.fit(X_train, y)

        # Create ONNX-ML model
        n_rows = X.shape[0]
        input_spec = [
            (name, DoubleTensorType([n_rows, 1])) for name in columns
        ]
        onnx_ml_model = convert_sklearn(
            pipeline, initial_types=input_spec, target_opset=9)

        remainder_size = n_rows % batch_size
        first_batch = pandas.DataFrame(X[:batch_size], columns=columns)
        hb_model = hummingbird.ml.convert_batch(
            onnx_ml_model, "onnx", first_batch, remainder_size)

        self.assertTrue(hb_model is not None)

        expected = pipeline.predict_proba(X_train)
        actual = hb_model.predict_proba(X_train)
        np.testing.assert_allclose(expected, actual, rtol=1e-06, atol=1e-06)
# ++++++++++++
#
# Fixing the conversion requires replacing ``x * (1 / y)``
# by ``x / y``, and this division must happen in double.
# By default, *sklearn-onnx* assumes every
# computation should happen in float. `ONNX 1.7 specifications
# <https://github.com/onnx/onnx/blob/master/docs/
# Operators-ml.md#ai.onnx.ml.Scaler>`_
# do not support double scaling (inputs and outputs do,
# but not the parameters). The solution needs to
# change the conversion (remove node Scaler by using option
# `'div'`) and to use double by inserting an explicit
# Cast.

# Pipeline that casts to float64 before scaling, then applies
# CastTransformer with default arguments before the decision tree.
# `max_depth`, `Xi_train`, `yi_train`, `Xi_test`, `X_train` and `maxdiff`
# are defined outside this excerpt.
model2 = Pipeline([
    ('cast64', CastTransformer(dtype=np.float64)),
    ('scaler', StandardScaler()),
    ('cast', CastTransformer()),
    ('dt', DecisionTreeRegressor(max_depth=max_depth))
])

model2.fit(Xi_train, yi_train)
# Reference predictions from scikit-learn.
exp2 = model2.predict(Xi_test)

# Convert with the StandardScaler option 'div' set to 'div_cast';
# a single row cast to float32 is passed as the sample input.
onx2 = to_onnx(model2, X_train[:1].astype(np.float32),
               options={StandardScaler: {'div': 'div_cast'}},
               target_opset=15)

# Run the converted model with onnxruntime and measure the gap with
# the scikit-learn predictions.
sess2 = InferenceSession(onx2.SerializeToString())
got2 = sess2.run(None, {'X': Xi_test})[0]
md2 = maxdiff(exp2, got2)
Exemple #8
0
def make_pipelines(df_train,
                   y_train,
                   models=None,
                   sparse_threshold=1.,
                   replace_nan=False,
                   insert_replace=False,
                   verbose=False):
    """Fit one pipeline per model class and compare it with its ONNX version.

    Each pipeline scales columns ``[0, 1]``, vectorizes column ``"text"``
    with CountVectorizer + TfidfTransformer (optionally followed by
    ReplaceTransformer), applies CastTransformer and ends with the
    classifier. The fitted pipeline is converted with ``to_onnx``, written
    to ``model.onnx`` in the current directory (overwritten at every
    iteration) and run with ``OnnxInference`` on *df_train*.

    The data used for the comparison is *df_train*; earlier revisions read
    an undeclared module-level ``df`` instead.

    :param df_train: training data; indexed with ``[[0, 1]]`` and
        ``[["text"]]``, so it must expose those columns
    :param y_train: training labels passed to ``pipe.fit``
    :param models: classifier classes to try; ``None`` entries are
        skipped. Defaults to RandomForestClassifier,
        HistGradientBoostingClassifier, XGBClassifier, LGBMClassifier
    :param sparse_threshold: forwarded to ColumnTransformer
    :param replace_nan: if True, adds the conversion option
        ``{'nan': True}`` for TfidfTransformer
    :param insert_replace: if True, appends a ReplaceTransformer step
        after the tf-idf step
    :param verbose: if True, runs the ONNX model a second time with a
        fresh OnnxInference, stores that result under ``'discrepency2'``
        and prints details for rows whose discrepancy exceeds 0.1
    :return: list of dictionaries, one per model. On a ``TypeError``
        during fit the keys are ``model``, ``pipe``, ``error``; otherwise
        ``model``, ``discrepencies``, ``model_onnx``, ``pipe`` (plus
        ``discrepency2`` when *verbose* is True)
    """

    def td(a):
        # Densify a sparse row for display, writing NaN at every column
        # position not listed in ``a.indices``.
        if hasattr(a, 'todense'):
            b = a.todense()
            ind = set(a.indices)
            for i in range(b.shape[1]):
                if i not in ind:
                    b[0, i] = numpy.nan
            return b
        return a

    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier
        ]
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):

        if model is HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model is XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # The two pipeline variants differ only by the optional
        # ReplaceTransformer step at the end of the text branch.
        text_steps = [
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(('repl', ReplaceTransformer()))
        pipe = Pipeline([
            ('union',
             ColumnTransformer([
                 ('scale1', StandardScaler(), [0, 1]),
                 ('subject', Pipeline(text_steps), "text"),
             ], sparse_threshold=sparse_threshold)),
            ('cast', CastTransformer()),
            ('cls', model(max_depth=3, **kwargs)),
        ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # Record the failure and move on to the next model.
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(pipe,
                                 initial_types=[
                                     ('input', FloatTensorType([None, 2])),
                                     ('text', StringTensorType([None, 1]))
                                 ],
                                 target_opset={
                                     '': 14,
                                     'ai.onnx.ml': 2
                                 },
                                 options=options)

        with open('model.onnx', 'wb') as f:
            f.write(model_onnx.SerializeToString())

        oinf = OnnxInference(model_onnx)
        inputs = {
            "input": df_train[[0, 1]].values.astype(numpy.float32),
            "text": df_train[["text"]].values
        }
        pred_onx = oinf.run(inputs)

        # scikit-learn probabilities, computed once and reused below.
        skl_proba = pipe.predict_proba(df_train)
        diff = numpy.abs(pred_onx['probabilities'].ravel() -
                         skl_proba.ravel()).sum()

        if verbose:
            # Second run with a fresh runtime instance.
            oinf = OnnxInference(model_onnx)
            pred_onx2 = oinf.run(inputs)
            diff2 = numpy.abs(pred_onx2['probabilities'].ravel() -
                              skl_proba.ravel()).sum()

        # Row details are only printed in verbose mode, so the scan is
        # skipped entirely otherwise.
        if verbose and diff > 0.1:
            for i, (l1, l2) in enumerate(
                    zip(skl_proba, pred_onx['probabilities'])):
                d = numpy.abs(l1 - l2).sum()
                if d > 0.1:
                    print("\nDISCREPENCY DETAILS")
                    print(d, i, l1, l2)
                    pre = pipe.steps[0][-1].transform(df_train)
                    print("idf", pre[i].dtype, td(pre[i]))
                    pre2 = pipe.steps[1][-1].transform(pre)
                    print("cas", pre2[i].dtype, td(pre2[i]))
                    inter = oinf.run(inputs, intermediate=True)
                    onx = inter['tfidftr_norm']
                    print("onx", onx.dtype, onx[i])
                    # Looked up but not displayed; kept so a missing
                    # intermediate result still raises as before.
                    onx = inter['variable3']

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx,
                   pipe=pipe)
        if verbose:
            obs['discrepency2'] = diff2
        pipes.append(obs)

    return pipes
def make_pipelines(df_train, y_train, models=None,
                   sparse_threshold=1., replace_nan=False,
                   insert_replace=False):
    """Fit one pipeline per model class and compare it with its ONNX version.

    Each pipeline scales columns ``[0, 1]``, vectorizes column ``"text"``
    with CountVectorizer + TfidfTransformer (optionally followed by
    ReplaceTransformer), applies CastTransformer and ends with the
    classifier. The fitted pipeline is converted with ``to_onnx``, written
    to ``model.onnx`` in the current directory (overwritten at every
    iteration) and run with an onnxruntime ``InferenceSession`` on
    *df_train*.

    The data used for the comparison is *df_train*; earlier revisions read
    an undeclared module-level ``df`` instead.

    :param df_train: training data; indexed with ``[[0, 1]]`` and
        ``[["text"]]``, so it must expose those columns
    :param y_train: training labels passed to ``pipe.fit``
    :param models: classifier classes to try; ``None`` entries are
        skipped. Defaults to RandomForestClassifier,
        HistGradientBoostingClassifier, XGBClassifier, LGBMClassifier
    :param sparse_threshold: forwarded to ColumnTransformer
    :param replace_nan: if True, adds the conversion option
        ``{'nan': True}`` for TfidfTransformer
    :param insert_replace: if True, appends a ReplaceTransformer step
        after the tf-idf step
    :return: list of dictionaries, one per model. On a ``TypeError``
        during fit the keys are ``model``, ``pipe``, ``error``; otherwise
        ``model``, ``discrepencies``, ``model_onnx``, ``pipe``
    """
    if models is None:
        models = [
            RandomForestClassifier, HistGradientBoostingClassifier,
            XGBClassifier, LGBMClassifier]
    models = [m for m in models if m is not None]

    pipes = []
    for model in tqdm(models):

        if model is HistGradientBoostingClassifier:
            kwargs = dict(max_iter=5)
        elif model is XGBClassifier:
            kwargs = dict(n_estimators=5, use_label_encoder=False)
        else:
            kwargs = dict(n_estimators=5)

        # The two pipeline variants differ only by the optional
        # ReplaceTransformer step at the end of the text branch.
        text_steps = [
            ('count', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
        ]
        if insert_replace:
            text_steps.append(('repl', ReplaceTransformer()))
        pipe = Pipeline([
            ('union', ColumnTransformer([
                ('scale1', StandardScaler(), [0, 1]),
                ('subject', Pipeline(text_steps), "text"),
            ], sparse_threshold=sparse_threshold)),
            ('cast', CastTransformer()),
            ('cls', model(max_depth=3, **kwargs)),
        ])

        try:
            pipe.fit(df_train, y_train)
        except TypeError as e:
            # Record the failure and move on to the next model.
            obs = dict(model=model.__name__, pipe=pipe, error=e)
            pipes.append(obs)
            continue

        options = {model: {'zipmap': False}}
        if replace_nan:
            options[TfidfTransformer] = {'nan': True}

        # convert
        with warnings.catch_warnings(record=False):
            warnings.simplefilter("ignore", (FutureWarning, UserWarning))
            model_onnx = to_onnx(
                pipe,
                initial_types=[('input', FloatTensorType([None, 2])),
                               ('text', StringTensorType([None, 1]))],
                target_opset=12, options=options)

        with open('model.onnx', 'wb') as f:
            f.write(model_onnx.SerializeToString())

        sess = rt.InferenceSession(model_onnx.SerializeToString())
        inputs = {"input": df_train[[0, 1]].values.astype(numpy.float32),
                  "text": df_train[["text"]].values}
        pred_onx = sess.run(None, inputs)

        # The second ONNX output is compared with predict_proba.
        diff = numpy.abs(
            pred_onx[1].ravel() -
            pipe.predict_proba(df_train).ravel()).sum()

        obs = dict(model=model.__name__,
                   discrepencies=diff,
                   model_onnx=model_onnx, pipe=pipe)
        pipes.append(obs)

    return pipes