コード例 #1
0
class TestSparkMLVectorAssembler(unittest.TestCase):
    # Test VectorAssembler
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()), reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Spark-ML test requires torch >= 1.6.0")
    def test_vectorassembler_converter(self):
        iris = load_iris()
        features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

        pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]], columns=features + ["target"])[
            ["sepal_length", "sepal_width", "petal_length", "petal_width"]
        ]
        df = sql.createDataFrame(pd_df)

        model = VectorAssembler(inputCols=features, outputCol="features")

        test_df = df
        torch_model = convert(model, "torch", test_df)
        self.assertTrue(torch_model is not None)

        spark_output = model.transform(test_df).toPandas()
        spark_output["features"] = spark_output["features"].map(lambda x: np.array(x.toArray()))
        spark_output_np = spark_output["features"].to_numpy()
        torch_output_np = torch_model.transform(pd_df)

        np.testing.assert_allclose(np.vstack(spark_output_np), torch_output_np, rtol=1e-06, atol=1e-06)
コード例 #2
0
class TestSparkMLDiscretizers(unittest.TestCase):
    # Test QuantileDiscretizer
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_quantilediscretizer_converter(self):
        iris = load_iris()
        features = [
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]

        pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                             columns=features + ["target"])
        df = sql.createDataFrame(pd_df).select("sepal_length")

        quantile = QuantileDiscretizer(inputCol="sepal_length",
                                       outputCol="sepal_length_bucket",
                                       numBuckets=2)
        model = quantile.fit(df)

        test_df = df
        torch_model = convert(model, "torch", test_df)
        self.assertTrue(torch_model is not None)

        spark_output = model.transform(test_df).select(
            "sepal_length_bucket").toPandas()
        torch_output_np = torch_model.transform(pd_df[["sepal_length"]])
        np.testing.assert_allclose(spark_output.to_numpy(),
                                   torch_output_np,
                                   rtol=1e-06,
                                   atol=1e-06)
コード例 #3
0
class TestSparkMLLinear(unittest.TestCase):
    def _test_linear(self, classes, model_class):
        n_features = 10
        n_total = 100
        np.random.seed(0)
        warnings.filterwarnings("ignore")
        X = np.random.rand(n_total, n_features)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(classes, size=(n_total, 1))

        arr = np.concatenate([y, X], axis=1).reshape(n_total, -1)
        df = map(lambda x: (int(x[0]), Vectors.dense(x[1:])), arr)
        df = sql.createDataFrame(df, schema=["label", "features"])

        model = model_class()
        model = model.fit(df)

        test_df = df.select("features").limit(10)
        torch_model = convert(model, "torch", test_df)
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(
            np.array(
                model.transform(df).select("probability").collect()).reshape(
                    -1, classes),
            torch_model.predict_proba(X),
            rtol=1e-06,
            atol=1e-06,
        )

    # pyspark.ml.LogisticRegression with two classes
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    def test_logistic_regression_binary(self):
        self._test_linear(2, model_class=LogisticRegression)

    # pyspark.ml.LogisticRegression with multi_class
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    def test_logistic_regression_multi_class(self):
        self._test_linear(5, model_class=LogisticRegression)
コード例 #4
0
class TestExtraConf(unittest.TestCase):
    # Test default number of threads. It will only work on mac after 1.6 https://github.com/pytorch/pytorch/issues/43036
    @unittest.skipIf(
        sys.platform == "darwin" and LooseVersion(torch.__version__) <= LooseVersion("1.6.0"),
        reason="PyTorch has a bug on mac related to multi-threading",
    )
    def test_torch_deafault_n_threads(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        hb_model = hummingbird.ml.convert(model, "torch")

        self.assertIsNotNone(hb_model)
        self.assertTrue(torch.get_num_threads() == psutil.cpu_count(logical=False))
        self.assertTrue(torch.get_num_interop_threads() == 1)

    # Test one thread in pytorch.
    @unittest.skipIf(
        sys.platform == "darwin" and LooseVersion(torch.__version__) > LooseVersion("1.6.0"),
        reason="Setting threading multi times will break on mac",
    )
    def test_torch_one_thread(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        hb_model = hummingbird.ml.convert(model, "torch", extra_config={constants.N_THREADS: 1})

        self.assertIsNotNone(hb_model)
        self.assertTrue(torch.get_num_threads() == 1)
        self.assertTrue(torch.get_num_interop_threads() == 1)

    # Test default number of threads onnx.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_deafault_n_threads(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model, initial_types=[("input", FloatTensorType([X.shape[0], X.shape[1]]))], target_opset=9
        )

        hb_model = hummingbird.ml.convert(onnx_ml_model, "onnx", X)

        self.assertIsNotNone(hb_model)
        self.assertTrue(hb_model._session.get_session_options().intra_op_num_threads == psutil.cpu_count(logical=False))
        self.assertTrue(hb_model._session.get_session_options().inter_op_num_threads == 1)

    # Test one thread onnx.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_one_thread(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        hb_model = hummingbird.ml.convert(model, "onnx", X, extra_config={constants.N_THREADS: 1})

        self.assertIsNotNone(hb_model)
        self.assertTrue(hb_model._session.get_session_options().intra_op_num_threads == 1)
        self.assertTrue(hb_model._session.get_session_options().inter_op_num_threads == 1)

    # Test pytorch regressor with batching.
    def test_torch_regression_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingRegressor(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test pytorch classifier with batching.
    def test_torch_classification_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.predict_proba(X), hb_model.predict_proba(X), rtol=1e-06, atol=1e-06)

    # Test pytorch classifier with batching.
    def test_torch_iforest_batch(self):
        warnings.filterwarnings("ignore")
        num_classes = 2
        model = IsolationForest(n_estimators=10, max_samples=2)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.decision_function(X), hb_model.decision_function(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.score_samples(X), hb_model.score_samples(X), rtol=1e-06, atol=1e-06)

    # Test pytorch regressor with batching and uneven rows.
    def test_torch_batch_regression_uneven(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingRegressor(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=105)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test pytorch classification with batching and uneven rows.
    def test_torch_batch_classification_uneven(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=105)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test pytorch transform with batching and uneven rows.
    def test_torch_batch_transform(self):
        warnings.filterwarnings("ignore")
        model = StandardScaler(with_mean=True, with_std=True)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)

        model.fit(X)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.transform(X), hb_model.transform(X), rtol=1e-06, atol=1e-06)

    # Test torchscript regression with batching.
    def test_torchscript_regression_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingRegressor(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(103, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=103)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch.jit", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test torchscript classification with batching.
    def test_torchscript_classification_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(103, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=103)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch.jit", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.predict_proba(X), hb_model.predict_proba(X), rtol=1e-06, atol=1e-06)

    # Test torchscript iforest with batching.
    def test_torchscript_iforest_batch(self):
        warnings.filterwarnings("ignore")
        num_classes = 2
        model = IsolationForest(n_estimators=10, max_samples=2)
        np.random.seed(0)
        X = np.random.rand(103, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=103)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch.jit", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.decision_function(X), hb_model.decision_function(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.score_samples(X), hb_model.score_samples(X), rtol=1e-06, atol=1e-06)

    # Test torchscript transform with batching and uneven rows.
    def test_torchscript_batch_transform(self):
        warnings.filterwarnings("ignore")
        model = StandardScaler(with_mean=True, with_std=True)
        np.random.seed(0)
        X = np.random.rand(101, 200)
        X = np.array(X, dtype=np.float32)

        model.fit(X)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "torch.jit", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.transform(X), hb_model.transform(X), rtol=1e-06, atol=1e-06)

    # Test onnx transform with batching and uneven rows.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_batch_transform(self):
        warnings.filterwarnings("ignore")
        model = StandardScaler(with_mean=True, with_std=True)
        np.random.seed(0)
        X = np.random.rand(101, 200)
        X = np.array(X, dtype=np.float32)

        model.fit(X)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "onnx", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.transform(X), hb_model.transform(X), rtol=1e-06, atol=1e-06)

    # Test onnx regression with batching.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_regression_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingRegressor(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(103, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=103)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "onnx", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test onnx classification with batching.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_classification_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(103, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=103)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "onnx", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.predict_proba(X), hb_model.predict_proba(X), rtol=1e-06, atol=1e-06)

    # Test onnx iforest with batching.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_onnx_iforest_batch(self):
        warnings.filterwarnings("ignore")
        num_classes = 2
        model = IsolationForest(n_estimators=10, max_samples=2)
        np.random.seed(0)
        X = np.random.rand(103, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=103)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "onnx", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.decision_function(X), hb_model.decision_function(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.score_samples(X), hb_model.score_samples(X), rtol=1e-06, atol=1e-06)

    # Test tvm transform with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_batch_transform(self):
        warnings.filterwarnings("ignore")
        model = StandardScaler(with_mean=True, with_std=True)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)

        model.fit(X)

        batch_size = 10
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :])

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.transform(X), hb_model.transform(X), rtol=1e-06, atol=1e-06)

    # Test tvm regression with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_regression_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingRegressor(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(103, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=103)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :], remainder_size=remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test tvm classification with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_classification_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        batch_size = 10
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :])

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.predict_proba(X), hb_model.predict_proba(X), rtol=1e-06, atol=1e-06)

    # Test tvm iforest with batching.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_iforest_batch(self):
        warnings.filterwarnings("ignore")
        num_classes = 2
        model = IsolationForest(n_estimators=10, max_samples=2)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        batch_size = 10
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :])

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.decision_function(X), hb_model.decision_function(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.score_samples(X), hb_model.score_samples(X), rtol=1e-06, atol=1e-06)

    # Test tvm transform with batching and uneven numer of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_batch_remainder_transform(self):
        warnings.filterwarnings("ignore")
        model = StandardScaler(with_mean=True, with_std=True)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)

        model.fit(X)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :], remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.transform(X), hb_model.transform(X), rtol=1e-06, atol=1e-06)

    # Test tvm regression with batching and uneven numer of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_regression_remainder_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingRegressor(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=105)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :], remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test tvm classification with batching and uneven numer of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_classification_remainder_batch(self):
        warnings.filterwarnings("ignore")
        max_depth = 10
        num_classes = 2
        model = GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=105)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :], remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.predict_proba(X), hb_model.predict_proba(X), rtol=1e-06, atol=1e-06)

    # Test tvm iforest with batching and uneven numer of records.
    @unittest.skipIf(not tvm_installed(), reason="TVM test require TVM")
    def test_tvm_iforest_remainder_batch(self):
        warnings.filterwarnings("ignore")
        num_classes = 2
        model = IsolationForest(n_estimators=10, max_samples=2)
        np.random.seed(0)
        X = np.random.rand(105, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=105)

        model.fit(X, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(model, "tvm", X[:batch_size, :], remainder_size)

        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.decision_function(X), hb_model.decision_function(X), rtol=1e-06, atol=1e-06)
        np.testing.assert_allclose(model.score_samples(X), hb_model.score_samples(X), rtol=1e-06, atol=1e-06)

    # Test batch with pandas.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pandas_batch(self):
        import pandas

        max_depth = 10
        iris = datasets.load_iris()
        X = iris.data[:149, :3]
        y = iris.target[:149]
        columns = ["vA", "vB", "vC"]
        X_train = pandas.DataFrame(X, columns=columns)

        pipeline = Pipeline(
            steps=[
                ("preprocessor", ColumnTransformer(transformers=[], remainder="passthrough",)),
                ("classifier", GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)),
            ]
        )

        pipeline.fit(X_train, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        torch_model = hummingbird.ml.convert_batch(
            pipeline, "torch", pandas.DataFrame(X[:batch_size], columns=columns), remainder_size
        )

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            pipeline.predict_proba(X_train), torch_model.predict_proba(X_train), rtol=1e-06, atol=1e-06,
        )

    # Test batch with pandas ts.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_pandas_batch_ts(self):
        import pandas

        max_depth = 10
        iris = datasets.load_iris()
        X = iris.data[:149, :3]
        y = iris.target[:149]
        columns = ["vA", "vB", "vC"]
        X_train = pandas.DataFrame(X, columns=columns)

        pipeline = Pipeline(
            steps=[
                ("preprocessor", ColumnTransformer(transformers=[], remainder="passthrough",)),
                ("classifier", GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)),
            ]
        )

        pipeline.fit(X_train, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        torch_model = hummingbird.ml.convert_batch(
            pipeline, "torch.jit", pandas.DataFrame(X[:batch_size], columns=columns), remainder_size
        )

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            pipeline.predict_proba(X_train), torch_model.predict_proba(X_train), rtol=1e-06, atol=1e-06,
        )

    # Test batch with pandas onnx.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    @unittest.skipIf(not onnx_runtime_installed(), reason="ONNXML test require ONNX and ORT")
    def test_pandas_batch_onnx(self):
        import pandas

        max_depth = 10
        iris = datasets.load_iris()
        X = iris.data[:149, :3]
        y = iris.target[:149]
        columns = ["vA", "vB", "vC"]
        X_train = pandas.DataFrame(X, columns=columns)

        pipeline = Pipeline(
            steps=[
                ("preprocessor", ColumnTransformer(transformers=[], remainder="passthrough",)),
                ("classifier", GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)),
            ]
        )

        pipeline.fit(X_train, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(
            pipeline, "onnx", pandas.DataFrame(X[:batch_size], columns=columns), remainder_size
        )

        self.assertTrue(hb_model is not None)

        np.testing.assert_allclose(
            pipeline.predict_proba(X_train), hb_model.predict_proba(X_train), rtol=1e-06, atol=1e-06,
        )

    # Test batch with pandas from onnxml.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    def test_pandas_batch_onnxml(self):
        import pandas

        max_depth = 10
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        columns = ["vA", "vB", "vC"]
        X_train = pandas.DataFrame(X, columns=columns)

        pipeline = Pipeline(
            steps=[
                ("preprocessor", ColumnTransformer(transformers=[], remainder="passthrough",)),
                ("classifier", GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)),
            ]
        )

        pipeline.fit(X_train, y)

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            pipeline,
            initial_types=[
                ("vA", DoubleTensorType([X.shape[0], 1])),
                ("vB", DoubleTensorType([X.shape[0], 1])),
                ("vC", DoubleTensorType([X.shape[0], 1])),
            ],
            target_opset=9,
        )

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(
            onnx_ml_model, "onnx", pandas.DataFrame(X[:batch_size], columns=columns), remainder_size
        )

        self.assertTrue(hb_model is not None)

        np.testing.assert_allclose(
            pipeline.predict_proba(X_train), hb_model.predict_proba(X_train), rtol=1e-06, atol=1e-06,
        )

    # Test batch with pandas tvm.
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM")
    def test_pandas_batch_tvm(self):
        import pandas

        max_depth = 10
        iris = datasets.load_iris()
        X = iris.data[:149, :3]
        y = iris.target[:149]
        columns = ["vA", "vB", "vC"]
        X_train = pandas.DataFrame(X, columns=columns)

        pipeline = Pipeline(
            steps=[
                ("preprocessor", ColumnTransformer(transformers=[], remainder="passthrough",)),
                ("classifier", GradientBoostingClassifier(n_estimators=10, max_depth=max_depth)),
            ]
        )

        pipeline.fit(X_train, y)

        batch_size = 10
        remainder_size = X.shape[0] % batch_size
        hb_model = hummingbird.ml.convert_batch(
            pipeline, "tvm", pandas.DataFrame(X[:batch_size], columns=columns), remainder_size
        )

        self.assertTrue(hb_model is not None)

        np.testing.assert_allclose(
            pipeline.predict_proba(X_train), hb_model.predict_proba(X_train), rtol=1e-06, atol=1e-06,
        )

    # Check converter with model name set as extra_config.
    @unittest.skipIf(
        not (onnx_ml_tools_installed() and onnx_runtime_installed()), reason="ONNXML test require ONNX, ORT and ONNXMLTOOLS"
    )
    @unittest.skipIf(not lightgbm_installed(), reason="LightGBM test requires LightGBM installed")
    def test_lightgbm_pytorch_extra_config(self):
        warnings.filterwarnings("ignore")
        X = [[0, 1], [1, 1], [2, 0]]
        X = np.array(X, dtype=np.float32)
        y = np.array([100, -10, 50], dtype=np.float32)
        model = lgb.LGBMRegressor(n_estimators=3, min_child_samples=1)
        model.fit(X, y)

        # Create ONNX-ML model
        onnx_ml_model = convert_lightgbm(
            model, initial_types=[("input", FloatTensorType([X.shape[0], X.shape[1]]))], target_opset=9
        )

        # Create ONNX model
        model_name = "hummingbird.ml.test.lightgbm"
        onnx_model = hummingbird.ml.convert(onnx_ml_model, "onnx", extra_config={constants.ONNX_OUTPUT_MODEL_NAME: model_name})

        assert onnx_model.model.graph.name == model_name

    # Test max fuse depth configuration in TVM.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    def test_tvm_max_fuse(self):
        warnings.filterwarnings("ignore")

        X = [[0, 1], [1, 1], [2, 0]]
        X = np.array(X, dtype=np.float32)
        y = np.array([100, -10, 50], dtype=np.float32)
        model = lgb.LGBMRegressor(n_estimators=3, min_child_samples=1)
        model.fit(X, y)

        hb_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_MAX_FUSE_DEPTH: 30})
        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test TVM without padding returns an errror is sizes don't match.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    def test_tvm_no_padding(self):
        warnings.filterwarnings("ignore")

        np.random.seed(0)
        X = np.random.rand(100, 20)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(2, size=100)
        model = lgb.LGBMRegressor(n_estimators=10)
        model.fit(X, y)

        hb_model = hummingbird.ml.convert(model, "tvm", X)
        self.assertIsNotNone(hb_model)
        self.assertRaises(AssertionError, hb_model.predict, X[:98])

    # Test padding in TVM.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    def test_tvm_padding(self):
        warnings.filterwarnings("ignore")

        np.random.seed(0)
        X = np.random.rand(100, 20)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(2, size=100)
        model = lgb.LGBMRegressor(n_estimators=10)
        model.fit(X, y)

        hb_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_PAD_INPUT: True})
        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X[:98]), hb_model.predict(X[:98]), rtol=1e-06, atol=1e-06)

    # Test padding in TVM does not create problems when not necessary.
    @unittest.skipIf(not tvm_installed(), reason="TVM test requires TVM installed")
    def test_tvm_padding_2(self):
        warnings.filterwarnings("ignore")

        X = [[0, 1], [1, 1], [2, 0]]
        X = np.array(X, dtype=np.float32)
        y = np.array([100, -10, 50], dtype=np.float32)
        model = lgb.LGBMRegressor(n_estimators=3, min_child_samples=1)
        model.fit(X, y)

        hb_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_PAD_INPUT: True})
        self.assertIsNotNone(hb_model)
        np.testing.assert_allclose(model.predict(X), hb_model.predict(X), rtol=1e-06, atol=1e-06)

    # Test max string lentgh.
    def test_max_str_length(self):
        model = LabelEncoder()
        data = [
            "paris",
            "tokyo",
            "amsterdam",
            "tokyo",
        ]
        model.fit(data)

        torch_model = hummingbird.ml.convert(model, "torch", extra_config={constants.MAX_STRING_LENGTH: 20})

        np.testing.assert_allclose(model.transform(data), torch_model.transform(data), rtol=1e-06, atol=1e-06)
コード例 #5
0
class TestSparkMLPipeline(unittest.TestCase):
    @unittest.skipIf(not sparkml_installed(),
                     reason="Spark-ML test requires pyspark")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_pipeline_1(self):
        n_features = 10
        n_total = 100
        classes = 2
        np.random.seed(0)
        warnings.filterwarnings("ignore")
        X = np.random.rand(n_total, n_features)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(classes, size=(n_total, 1))

        arr = np.concatenate([y, X], axis=1).reshape(n_total, -1)
        df = map(lambda x: (int(x[0]), Vectors.dense(x[1:])), arr)
        df = sql.createDataFrame(df, schema=["label", "features"])

        pipeline = Pipeline(stages=[LogisticRegression()])
        model = pipeline.fit(df)

        test_df = df.select("features").limit(1)
        torch_model = convert(model, "torch", test_df)
        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            np.array(model.transform(df).select(
                "prediction").collect()).reshape(-1),
            torch_model.predict(X),
            rtol=1e-06,
            atol=1e-06,
        )

        np.testing.assert_allclose(
            np.array(
                model.transform(df).select("probability").collect()).reshape(
                    -1, classes),
            torch_model.predict_proba(X),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    def test_pipeline2(self):
        iris = load_iris()
        features = [
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]

        pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                             columns=features + ["label"])
        df = sql.createDataFrame(pd_df)

        quantile = QuantileDiscretizer(inputCol="sepal_length",
                                       outputCol="sepal_length_bucket",
                                       numBuckets=2)
        features = ["sepal_length_bucket"] + features
        assembler = VectorAssembler(inputCols=features, outputCol="features")
        pipeline = Pipeline(stages=[quantile, assembler, LogisticRegression()])
        model = pipeline.fit(df)

        df = df.select(
            ["sepal_length", "sepal_width", "petal_length", "petal_width"])
        pd_df = pd_df[[
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]]
        torch_model = convert(model, "torch", df)
        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            np.array(model.transform(df).select(
                "prediction").collect()).reshape(-1),
            torch_model.predict(pd_df),
            rtol=1e-06,
            atol=1e-06,
        )

        np.testing.assert_allclose(
            np.array(
                model.transform(df).select("probability").collect()).reshape(
                    -1, 3),
            torch_model.predict_proba(pd_df),
            rtol=1e-06,
            atol=1e-05,
        )

    @unittest.skipIf((not sparkml_installed()) or (not pandas_installed()),
                     reason="Spark-ML test requires pyspark and pandas")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_pipeline3(self):
        iris = load_iris()
        features = [
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]

        pd_df = pd.DataFrame(data=np.c_[iris["data"], iris["target"]],
                             columns=features + ["label"])
        df = sql.createDataFrame(pd_df)

        quantile1 = QuantileDiscretizer(inputCol="sepal_length",
                                        outputCol="sepal_length_bucket",
                                        numBuckets=2)
        quantile2 = QuantileDiscretizer(inputCol="sepal_width",
                                        outputCol="sepal_width_bucket",
                                        numBuckets=2)
        features = ["sepal_length_bucket", "sepal_width_bucket"] + features
        assembler = VectorAssembler(inputCols=features, outputCol="features")
        pipeline = Pipeline(
            stages=[quantile1, quantile2, assembler,
                    LogisticRegression()])
        model = pipeline.fit(df)

        df = df.select(
            ["sepal_length", "sepal_width", "petal_length", "petal_width"])
        pd_df = pd_df[[
            "sepal_length", "sepal_width", "petal_length", "petal_width"
        ]]
        torch_model = convert(model, "torch", df)
        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            np.array(model.transform(df).select(
                "prediction").collect()).reshape(-1),
            torch_model.predict(pd_df),
            rtol=1e-06,
            atol=1e-06,
        )

        np.testing.assert_allclose(
            np.array(
                model.transform(df).select("probability").collect()).reshape(
                    -1, 3),
            torch_model.predict_proba(pd_df),
            rtol=1e-06,
            atol=1e-05,
        )
コード例 #6
0
from hummingbird.ml._utils import sparkml_installed, pandas_installed
from hummingbird.ml import convert
from distutils.version import LooseVersion

if sparkml_installed():
    from pyspark.sql import SparkSession, SQLContext
    from pyspark.ml import Pipeline
    from pyspark.ml.linalg import Vectors
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.feature import QuantileDiscretizer, VectorAssembler

    spark = SparkSession.builder.master("local[*]").config(
        "spark.driver.bindAddress", "127.0.0.1").getOrCreate()
    sql = SQLContext(spark)

if pandas_installed():
    import pandas as pd


class TestSparkMLPipeline(unittest.TestCase):
    @unittest.skipIf(not sparkml_installed(),
                     reason="Spark-ML test requires pyspark")
    @unittest.skipIf(LooseVersion(torch.__version__) < LooseVersion("1.6.0"),
                     reason="Spark-ML test requires torch >= 1.6.0")
    def test_pipeline_1(self):
        n_features = 10
        n_total = 100
        classes = 2
        np.random.seed(0)
        warnings.filterwarnings("ignore")
        X = np.random.rand(n_total, n_features)
コード例 #7
0
ファイル: test_prophet.py プロジェクト: TuanBC/hummingbird
class TestProphet(unittest.TestCase):
    def _get_data(self):
        local_path = "tests/resources"
        local_data = os.path.join(local_path,
                                  "example_wp_log_peyton_manning.csv")
        url = "https://raw.githubusercontent.com/facebook/prophet/master/examples/example_wp_log_peyton_manning.csv"
        if not os.path.isfile(local_data):
            os.makedirs(local_path)
            urlretrieve(url, local_data)
        data = pd.read_csv(local_data)
        return data

    @unittest.skipIf(not (pandas_installed() and prophet_installed()),
                     reason="Test requires Prophet and Pandas")
    def test_prophet_trend(self):
        df = self._get_data()

        m = Prophet()
        m.fit(df)

        # Convert with Hummingbird.
        hb_model = hummingbird.ml.convert(m, "torch")

        # Predictions.
        future = m.make_future_dataframe(periods=365)
        prophet_trend = m.predict(future)["trend"].values
        hb_trend = hb_model.predict(future)

        np.testing.assert_allclose(prophet_trend,
                                   hb_trend,
                                   rtol=1e-06,
                                   atol=1e-06)

    @unittest.skipIf(
        not (pandas_installed() and prophet_installed()),
        reason="Test requires Prophet, Pandas and ONNX runtime.",
    )
    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.8.1"),
        reason="Test requires Torch 1.8.1.",
    )
    @unittest.skipIf(
        not onnx_runtime_installed()
        or LooseVersion(onnxruntime.__version__) < LooseVersion("1.7.0"),
        reason="Prophet test requires onnxruntime => 1.7.0",
    )
    def test_prophet_trend_onnx(self):
        df = self._get_data()

        m = Prophet()
        m.fit(df)

        future = m.make_future_dataframe(periods=365)
        future_np = (future.values -
                     np.datetime64("1970-01-01T00:00:00.000000000")).astype(
                         np.int64) / 1000000000

        # Convert with Hummingbird.
        hb_model = hummingbird.ml.convert(m, "onnx", future_np)

        # Predictions.
        prophet_trend = m.predict(future)["trend"]
        hb_trend = hb_model.predict(future_np)
        import onnx

        onnx.save(hb_model.model, "prophet.onnx")

        np.testing.assert_allclose(prophet_trend,
                                   hb_trend,
                                   rtol=1e-06,
                                   atol=1e-06)
コード例 #8
0
class TestSklearnPipeline(unittest.TestCase):
    def test_pipeline(self):
        data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32)
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_pipeline2(self):
        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]],
                        dtype=np.float32)
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_combine_inputs_union_in_pipeline(self):
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]],
                        dtype=np.float32)
        model = Pipeline([
            ("scaler1", StandardScaler()),
            ("union",
             FeatureUnion([("scaler2", StandardScaler()),
                           ("scaler3", MinMaxScaler())])),
        ])
        model.fit(data)

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_combine_inputs_floats_ints(self):
        data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_1(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        preprocessor = ColumnTransformer(transformers=[("num",
                                                        numeric_transformer,
                                                        numeric_features)])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_weights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="drop",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop_noweights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            remainder="drop",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_noweights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_slice(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = slice(0, 1)  # ["vA", "vB"]
        categorical_features = slice(3, 4)  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="Test requires ORT installed")
    def test_pipeline_many_inputs(self):
        n_features = 18
        X = np.random.rand(100, n_features)
        y = np.random.randint(1000, size=100)

        scaler_transformer = Pipeline(steps=[("scaler", StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[("scaling", scaler_transformer,
                           list(range(n_features)))])
        model = RandomForestRegressor(n_estimators=10, max_depth=9)
        pipeline = Pipeline(steps=[("preprocessor",
                                    preprocessor), ("model", model)])

        pipeline.fit(X, y)

        X_test = tuple(np.split(X, n_features, axis=1))

        hb_model = hummingbird.ml.convert(pipeline, "onnx", X_test)

        assert len(hb_model.model.graph.input) == n_features

        np.testing.assert_allclose(
            pipeline.predict(X),
            np.array(hb_model.predict(X_test)).flatten(),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="Test requires ORT installed")
    def test_pipeline_many_inputs_with_schema(self):
        n_features = 5
        X = np.random.rand(100, n_features)
        y = np.random.randint(1000, size=100)
        input_column_names = ["A", "B", "C", "D", "E"]
        output_column_names = ["score"]

        scaler_transformer = Pipeline(steps=[("scaler", StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[("scaling", scaler_transformer,
                           list(range(n_features)))])
        model = RandomForestRegressor(n_estimators=10, max_depth=9)
        pipeline = Pipeline(steps=[("preprocessor",
                                    preprocessor), ("model", model)])

        pipeline.fit(X, y)

        X_test = tuple(np.split(X, n_features, axis=1))
        extra_config = {
            constants.INPUT_NAMES: input_column_names,
            constants.OUTPUT_NAMES: output_column_names
        }

        hb_model = hummingbird.ml.convert(pipeline,
                                          "onnx",
                                          X_test,
                                          extra_config=extra_config)

        graph_inputs = [input.name for input in hb_model.model.graph.input]
        graph_outputs = [output.name for output in hb_model.model.graph.output]

        assert len(hb_model.model.graph.input) == n_features
        assert graph_inputs == input_column_names
        assert graph_outputs == output_column_names
コード例 #9
0
class TestSklearnLinearClassifiers(unittest.TestCase):

    # LogisticRegression test function to be parameterized
    def _test_logistic_regression(self, num_classes, solver="liblinear", multi_class="auto", labels_shift=0):
        if num_classes > 2:
            model = LogisticRegression(solver=solver, multi_class=multi_class, fit_intercept=True)
        else:
            model = LogisticRegression(solver="liblinear", fit_intercept=True)

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100) + labels_shift

        model.fit(X, y)

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # LogisticRegression binary
    def test_logistic_regression_bi(self):
        self._test_logistic_regression(2)

    # LogisticRegression multiclass with auto
    def test_logistic_regression_multi_auto(self):
        self._test_logistic_regression(3)

    # LogisticRegression with class labels shifted
    def test_logistic_regression_shifted_classes(self):
        self._test_logistic_regression(3, labels_shift=2)

    # LogisticRegression with multi+ovr
    def test_logistic_regression_multi_ovr(self):
        self._test_logistic_regression(3, multi_class="ovr")

    # LogisticRegression with multi+multinomial+sag
    def test_logistic_regression_multi_multin_sag(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, multi_class="multinomial", solver="sag")

    # LogisticRegression binary lbfgs
    def test_logistic_regression_bi_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(2, solver="lbfgs")

    # LogisticRegression with multi+lbfgs
    def test_logistic_regression_multi_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, solver="lbfgs")

    # LogisticRegression with multi+multinomial+lbfgs
    def test_logistic_regression_multi_multin_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, multi_class="multinomial", solver="lbfgs")

    # LogisticRegression with multi+ovr+lbfgs
    def test_logistic_regression_multi_ovr_lbfgs(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression(3, multi_class="ovr", solver="lbfgs")

    # LinearRegression test function to be parameterized
    def _test_linear_regression(self, y_input):
        model = LinearRegression()

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = y_input

        model.fit(X, y)

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)

    # LinearRegression with ints
    def test_linear_regression_int(self):
        np.random.seed(0)
        self._test_linear_regression(np.random.randint(2, size=100))

    # LinearRegression with floats
    def test_linear_regression_float(self):
        np.random.seed(0)
        self._test_linear_regression(np.random.rand(100))

    # RidgeCV test function to be parameterized
    def _test_ridge_cv(self, y_input):
        model = RidgeCV()

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = y_input

        model.fit(X, y)

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)

    # RidgeCV with ints
    def test_ridge_cv_int(self):
        np.random.seed(0)
        self._test_ridge_cv(np.random.randint(2, size=100))

    # RidgeCV with floats
    def test_ridge_cv_float(self):
        np.random.seed(0)
        self._test_ridge_cv(np.random.rand(100))

    # LogisticRegressionCV test function to be parameterized
    def _test_logistic_regression_cv(self, num_classes, solver="liblinear", multi_class="auto", labels_shift=0):
        if num_classes > 2:
            model = LogisticRegressionCV(solver=solver, multi_class=multi_class, fit_intercept=True)
        else:
            model = LogisticRegressionCV(solver="liblinear", fit_intercept=True)

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100) + labels_shift

        model.fit(X, y)
        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # LogisticRegressionCV with 2 classes
    def test_logistic_regression_cv_bi(self):
        self._test_logistic_regression_cv(2)

    # LogisticRegressionCV with 3 classes
    def test_logistic_regression_cv_multi(self):
        self._test_logistic_regression_cv(3)

    # LogisticRegressionCV with shifted classes
    def test_logistic_regression_cv_shifted_classes(self):
        self._test_logistic_regression_cv(3, labels_shift=2)

    # LogisticRegressionCV with multi+ovr
    def test_logistic_regression_cv_multi_ovr(self):
        self._test_logistic_regression_cv(3, multi_class="ovr")

    # LogisticRegressionCV with multi+multinomial
    def test_logistic_regression_cv_multi_multin(self):
        warnings.filterwarnings("ignore")
        # this will not converge due to small test size
        self._test_logistic_regression_cv(3, multi_class="multinomial", solver="sag")

    # SGDClassifier test function to be parameterized
    def _test_sgd_classifier(self, num_classes):

        model = SGDClassifier(loss="log")

        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict_proba(X), torch_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # SGDClassifier with 2 classes
    def test_sgd_classifier_bi(self):
        self._test_sgd_classifier(2)

    # SGDClassifier with 3 classes
    def test_sgd_classifier_multi(self):
        self._test_sgd_classifier(3)

    # SGDClassifier with modified huber loss
    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Modified Huber loss test requires torch >= 1.6.0"
    )
    def test_modified_huber(self):
        X = np.array([[-0.5, -1], [-1, -1], [-0.1, -0.1], [0.1, -0.2], [0.5, 1], [1, 1], [0.1, 0.1], [-0.1, 0.2]])
        Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])

        model = SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3)
        model.fit(X, Y)

        # Use Hummingbird to convert the model to PyTorch
        hb_model = hummingbird.ml.convert(model, "torch")

        inputs = [[-1, -1], [1, 1], [-0.2, 0.1], [0.2, -0.1]]
        np.testing.assert_allclose(model.predict_proba(inputs), hb_model.predict_proba(inputs), rtol=1e-6, atol=1e-6)

    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Modified Huber loss test requires torch >= 1.6.0"
    )
    def test_modified_huber2(self):
        X = np.array([[-0.5, -1], [-1, -1], [-0.1, -0.1], [0.1, -0.2], [0.5, 1], [1, 1], [0.1, 0.1], [-0.1, 0.2]])
        Y = np.array([1, 1, 1, 1, 2, 2, 2, 2])

        model = SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3)
        model.fit(X, Y)

        # Use Hummingbird to convert the model to PyTorch
        hb_model = hummingbird.ml.convert(model, "torch")

        np.testing.assert_allclose(model.predict_proba(X), hb_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # SGDClassifier with modified huber loss multiclass
    @unittest.skipIf(
        LooseVersion(torch.__version__) < LooseVersion("1.6.0"), reason="Modified Huber loss test requires torch >= 1.6.0"
    )
    def test_modified_huber_multi(self):
        X = np.array([[-0.5, -1], [-1, -1], [-0.1, -0.1], [0.1, -0.2], [0.5, 1], [1, 1], [0.1, 0.1], [-0.1, 0.2]])
        Y = np.array([0, 1, 1, 1, 2, 2, 2, 2])

        model = SGDClassifier(loss="modified_huber", max_iter=1000, tol=1e-3)
        model.fit(X, Y)

        # Use Hummingbird to convert the model to PyTorch
        hb_model = hummingbird.ml.convert(model, "torch")

        inputs = [[-1, -1], [1, 1], [-0.2, 0.1], [0.2, -0.1]]
        np.testing.assert_allclose(model.predict_proba(inputs), hb_model.predict_proba(inputs), rtol=1e-6, atol=1e-6)

    # Failure cases
    def test_sklearn_linear_model_raises_wrong_type(self):
        warnings.filterwarnings("ignore")
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(3, size=100).astype(np.float32)  # y must be int, not float, should error
        model = SGDClassifier().fit(X, y)
        self.assertRaises(RuntimeError, hummingbird.ml.convert, model, "torch")

    # Float 64 data tests
    def test_float64_linear_regression(self):
        model = LinearRegression()

        np.random.seed(0)
        X = np.random.rand(100, 200)
        y = np.random.randint(2, size=100)

        model.fit(X, y)

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)

    def test_float64_sgd_classifier(self):

        model = SGDClassifier(loss="log")

        np.random.seed(0)
        num_classes = 3
        X = np.random.rand(100, 200)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        torch_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(torch_model is not None)
        np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-6, atol=1e-6)

    # Multioutput regression tests
    def test_multioutput_linear_regression(self):
        for n_targets in [1, 2, 7]:
            model = LinearRegression()
            X, y = datasets.make_regression(
                n_samples=100, n_features=10, n_informative=5, n_targets=n_targets, random_state=2021
            )
            model.fit(X, y)

            torch_model = hummingbird.ml.convert(model, "torch")
            self.assertTrue(torch_model is not None)
            np.testing.assert_allclose(model.predict(X), torch_model.predict(X), rtol=1e-5, atol=1e-5)

    # Test Pandas input
    @unittest.skipIf(not pandas_installed(), reason="Test requires pandas installed")
    def test_logistic_regression_pandas(self):
        model = LogisticRegression(solver="liblinear")

        data = datasets.load_iris()
        X, y = data.data[:, :3], data.target
        X = X.astype(np.float32)
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2

        model.fit(X_train, y_train)

        hb_model = hummingbird.ml.convert(model, "torch")
        self.assertTrue(hb_model is not None)
        np.testing.assert_allclose(model.predict(X_train), hb_model.predict(X_train), rtol=1e-6, atol=1e-6)
        np.testing.assert_allclose(model.predict_proba(X_train), hb_model.predict_proba(X_train), rtol=1e-6, atol=1e-6)

    # Test Torschscript backend.
    def test_logistic_regression_ts(self):

        model = LogisticRegression(solver="liblinear")

        data = datasets.load_iris()
        X, y = data.data, data.target
        X = X.astype(np.float32)

        model.fit(X, y)

        ts_model = hummingbird.ml.convert(model, "torch.jit", X)
        self.assertTrue(ts_model is not None)
        np.testing.assert_allclose(model.predict(X), ts_model.predict(X), rtol=1e-6, atol=1e-6)
        np.testing.assert_allclose(model.predict_proba(X), ts_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    # Test TVM backends.
    @unittest.skipIf(not (tvm_installed()), reason="TVM tests require TVM")
    def test_sgd_classifier_tvm(self):

        model = SGDClassifier(loss="log")

        np.random.seed(0)
        num_classes = 3
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        tvm_model = hummingbird.ml.convert(model, "tvm", X)
        self.assertTrue(tvm_model is not None)
        np.testing.assert_allclose(model.predict(X), tvm_model.predict(X), rtol=1e-6, atol=1e-6)
        np.testing.assert_allclose(model.predict_proba(X), tvm_model.predict_proba(X), rtol=1e-6, atol=1e-6)

    @unittest.skipIf(not (tvm_installed()), reason="TVM tests require TVM")
    def test_lr_tvm(self):

        model = LinearRegression()

        np.random.seed(0)
        num_classes = 1000
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        y = np.random.randint(num_classes, size=100)

        model.fit(X, y)

        tvm_model = hummingbird.ml.convert(model, "tvm", X, extra_config={constants.TVM_MAX_FUSE_DEPTH: 30})
        self.assertTrue(tvm_model is not None)

        np.testing.assert_allclose(model.predict(X), tvm_model.predict(X), rtol=1e-6, atol=1e-3)
コード例 #10
0
class TestSklearnPipeline(unittest.TestCase):
    def test_pipeline(self):
        data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32)
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_pipeline2(self):
        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]],
                        dtype=np.float32)
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_combine_inputs_union_in_pipeline(self):
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]],
                        dtype=np.float32)
        model = Pipeline([
            ("scaler1", StandardScaler()),
            ("union",
             FeatureUnion([("scaler2", StandardScaler()),
                           ("scaler3", MinMaxScaler())])),
        ])
        model.fit(data)

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_combine_inputs_floats_ints(self):
        data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_1(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        preprocessor = ColumnTransformer(transformers=[("num",
                                                        numeric_transformer,
                                                        numeric_features)])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_string(self):
        """
        TODO: Hummingbird does not yet support strings in this context. Should raise error.
        When this feature is complete, change this test.
        """
        # fit
        titanic_url = "https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv"
        data = pandas.read_csv(titanic_url)
        X = data.drop("survived", axis=1)
        y = data["survived"]
        # SimpleImputer on string is not available for string
        # in ONNX-ML specifications.
        # So we do it beforehand.
        X["pclass"].fillna("missing", inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)

        numeric_features = ["age", "fare"]
        numeric_transformer = Pipeline(
            steps=[("imputer", SimpleImputer(
                strategy="median")), ("scaler", StandardScaler())])

        categorical_features = ["pclass"]
        categorical_transformer = Pipeline(
            steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        clf = Pipeline(
            steps=[("preprocessor", preprocessor
                    ), ("classifier", LogisticRegression(solver="liblinear"))])

        to_drop = {
            "parch", "sibsp", "cabin", "ticket", "name", "body", "home.dest",
            "boat", "sex", "embarked"
        }

        X_train = X_train.copy()
        X_test = X_test.copy()
        X_train["pclass"] = X_train["pclass"].astype(np.int64)
        X_test["pclass"] = X_test["pclass"].astype(np.int64)
        X_train = X_train.drop(to_drop, axis=1)
        X_test = X_test.drop(to_drop, axis=1)

        clf.fit(X_train, y_train)

        torch_model = hummingbird.ml.convert(clf, "torch", X_test)

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            clf.predict(X_test),
            torch_model.predict(X_test),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_pandas(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch", X_test)

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_pandas_ts(self):
        iris = datasets.load_iris()
        X = np.array(
            iris.data[:, :3], np.float32
        )  # If we don't use float32 here, with python 3.5 and torch 1.5.1 will fail.
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("preprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch.jit", X_test)

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_weights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
        )

        model = Pipeline(steps=[("preprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_weights_pandas(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch", X_test)

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="drop",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop_noweights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            remainder="drop",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_noweights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_slice(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = slice(0, 1)  # ["vA", "vB"]
        categorical_features = slice(3, 4)  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    # Taken from https://github.com/microsoft/hummingbird/issues/388https://github.com/microsoft/hummingbird/issues/388
    def test_pipeline_pca_rf(self):
        X, y = make_regression(n_samples=1000,
                               n_features=8,
                               n_informative=5,
                               n_targets=1,
                               random_state=0,
                               shuffle=True)
        pca = PCA(n_components=8, svd_solver="randomized", whiten=True)
        clf = make_pipeline(
            StandardScaler(), pca,
            RandomForestRegressor(n_estimators=10,
                                  max_depth=30,
                                  random_state=0))
        clf.fit(X, y)

        model = hummingbird.ml.convert(clf, "pytorch")

        prediction_sk = clf.predict([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

        prediction_hb = model.predict(
            [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]])

        np.testing.assert_allclose(prediction_sk,
                                   prediction_hb,
                                   rtol=1e-06,
                                   atol=1e-06)

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="Test requires ORT installed")
    def test_pipeline_many_inputs(self):
        n_features = 18
        X = np.random.rand(100, n_features)
        y = np.random.randint(1000, size=100)

        scaler_transformer = Pipeline(steps=[("scaler", StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[("scaling", scaler_transformer,
                           list(range(n_features)))])
        model = RandomForestRegressor(n_estimators=10, max_depth=9)
        pipeline = Pipeline(steps=[("preprocessor",
                                    preprocessor), ("model", model)])

        pipeline.fit(X, y)

        X_test = tuple(np.split(X, n_features, axis=1))

        hb_model = hummingbird.ml.convert(pipeline, "onnx", X_test)

        assert len(hb_model.model.graph.input) == n_features

        np.testing.assert_allclose(
            pipeline.predict(X),
            np.array(hb_model.predict(X_test)).flatten(),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not onnx_runtime_installed(),
                     reason="Test requires ORT installed")
    def test_pipeline_many_inputs_with_schema(self):
        n_features = 5
        X = np.random.rand(100, n_features)
        y = np.random.randint(1000, size=100)
        input_column_names = ["A", "B", "C", "D", "E"]
        output_column_names = ["score"]

        scaler_transformer = Pipeline(steps=[("scaler", StandardScaler())])
        preprocessor = ColumnTransformer(
            transformers=[("scaling", scaler_transformer,
                           list(range(n_features)))])
        model = RandomForestRegressor(n_estimators=10, max_depth=9)
        pipeline = Pipeline(steps=[("preprocessor",
                                    preprocessor), ("model", model)])

        pipeline.fit(X, y)

        X_test = tuple(np.split(X, n_features, axis=1))
        extra_config = {
            constants.INPUT_NAMES: input_column_names,
            constants.OUTPUT_NAMES: output_column_names
        }

        hb_model = hummingbird.ml.convert(pipeline,
                                          "onnx",
                                          X_test,
                                          extra_config=extra_config)

        graph_inputs = [input.name for input in hb_model.model.graph.input]
        graph_outputs = [output.name for output in hb_model.model.graph.output]

        assert len(hb_model.model.graph.input) == n_features
        assert graph_inputs == input_column_names
        assert graph_outputs == output_column_names
コード例 #11
0
class TestSklearnPipeline(unittest.TestCase):
    def test_pipeline(self):
        data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]], dtype=np.float32)
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_pipeline2(self):
        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]],
                        dtype=np.float32)
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_combine_inputs_union_in_pipeline(self):
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline

        data = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]],
                        dtype=np.float32)
        model = Pipeline([
            ("scaler1", StandardScaler()),
            ("union",
             FeatureUnion([("scaler2", StandardScaler()),
                           ("scaler3", MinMaxScaler())])),
        ])
        model.fit(data)

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    def test_combine_inputs_floats_ints(self):
        data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]]
        scaler = StandardScaler()
        scaler.fit(data)
        model = Pipeline([("scaler1", scaler), ("scaler2", scaler)])

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.transform(data),
            torch_model.transform(data),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_1(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        preprocessor = ColumnTransformer(transformers=[("num",
                                                        numeric_transformer,
                                                        numeric_features)])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_weights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="drop",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_drop_noweights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            remainder="drop",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_noweights(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = [0, 1]  # ["vA", "vB"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )

    @unittest.skipIf(ColumnTransformer is None,
                     reason="ColumnTransformer not available in 0.19")
    @unittest.skipIf(not pandas_installed(),
                     reason="Test requires pandas installed")
    def test_pipeline_column_transformer_passthrough_slice(self):
        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: 1 if x > 0.5 else 2)
        X_train["vcat2"] = X_train["vB"].apply(lambda x: 3 if x > 0.5 else 4)
        y_train = y % 2
        numeric_features = slice(0, 1)  # ["vA", "vB"]
        categorical_features = slice(3, 4)  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="liblinear",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

        categorical_transformer = Pipeline(
            steps=[("onehot",
                    OneHotEncoder(sparse=True, handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(
            transformers=[
                ("num", numeric_transformer, numeric_features),
                ("cat", categorical_transformer, categorical_features),
            ],
            transformer_weights={
                "num": 2,
                "cat": 3
            },
            remainder="passthrough",
        )

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)

        X_test = X_train[:11]

        torch_model = hummingbird.ml.convert(model, "torch")

        self.assertTrue(torch_model is not None)

        np.testing.assert_allclose(
            model.predict_proba(X_test),
            torch_model.predict_proba(X_test.values),
            rtol=1e-06,
            atol=1e-06,
        )