Example #1
    def test_dct(self):
        data = self.spark.createDataFrame([(Vectors.dense([5.0, 8.0, 6.0]), )],
                                          ["vec"])
        model = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
        # the input name should match the DCT inputCol above
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(
            model, 'Sparkml DCT',
            [('vec', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().resultVec.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().vec.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlDCT")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['resultVec'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
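The helpers save_data_models, run_onnx_model, and compare_results come from the
onnxmltools test utilities. For readers without that harness, a minimal sketch of
the equivalent check with onnxruntime directly might look like this (the session
handling and provider argument are assumptions, not part of the example above):

    import onnxruntime as rt

    # load the model file written by save_data_models and score the same input
    sess = rt.InferenceSession(onnx_model_path,
                               providers=['CPUExecutionProvider'])
    input_name = sess.get_inputs()[0].name  # 'vec' for this model
    output = sess.run(['resultVec'], {input_name: data_np})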
Example #2
    def test_model_normalizer_2(self):
        data = self.spark.createDataFrame([(0, Vectors.dense(1.0, 0.5, -1.0)),
                                           (1, Vectors.dense(2.0, 1.0, 1.0)),
                                           (2, Vectors.dense(4.0, 10.0, 2.0))
                                           ]).toDF("id", "features")
        model = Normalizer(inputCol='features',
                           outputCol='norm_feature',
                           p=2.0)

        model_onnx = convert_sparkml(model, 'Sparkml Normalizer',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().norm_feature.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlNormalizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['norm_feature'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #3
 def test_gbt_classifier(self):
     raw_data = self.spark.createDataFrame(
         [(1.0, Vectors.dense(1.0)),
          (0.0, Vectors.sparse(1, [], []))], ["label", "features"])
     string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
     si_model = string_indexer.fit(raw_data)
     data = si_model.transform(raw_data)
     gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml GBT Classifier',
         [('features', FloatTensorType([None, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(
             lambda x: pandas.Series(x.toArray())).values.astype(
                 numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlGBTClassifier")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                            data_np, onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #4
    def test_stop_words_remover(self):
        data = self.spark.createDataFrame([(["a", "b", "c"], )], ["text"])
        model = StopWordsRemover(inputCol="text",
                                 outputCol="words",
                                 stopWords=["b"])

        feature_count = len(data.columns)
        model_onnx = convert_sparkml(
            model, 'Sparkml StopWordsRemover',
            [('text', StringTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().words.values
        data_np = data.toPandas().text.values
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlStopWordsRemover")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['words'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #5
 def test_model_logistic_regression_binary_class(self):
     import inspect
     import os
     this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
     input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
     original_data = self.spark.read.format("libsvm").load(input_path)
     #
     # truncate the features
     #
     self.spark.udf.register("truncateFeatures", lambda x: SparseVector(5, range(0,5), x.toArray()[125:130]),
                             VectorUDT())
     data = original_data.selectExpr("label", "truncateFeatures(features) as features")
     lr = LogisticRegression(maxIter=100, tol=0.0001)
     model = lr.fit(data)
     # the name of the input for Logistic Regression is 'features'
     model_onnx = convert_sparkml(model, 'sparkml logistic regression', [('features', FloatTensorType([1, model.numFeatures]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     import pandas
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     ]
     dump_data_and_sparkml_model(data_np, expected, model, model_onnx,
                                 basename="SparkmlLogisticRegression")
Example #6
 def test_model_vector_assembler(self):
     col_names = ["a", "b", "c"]
     model = VectorAssembler(inputCols=col_names, outputCol='features')
     data = self.spark.createDataFrame([(1., 0., 3.)], col_names)
     model_onnx = convert_sparkml(model, 'Sparkml VectorAssembler',
                                  [('a', FloatTensorType([None, 1])),
                                   ('b', FloatTensorType([None, 1])),
                                   ('c', FloatTensorType([None, 1]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     predicted = model.transform(data)
     expected = predicted.select("features").toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values
     data_np = {
         'a': data.select('a').toPandas().values.astype(numpy.float32),
         'b': data.select('b').toPandas().values.astype(numpy.float32),
         'c': data.select('c').toPandas().values.astype(numpy.float32)
     }
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlVectorAssembler")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['features'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #7
    def test_model_onehot_encoder(self):
        encoder = OneHotEncoderEstimator(inputCols=['index'],
                                         outputCols=['indexVec'])
        data = self.spark.createDataFrame([(0.0, ), (1.0, ), (2.0, ), (2.0, ),
                                           (0.0, ), (2.0, )], ['index'])
        model = encoder.fit(data)
        model_onnx = convert_sparkml(model, 'Sparkml OneHotEncoder',
                                     [('index', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.select("index").toPandas().values.astype(numpy.float32)
        predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
            lambda x: x.toArray().tolist()).values
        expected = numpy.asarray(
            [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])

        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlOneHotEncoder")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexVec'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #8
    def test_model_polynomial_expansion(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([1.2, 3.2, 1.3, -5.6]), ),
             (Vectors.dense([4.3, -3.2, 5.7, 1.0]), ),
             (Vectors.dense([0, 3.2, 4.7, -8.9]), )], ["dense"])
        model = PolynomialExpansion(degree=2,
                                    inputCol="dense",
                                    outputCol="expanded")

        # the input name should match PolynomialExpansion.inputCol
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml PolynomialExpansion',
            [('dense', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().expanded.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().dense.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPolynomialExpansion")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['expanded'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #9
    def test_vector_slicer(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
             (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
             (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
        model = VectorSlicer(inputCol="features",
                             outputCol="sliced",
                             indices=[1, 4])

        feature_count = data.first()[0].array.size
        model_onnx = convert_sparkml(
            model, 'Sparkml VectorSlicer',
            [('features', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().sliced.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlVectorSlicer")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['sliced'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #10
 def test_model_vector_indexer_single(self):
     vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
     data = self.spark.createDataFrame([(Vectors.dense([-1.0]), ),
                                        (Vectors.dense([0.0]), ),
                                        (Vectors.dense([0.0]), )], ["a"])
     model = vi.fit(data)
     model_onnx = convert_sparkml(
         model,
         'Sparkml VectorIndexer Single',
         [('a', FloatTensorType([None, model.numFeatures]))],
         target_opset=9)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     expected = predicted.toPandas().indexed.apply(
         lambda x: pandas.Series(x.toArray())).values
     data_np = data.toPandas().a.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlVectorIndexerSingle")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['indexed'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #11
    def test_chi_sq_selector(self):
        data = self.spark.createDataFrame(
            [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
             (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
             (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
            ["features", "label"])
        selector = ChiSqSelector(numTopFeatures=1,
                                 outputCol="selectedFeatures")
        model = selector.fit(data)

        # the input name should match ChiSqSelector's featuresCol (default "features")
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ChiSqSelector',
            [('features', FloatTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().selectedFeatures.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlChiSqSelector")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #12
 def test_tree_one_class_classification(self):
     features = [[0., 1.], [1., 1.], [2., 0.]]
     features = numpy.array(features, dtype=numpy.float32)
     labels = [1, 1, 1]
     dd = [(labels[i], Vectors.dense(features[i]))
           for i in range(len(labels))]
     data = self.spark.createDataFrame(
         self.spark.sparkContext.parallelize(dd),
         schema=["label", "features"])
     dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
     model = dt.fit(data)
     model_onnx = convert_sparkml(
         model,
         'Sparkml Decision Tree One Class',
         [('features', FloatTensorType([None, 2]))],
         spark_session=self.spark)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     predicted = model.transform(data)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(
             lambda x: pandas.Series(x.toArray())).values.astype(
                 numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlDecisionTreeBinaryClass")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                            data_np, onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #13
    def test_index_to_string(self):
        original_data = self.spark.createDataFrame([(0, "a"), (1, "b"),
                                                    (2, "c"), (3, "a"),
                                                    (4, "a"), (5, "c")],
                                                   ["id", "category"])
        string_indexer = StringIndexer(inputCol="category",
                                       outputCol="categoryIndex")
        string_indexer_model = string_indexer.fit(original_data)
        data = string_indexer_model.transform(original_data)

        model = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory",
                              labels=['A', 'B', 'C'])
        # the input name should match IndexToString.inputCol
        model_onnx = convert_sparkml(
            model, 'Sparkml IndexToString',
            [('categoryIndex', Int64TensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        expected = predicted.select("originalCategory").toPandas().values
        data_np = data.select('categoryIndex').toPandas().values.astype(
            numpy.int64)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlIndexToString")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['originalCategory'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #14
 def test_model_string_indexer(self):
     indexer = StringIndexer(inputCol='cat1',
                             outputCol='cat1_index',
                             handleInvalid='skip')
     data = self.spark.createDataFrame([("a", ), ("b", ), ("c", ), ("a", ),
                                        ("a", ), ("c", )], ['cat1'])
     model = indexer.fit(data)
     # the input name should match StringIndexer.inputCol
     model_onnx = convert_sparkml(model, 'Sparkml StringIndexer',
                                  [('cat1', StringTensorType([1, 1]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     predicted = model.transform(data)
     expected = predicted.select("cat1_index").toPandas().values
     data_np = data.select('cat1').toPandas().values
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlStringIndexer")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['cat1_index'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #15
    def test_model_generalized_linear_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_linear_regression_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)

        lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
        model = lr.fit(data)
        # the name of the input is 'features'
        C = model.numFeatures
        model_onnx = convert_sparkml(
            model, 'sparkml GeneralizedLinearRegression',
            [('features', FloatTensorType([None, C]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlGeneralizedLinearRegression")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['prediction'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #16
 def test_model_linear_regression_basic(self):
     data = self.spark.createDataFrame(
         [(1.0, 2.0, Vectors.dense(1.0)),
          (0.0, 2.0, Vectors.sparse(1, [], []))],
         ["label", "weight", "features"])
     lr = LinearRegression(maxIter=5,
                           regParam=0.0,
                           solver="normal",
                           weightCol="weight")
     model = lr.fit(data)
     # the name of the input is 'features'
     C = model.numFeatures
     model_onnx = convert_sparkml(
         model, 'sparkml LinearRegressorBasic',
         [('features', FloatTensorType([None, C]))])
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlLinearRegressor_Basic")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #17
    def test_one_vs_rest(self):
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_multiclass_classification_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)
        lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(data)

        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlOneVsRest")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #18
 def test_decision_tree_regressor(self):
     features = [[0, 1], [1, 1], [2, 0]]
     features = numpy.array(features, dtype=numpy.float32)
     labels = [100, -10, 50]
     dd = [(labels[i], Vectors.dense(features[i]))
           for i in range(len(labels))]
     data = self.spark.createDataFrame(
         self.spark.sparkContext.parallelize(dd),
         schema=["label", "features"])
     dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
     model = dt.fit(data)
     feature_count = data.select('features').first()[0].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml Decision Tree Regressor',
         [('features', FloatTensorType([None, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     predicted = model.transform(data)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32)
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlDecisionTreeRegressor")
     onnx_model_path = paths[-1]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #19
 def test_gbt_regressor(self):
     data = self.spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                                        (0.0, Vectors.sparse(1, [], []))],
                                       ["label", "features"])
     gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
     model = gbt.fit(data)
     feature_count = data.first()[1].size
     model_onnx = convert_sparkml(
         model,
         'Sparkml GBTRegressor',
         [('features', FloatTensorType([1, feature_count]))],
         spark_session=self.spark)
     self.assertTrue(model_onnx is not None)
     # run the model
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(
         lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
     ]
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlGBTRegressor")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['prediction'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Example #20
    def test_bucketizer(self):
        values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
        data = self.spark.createDataFrame(values, ["features"])
        model = Bucketizer(splits=[-float("inf"), 0.5, 1.4,
                                   float("inf")],
                           inputCol="features",
                           outputCol="buckets")

        feature_count = len(data.select('features').first())
        model_onnx = convert_sparkml(
            model, 'Sparkml Bucketizer',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.setHandleInvalid("error").transform(data)
        expected = predicted.select("buckets").toPandas().values.astype(
            numpy.float32)
        data_np = [data.toPandas().values.astype(numpy.float32)]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBucketizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['buckets'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #21
    def test_model_pipeline_2_stage(self):
        import inspect
        import os
        import numpy
        import pandas
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(
            *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(
                StringIndexer(inputCol=col,
                              outputCol=col + '_index',
                              handleInvalid='skip'))
            stages.append(
                OneHotEncoderEstimator(inputCols=[col + '_index'],
                                       outputCols=[col + '_vec']))

        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(
            model, 'Sparkml Pipeline',
            [('workclass', StringTensorType([1, 1])),
             ('education', StringTensorType([1, 1])),
             ('marital_status', StringTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status':
            test_data.select('marital_status').toPandas().values
        }
        predicted_np = [
            predicted.toPandas().workclass_vec.apply(
                lambda x: pandas.Series(x.toArray())).values,
            predicted.toPandas().education_vec.apply(
                lambda x: pandas.Series(x.toArray())).values,
            predicted.toPandas().marital_status_vec.apply(
                lambda x: pandas.Series(x.toArray())).values
        ]
        expected = [
            numpy.asarray([expand_one_hot_vec(x) for x in row])
            for row in predicted_np
        ]
        dump_data_and_sparkml_model(data_np,
                                    expected,
                                    model,
                                    model_onnx,
                                    basename="SparkmlPipeline_2Stage")
Example #22
    def test_standard_scaler(self):
        data = self.spark.createDataFrame([(
            0,
            Vectors.dense([1.0, 0.1, -1.0]),
        ), (
            1,
            Vectors.dense([2.0, 1.1, 1.0]),
        ), (
            2,
            Vectors.dense([3.0, 10.1, 3.0]),
        )], ["id", "features"])
        scaler = StandardScaler(inputCol='features',
                                outputCol='scaled_features')
        model = scaler.fit(data)

        # the input names must match the inputCol(s) above
        model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                     [('features', FloatTensorType([1, 3]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().scaled_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlStandardScaler")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #23
    def test_model_binarizer(self):
        import numpy
        data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                          ["id", "feature"])
        model = Binarizer(inputCol='feature', outputCol='binarized')

        # the input name should match Binarizer.inputCol
        model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                     [('feature', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("binarized").toPandas().values.astype(
            numpy.float32)
        data_np = data.select('feature').toPandas().values.astype(
            numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBinarizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['binarized'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #24
    def test_element_wise_product(self):
        data = self.spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]), )],
                                          ["features"])
        model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                                   inputCol="features",
                                   outputCol="eprod")
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ElementwiseProduct',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().eprod.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        ]
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlElementwiseProduct")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['eprod'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #25
    def _imputer_test_single(self):
        data = self.spark.createDataFrame([(1.0, float("nan")),
                                           (2.0, float("nan")),
                                           (float("nan"), 3.0), (4.0, 4.0),
                                           (5.0, 5.0)], ["a", "b"])
        imputer = Imputer(inputCols=["a"], outputCols=["out_a"])
        model = imputer.fit(data)

        # the input name should match the inputCols above
        model_onnx = convert_sparkml(model, 'Sparkml Imputer',
                                     [('a', FloatTensorType([None, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("out_a").toPandas().values.astype(
            numpy.float32)
        data_np = data.toPandas().a.values.astype(numpy.float32)
        data_np = data_np.reshape((-1, 1))
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlImputerSingle")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['out_a'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #26
def save_sparkml(
        model, path, initial_types=None, prototype=None,
        shape=None, dtype=None, spark_session=None):
    """
    Convert a spark model to onnx first and then save it to disk using `save_onnx`.
    We use onnxmltool to do the conversion from spark to ONNX and currently not all the
    spark models are supported by onnxmltools. A list of supported models can be found
    in the documentation.
    :param model: PySpark model object
    :param path: Path to which the object will be serialized
    :param initial_types: a python list. Each element is a tuple of a variable name and a type
        defined in onnxconverter_common.data_types. If initial type is empty, we'll guess the
        required information from prototype or infer it by using shape and dtype.
    :param prototype: A numpy array that gives shape and type information. This is ignored if
        initial_types is not None
    :param shape: Shape of the input to the model. Ignored if initial_types or prototype is not None
    :param dtype: redisai.DType object which represents the type of the input to the model.
        Ignored if initial_types or prototype is not None
    """
    if not utils.is_installed(['onnxmltools', 'pyspark']):
        raise RuntimeError('Please install onnxmltools & pyspark to use this feature.')
    from onnxmltools import convert_sparkml
    if initial_types is None:
        initial_types = [utils.guess_onnx_tensortype(prototype, shape, dtype)]
    if not isinstance(initial_types, list):
        raise TypeError((
            "`initial_types` has to be a list. "
            "If you have only one initial_type, put that into a list"))
    # TODO: test issue with passing different datatype for numerical values
    # known issue: https://github.com/onnx/onnxmltools/tree/master/onnxmltools/convert/sparkml
    serialized = convert_sparkml(model, initial_types=initial_types, spark_session=spark_session)
    save_onnx(serialized, path)
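A hedged usage sketch for save_sparkml; the transformer, output file name, and
SparkSession variable below are illustrative, not taken from the source:

    from pyspark.ml.feature import Binarizer
    from onnxconverter_common.data_types import FloatTensorType

    # Binarizer is a pure transformer, so no fit() is needed (see Example #23)
    binarizer = Binarizer(threshold=0.5, inputCol='feature',
                          outputCol='binarized')
    save_sparkml(binarizer, 'binarizer.onnx',
                 initial_types=[('feature', FloatTensorType([None, 1]))],
                 spark_session=spark)  # `spark`: an active SparkSession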
Example #27
    def test_model_pca(self):
        data = self.spark.createDataFrame(
            [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
             (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
             (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )], ["features"])
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        model = pca.fit(data)

        # the input name should match PCA.inputCol
        feature_count = data.first()[0].size
        N = data.count()
        model_onnx = convert_sparkml(
            model, 'Sparkml PCA',
            [('features', FloatTensorType([N, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().pca_features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlPCA")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #28
    def test_random_forest_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register(
            "truncateFeatures",
            lambda x: SparseVector(feature_count, range(0, feature_count),
                                   x.toArray()[125:130]), VectorUDT())
        data = original_data.selectExpr(
            "cast(label as string) as label",
            "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label",
                                      outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=10,
                                        handleInvalid='error')

        rf = RandomForestRegressor(labelCol="indexedLabel",
                                   featuresCol="indexedFeatures",
                                   numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(
            model,
            'Sparkml RandomForest Regressor',
            [('label', StringTensorType([1, 1])),
             ('features', FloatTensorType([1, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        data_np = {
            'label':
            data.limit(1).toPandas().label.values,
            'features':
            data.limit(1).toPandas().features.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlRandomForestRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                               data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #29
    def test_model_pipeline_3_stage(self):
        import inspect
        import os
        import numpy
        import pandas
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(
            *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(
                StringIndexer(inputCol=col,
                              outputCol=col + '_index',
                              handleInvalid='skip'))
            # dropLast=False is needed here: otherwise, once the vectors are
            # assembled together (below), the features cannot be expanded back
            # without difficulty
            stages.append(
                OneHotEncoderEstimator(inputCols=[col + '_index'],
                                       outputCols=[col + '_vec'],
                                       dropLast=False))

        stages.append(
            VectorAssembler(inputCols=[c + '_vec' for c in cols],
                            outputCol='features'))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(
            model, 'Sparkml Pipeline',
            [('workclass', StringTensorType([1, 1])),
             ('education', StringTensorType([1, 1])),
             ('marital_status', StringTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status':
            test_data.select('marital_status').toPandas().values
        }
        predicted_np = predicted.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values
        dump_data_and_sparkml_model(data_np,
                                    predicted_np,
                                    model,
                                    model_onnx,
                                    basename="SparkmlPipeline_3Stage")
Example #30
def log_model(spark, model, name, model_name, data_df):
    import mlflow.onnx
    import onnx
    import onnxmltools
    from onnxmltools.convert.common.data_types import FloatTensorType
    from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple
    initial_types = buildInitialTypesSimple(data_df)
    onnx_model = onnxmltools.convert_sparkml(model, name, initial_types, spark_session=spark)
    mlflow.onnx.log_model(
        onnx_model, "onnx-model",
        registered_model_name=None if not model_name else f"{model_name}_onnx")
    mlflow.set_tag("version.onnx", onnx.__version__)
    mlflow.set_tag("version.onnxmltools", onnxmltools.__version__)