def test_dct(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([5.0, 8.0, 6.0]), )], ["vec"])
    model = DCT(inverse=False, inputCol="vec", outputCol="resultVec")
    # the input name must match the DCT inputCol above
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(
        model, 'Sparkml DCT',
        [('vec', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().resultVec.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().vec.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDCT")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['resultVec'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_normalizer_2(self):
    data = self.spark.createDataFrame(
        [(0, Vectors.dense(1.0, 0.5, -1.0)),
         (1, Vectors.dense(2.0, 1.0, 1.0)),
         (2, Vectors.dense(4.0, 10.0, 2.0))]).toDF("id", "features")
    model = Normalizer(inputCol='features', outputCol='norm_feature', p=2.0)
    model_onnx = convert_sparkml(model, 'Sparkml Normalizer',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().norm_feature.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlNormalizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['norm_feature'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_gbt_classifier(self):
    raw_data = self.spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(raw_data)
    data = si_model.transform(raw_data)
    gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml GBT Classifier',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTClassifier")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_stop_words_remover(self):
    data = self.spark.createDataFrame([(["a", "b", "c"], )], ["text"])
    model = StopWordsRemover(inputCol="text", outputCol="words",
                             stopWords=["b"])
    feature_count = len(data.columns)
    model_onnx = convert_sparkml(
        model, 'Sparkml StopWordsRemover',
        [('text', StringTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().words.values
    data_np = data.toPandas().text.values
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStopWordsRemover")
    onnx_model_path = paths[3]
    # the requested output must be the outputCol ('words'), not 'prediction'
    output, output_shapes = run_onnx_model(['words'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_logistic_regression_binary_class(self):
    import inspect
    import os
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(5, range(0, 5), x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "label", "truncateFeatures(features) as features")
    lr = LogisticRegression(maxIter=100, tol=0.0001)
    model = lr.fit(data)
    # the name of the input for Logistic Regression is 'features'
    model_onnx = convert_sparkml(
        model, 'sparkml logistic regression',
        [('features', FloatTensorType([1, model.numFeatures]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    import pandas
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    dump_data_and_sparkml_model(data_np, expected, model, model_onnx,
                                basename="SparkmlLogisticRegression")
def test_model_vector_assembler(self):
    col_names = ["a", "b", "c"]
    model = VectorAssembler(inputCols=col_names, outputCol='features')
    data = self.spark.createDataFrame([(1., 0., 3.)], col_names)
    model_onnx = convert_sparkml(model, 'Sparkml VectorAssembler',
                                 [('a', FloatTensorType([None, 1])),
                                  ('b', FloatTensorType([None, 1])),
                                  ('c', FloatTensorType([None, 1]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("features").toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values
    data_np = {
        'a': data.select('a').toPandas().values.astype(numpy.float32),
        'b': data.select('b').toPandas().values.astype(numpy.float32),
        'c': data.select('c').toPandas().values.astype(numpy.float32)
    }
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorAssembler")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_onehot_encoder(self):
    encoder = OneHotEncoderEstimator(inputCols=['index'],
                                     outputCols=['indexVec'])
    data = self.spark.createDataFrame(
        [(0.0, ), (1.0, ), (2.0, ), (2.0, ), (0.0, ), (2.0, )], ['index'])
    model = encoder.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml OneHotEncoder',
                                 [('index', FloatTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.select("index").toPandas().values.astype(numpy.float32)
    predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
        lambda x: x.toArray().tolist()).values
    # Spark drops the last category by default (dropLast=True), while the
    # ONNX encoder keeps all categories; re-append the dropped slot so the
    # two outputs can be compared.
    expected = numpy.asarray(
        [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneHotEncoder")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexVec'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_polynomial_expansion(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([1.2, 3.2, 1.3, -5.6]), ),
         (Vectors.dense([4.3, -3.2, 5.7, 1.0]), ),
         (Vectors.dense([0, 3.2, 4.7, -8.9]), )], ["dense"])
    model = PolynomialExpansion(degree=2, inputCol="dense",
                                outputCol="expanded")
    # the input name must match the PolynomialExpansion inputCol above
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml PolynomialExpansion',
        [('dense', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().expanded.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().dense.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPolynomialExpansion")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['expanded'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_vector_slicer(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
         (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
         (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
    model = VectorSlicer(inputCol="features", outputCol="sliced",
                         indices=[1, 4])
    feature_count = data.first()[0].array.size
    model_onnx = convert_sparkml(
        model, 'Sparkml VectorSlicer',
        [('features', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().sliced.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorSlicer")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['sliced'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_vector_indexer_single(self):
    vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
    data = self.spark.createDataFrame([(Vectors.dense([-1.0]), ),
                                       (Vectors.dense([0.0]), ),
                                       (Vectors.dense([0.0]), )], ["a"])
    model = vi.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml VectorIndexer Single',
        [('a', FloatTensorType([None, model.numFeatures]))],
        target_opset=9)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().indexed.apply(
        lambda x: pandas.Series(x.toArray())).values
    data_np = data.toPandas().a.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorIndexerSingle")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['indexed'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_chi_sq_selector(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
         (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
         (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
        ["features", "label"])
    selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
    model = selector.fit(data)
    # the input name must match the ChiSqSelector featuresCol
    # (default 'features')
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ChiSqSelector',
        [('features', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().selectedFeatures.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlChiSqSelector")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_tree_one_class_classification(self):
    features = [[0., 1.], [1., 1.], [2., 0.]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [1, 1, 1]
    dd = [(labels[i], Vectors.dense(features[i]))
          for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd),
        schema=["label", "features"])
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml Decision Tree One Class',
        [('features', FloatTensorType([None, 2]))],
        spark_session=self.spark)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeBinaryClass")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_index_to_string(self):
    original_data = self.spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])
    string_indexer = StringIndexer(inputCol="category",
                                   outputCol="categoryIndex")
    string_indexer_model = string_indexer.fit(original_data)
    data = string_indexer_model.transform(original_data)
    model = IndexToString(inputCol="categoryIndex",
                          outputCol="originalCategory",
                          labels=['A', 'B', 'C'])
    # the input name must match the IndexToString inputCol above
    model_onnx = convert_sparkml(
        model, 'Sparkml IndexToString',
        [('categoryIndex', Int64TensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("originalCategory").toPandas().values
    data_np = data.select('categoryIndex').toPandas().values.astype(
        numpy.int64)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlIndexToString")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['originalCategory'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_string_indexer(self):
    indexer = StringIndexer(inputCol='cat1', outputCol='cat1_index',
                            handleInvalid='skip')
    data = self.spark.createDataFrame(
        [("a", ), ("b", ), ("c", ), ("a", ), ("a", ), ("c", )], ['cat1'])
    model = indexer.fit(data)
    # the input name must match the StringIndexer inputCol above
    model_onnx = convert_sparkml(model, 'Sparkml StringIndexer',
                                 [('cat1', StringTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("cat1_index").toPandas().values
    data_np = data.select('cat1').toPandas().values
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStringIndexer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['cat1_index'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_generalized_linear_regression(self):
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_linear_regression_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)
    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    model = lr.fit(data)
    # the name of the input is 'features'
    C = model.numFeatures
    model_onnx = convert_sparkml(
        model, 'sparkml GeneralizedLinearRegression',
        [('features', FloatTensorType([None, C]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGeneralizedLinearRegression")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_linear_regression_basic(self):
    data = self.spark.createDataFrame(
        [(1.0, 2.0, Vectors.dense(1.0)),
         (0.0, 2.0, Vectors.sparse(1, [], []))],
        ["label", "weight", "features"])
    lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal",
                          weightCol="weight")
    model = lr.fit(data)
    # the name of the input is 'features'
    C = model.numFeatures
    model_onnx = convert_sparkml(
        model, 'sparkml LinearRegressorBasic',
        [('features', FloatTensorType([None, C]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlLinearRegressor_Basic")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_one_vs_rest(self):
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_multiclass_classification_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)
    lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml OneVsRest',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneVsRest")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_decision_tree_regressor(self):
    features = [[0, 1], [1, 1], [2, 0]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [100, -10, 50]
    dd = [(labels[i], Vectors.dense(features[i]))
          for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd),
        schema=["label", "features"])
    dt = DecisionTreeRegressor(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    feature_count = data.select('features').first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml Decision Tree Regressor',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeRegressor")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_gbt_regressor(self):
    data = self.spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml GBTRegressor',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_bucketizer(self):
    values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
    data = self.spark.createDataFrame(values, ["features"])
    model = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
                       inputCol="features", outputCol="buckets")
    feature_count = len(data.select('features').first())
    model_onnx = convert_sparkml(
        model, 'Sparkml Bucketizer',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.setHandleInvalid("error").transform(data)
    expected = predicted.select("buckets").toPandas().values.astype(
        numpy.float32)
    data_np = [data.toPandas().values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBucketizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['buckets'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_pipeline_2_stage(self):
    import inspect
    import os
    import numpy
    import pandas
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv') \
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(
        *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)
    stages = []
    for col in cols:
        stages.append(
            StringIndexer(inputCol=col, outputCol=col + '_index',
                          handleInvalid='skip'))
        stages.append(
            OneHotEncoderEstimator(inputCols=[col + '_index'],
                                   outputCols=[col + '_vec']))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(
        model, 'Sparkml Pipeline',
        [('workclass', StringTensorType([1, 1])),
         ('education', StringTensorType([1, 1])),
         ('marital_status', StringTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    predicted_np = [
        predicted.toPandas().workclass_vec.apply(
            lambda x: pandas.Series(x.toArray())).values,
        predicted.toPandas().education_vec.apply(
            lambda x: pandas.Series(x.toArray())).values,
        predicted.toPandas().marital_status_vec.apply(
            lambda x: pandas.Series(x.toArray())).values
    ]
    expected = [
        numpy.asarray([expand_one_hot_vec(x) for x in row])
        for row in predicted_np
    ]
    dump_data_and_sparkml_model(data_np, expected, model, model_onnx,
                                basename="SparkmlPipeline_2Stage")
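# `expand_one_hot_vec` is used by the pipeline test above but is not defined
# in this section. A minimal sketch of what such a helper might look like,
# assuming it mirrors the dropLast expansion done inline in
# test_model_onehot_encoder (re-appending the category Spark's encoder drops):
import numpy


def expand_one_hot_vec(x):
    # x is one Spark-encoded one-hot row (dropLast=True); append the dropped
    # slot: 0 when one of the kept slots is hot, 1 when the dropped one was
    x = list(x)
    return x + [0] if numpy.amax(x) == 1 else x + [1]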
def test_standard_scaler(self):
    data = self.spark.createDataFrame(
        [(0, Vectors.dense([1.0, 0.1, -1.0])),
         (1, Vectors.dense([2.0, 1.1, 1.0])),
         (2, Vectors.dense([3.0, 10.1, 3.0]))], ["id", "features"])
    scaler = StandardScaler(inputCol='features',
                            outputCol='scaled_features')
    model = scaler.fit(data)
    # the input names must match the inputCol(s) above
    model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStandardScaler")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_binarizer(self):
    import numpy
    data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                      ["id", "feature"])
    model = Binarizer(inputCol='feature', outputCol='binarized')
    # the input name must match the Binarizer inputCol above
    model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                 [('feature', FloatTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("binarized").toPandas().values.astype(
        numpy.float32)
    data_np = data.select('feature').toPandas().values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBinarizer")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['binarized'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_element_wise_product(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([2.0, 1.0, 3.0]), )], ["features"])
    model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                               inputCol="features", outputCol="eprod")
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ElementwiseProduct',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().eprod.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlElementwiseProduct")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['eprod'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def _imputer_test_single(self):
    data = self.spark.createDataFrame(
        [(1.0, float("nan")), (2.0, float("nan")), (float("nan"), 3.0),
         (4.0, 4.0), (5.0, 5.0)], ["a", "b"])
    imputer = Imputer(inputCols=["a"], outputCols=["out_a"])
    model = imputer.fit(data)
    # the input name should match the inputCols above
    model_onnx = convert_sparkml(model, 'Sparkml Imputer',
                                 [('a', FloatTensorType([None, 1]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("out_a").toPandas().values.astype(
        numpy.float32)
    data_np = data.toPandas().a.values.astype(numpy.float32)
    data_np = data_np.reshape((-1, 1))
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlImputerSingle")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['out_a'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def save_sparkml(model, path, initial_types=None, prototype=None,
                 shape=None, dtype=None, spark_session=None):
    """
    Convert a Spark ML model to ONNX and then save it to disk using
    `save_onnx`. The Spark-to-ONNX conversion is done with onnxmltools,
    which currently does not support every Spark ML model; a list of
    supported models can be found in its documentation.

    :param model: PySpark model object
    :param path: Path to which the object will be serialized
    :param initial_types: a python list. Each element is a tuple of a
        variable name and a type defined in onnxconverter_common.data_types.
        If initial_types is None, the required information is guessed from
        prototype or inferred from shape and dtype.
    :param prototype: A numpy array that gives shape and type information.
        Ignored if initial_types is not None
    :param shape: Shape of the input to the model. Ignored if initial_types
        or prototype is not None
    :param dtype: redisai.DType object which represents the type of the
        input to the model. Ignored if initial_types or prototype is not None
    :param spark_session: SparkSession passed through to the converter
    """
    if not utils.is_installed(['onnxmltools', 'pyspark']):
        raise RuntimeError(
            'Please install onnxmltools & pyspark to use this feature.')
    from onnxmltools import convert_sparkml
    if initial_types is None:
        initial_types = [utils.guess_onnx_tensortype(prototype, shape, dtype)]
    if not isinstance(initial_types, list):
        raise TypeError(
            "`initial_types` has to be a list. "
            "If you have only one initial_type, put that into a list")
    # TODO: test issue with passing different datatype for numerical values
    # known issue: https://github.com/onnx/onnxmltools/tree/master/onnxmltools/convert/sparkml
    serialized = convert_sparkml(model, initial_types=initial_types,
                                 spark_session=spark_session)
    save_onnx(serialized, path)
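# A minimal usage sketch for save_sparkml. The argument names below are
# illustrative assumptions, not part of this module: `fitted_model` is a
# fitted PySpark model taking a 4-column 'features' input and `spark` is the
# active SparkSession.
def _example_save_sparkml(fitted_model, spark):
    from onnxmltools.convert.common.data_types import FloatTensorType

    # describe the model input explicitly instead of relying on inference
    save_sparkml(
        fitted_model,
        "model.onnx",
        initial_types=[("features", FloatTensorType([None, 4]))],
        spark_session=spark)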
def test_model_pca(self):
    data = self.spark.createDataFrame(
        [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
         (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
         (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )], ["features"])
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(data)
    # the input name must match the PCA inputCol above
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(
        model, 'Sparkml PCA',
        [('features', FloatTensorType([N, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().pca_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPCA")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_random_forest_regression(self):
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    #
    # truncate the features
    #
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(0, feature_count),
                               x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label",
                                  outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=10,
                                    handleInvalid='error')
    rf = RandomForestRegressor(labelCol="indexedLabel",
                               featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml RandomForest Regressor',
        [('label', StringTensorType([1, 1])),
         ('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data.limit(1))
    data_np = {
        'label': data.limit(1).toPandas().label.values,
        'features': data.limit(1).toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_pipeline_3_stage(self):
    import inspect
    import os
    import numpy
    import pandas
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "AdultCensusIncomeOriginal.csv")
    full_data = self.spark.read.format('csv') \
        .options(header='true', inferschema='true').load(input_path)
    cols = ['workclass', 'education', 'marital_status']
    training_data, test_data = full_data.select(
        *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)
    stages = []
    for col in cols:
        stages.append(
            StringIndexer(inputCol=col, outputCol=col + '_index',
                          handleInvalid='skip'))
        # dropLast=False is needed here; otherwise, once the one-hot vectors
        # are assembled together (below), the features cannot be expanded
        # easily
        stages.append(
            OneHotEncoderEstimator(inputCols=[col + '_index'],
                                   outputCols=[col + '_vec'],
                                   dropLast=False))
    stages.append(
        VectorAssembler(inputCols=[c + '_vec' for c in cols],
                        outputCol='features'))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(training_data)
    model_onnx = convert_sparkml(
        model, 'Sparkml Pipeline',
        [('workclass', StringTensorType([1, 1])),
         ('education', StringTensorType([1, 1])),
         ('marital_status', StringTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(test_data)
    data_np = {
        'workclass': test_data.select('workclass').toPandas().values,
        'education': test_data.select('education').toPandas().values,
        'marital_status': test_data.select('marital_status').toPandas().values
    }
    predicted_np = predicted.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values
    dump_data_and_sparkml_model(data_np, predicted_np, model, model_onnx,
                                basename="SparkmlPipeline_3Stage")
def log_model(spark, model, name, model_name, data_df):
    import mlflow.onnx
    import onnx
    import onnxmltools
    from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple

    # derive the ONNX input types from the training DataFrame's columns
    initial_types = buildInitialTypesSimple(data_df)
    onnx_model = onnxmltools.convert_sparkml(model, name, initial_types,
                                             spark_session=spark)
    mlflow.onnx.log_model(
        onnx_model, "onnx-model",
        registered_model_name=None if not model_name
        else f"{model_name}_onnx")
    mlflow.set_tag("version.onnx", onnx.__version__)
    mlflow.set_tag("version.onnxmltools", onnxmltools.__version__)
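# A minimal usage sketch for log_model above. The names are illustrative
# assumptions, not defined in this snippet: `spark` is the SparkSession,
# `pipeline_model` is a fitted Spark ML model, and `train_df` is the
# DataFrame whose columns determine the ONNX input types.
def _example_log_model(spark, pipeline_model, train_df):
    import mlflow

    # open a run so the ONNX model and version tags are logged together
    with mlflow.start_run():
        log_model(spark, pipeline_model, "Sparkml Pipeline",
                  "census_income", train_df)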