def _imputer_test_multi(self):
    """Convert a two-column Imputer to ONNX and compare it with Spark.

    An ``Imputer`` is fitted on columns ``a`` and ``b`` (both contain NaN
    entries), converted with ``convert_sparkml`` and the ONNX outputs
    ``out_a`` / ``out_b`` are compared to Spark's own result to 5 decimals.
    """
    nan = float("nan")
    rows = [(1.0, nan), (2.0, nan), (nan, 3.0), (4.0, 4.0), (5.0, 5.0)]
    data = self.spark.createDataFrame(rows, ["a", "b"])
    model = Imputer(inputCols=["a", "b"],
                    outputCols=["out_a", "out_b"]).fit(data)

    # The ONNX input names have to match the inputCols given above.
    initial_types = [
        ('a', FloatTensorType([None, 1])),
        ('b', FloatTensorType([None, 1])),
    ]
    model_onnx = convert_sparkml(model, 'Sparkml Imputer Multi Input',
                                 initial_types)
    self.assertTrue(model_onnx is not None)

    # Reference values produced by Spark.
    spark_out = model.transform(data).select("out_a", "out_b")
    expected = spark_out.toPandas().values.astype(numpy.float32)

    # Each column becomes its own single-column float32 ONNX input.
    matrix = data.toPandas().values.astype(numpy.float32)
    data_np = {'a': matrix[:, :1], 'b': matrix[:, 1:]}

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlImputerMulti")
    output, _ = run_onnx_model(['out_a', 'out_b'], data_np, paths[-1])
    compare_results(expected, output, decimal=5)
def test_model_vector_assembler(self):
    """Check that a converted VectorAssembler reproduces Spark's output.

    Three scalar columns are assembled into ``features``; the ONNX model is
    fed one float32 array per column and compared to 5 decimals.
    """
    col_names = ["a", "b", "c"]
    model = VectorAssembler(inputCols=col_names, outputCol='features')
    data = self.spark.createDataFrame([(1., 0., 3.)], col_names)

    initial_types = [(name, FloatTensorType([None, 1]))
                     for name in col_names]
    model_onnx = convert_sparkml(model, 'Sparkml VectorAssembler',
                                 initial_types)
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)

    def to_series(vector):
        return pandas.Series(vector.toArray())

    # Reference values produced by Spark.
    assembled = model.transform(data).select("features").toPandas()
    expected = assembled.features.apply(to_series).values

    # One float32 array per input column, keyed by column name.
    data_np = {
        name: data.select(name).toPandas().values.astype(numpy.float32)
        for name in col_names
    }

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorAssembler")
    output, _ = run_onnx_model(['features'], data_np, paths[-1])
    compare_results(expected, output, decimal=5)
def calculate_sparkml_naive_bayes_output_shapes(operator):
    """Assign the output tensor types of a Spark ML naive Bayes operator.

    Output 0 becomes ``FloatTensorType([N, 1])`` and output 1 becomes
    ``FloatTensorType([N, C])``, where ``N`` is the first dimension of the
    operator's first input and ``C`` is ``operator.raw_operator.numClasses``.
    """
    check_input_and_output_numbers(operator, output_count_range=2)
    check_input_and_output_types(
        operator,
        good_input_types=[FloatTensorType],
        good_output_types=[FloatTensorType, FloatTensorType],
    )

    batch_dim = operator.inputs[0].type.shape[0]
    class_count = operator.raw_operator.numClasses

    # Output 0 has one column, output 1 has one column per class.
    for index, width in enumerate((1, class_count)):
        operator.outputs[index].type = FloatTensorType([batch_dim, width])
def test_combine_inputs(self):
    """Convert a two-step scaler pipeline declared with two inputs.

    The pipeline is converted with two ``[1, 1]`` float inputs. The test
    checks that a model is returned and that the last node of its graph
    has exactly one output.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    scaler = StandardScaler()
    scaler.fit([[0., 0.], [0., 0.], [1., 1.], [1., 1.]])
    model = Pipeline([('scaler1', scaler), ('scaler2', scaler)])

    model_onnx = convert_sklearn(model, 'pipeline',
                                 [('input1', FloatTensorType([1, 1])),
                                  ('input2', FloatTensorType([1, 1]))])

    # Make sure a model exists before its graph is inspected; checking for
    # None after dereferencing it would never be able to fail.
    self.assertIsNotNone(model_onnx)
    self.assertEqual(len(model_onnx.graph.node[-1].output), 1)
def test_xgboost_booster_classifier_reg(self):
    """Convert a Booster trained with the ``reg:squarederror`` objective.

    Labels come from a binary classification problem shifted by 0.567, so
    the booster is fitted on real-valued targets.
    """
    x, y = make_classification(n_classes=2, n_features=5, n_samples=100,
                               random_state=42, n_informative=3)
    y = y.astype(np.float32) + 0.567
    x_train, x_test, y_train, _ = train_test_split(
        x, y, test_size=0.5, random_state=42)

    # NOTE(review): 'n_estimators' and 'min_child_samples' look like
    # scikit-learn / LightGBM style names - confirm that train() uses them.
    params = {
        'objective': 'reg:squarederror',
        'n_estimators': 3,
        'min_child_samples': 1,
    }
    model = train(params, DMatrix(x_train, label=y_train))

    initial_types = [('input', FloatTensorType([None, x.shape[1]]))]
    model_onnx = convert_xgboost(model, 'tree-based classifier',
                                 initial_types)
    dump_data_and_model(
        x_test.astype(np.float32),
        model,
        model_onnx,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBBoosterReg")
def test_one_vs_rest(self):
    """Convert a OneVsRest(LogisticRegression) model and compare predictions.

    The model is fitted on the libsvm file
    ``data/sample_multiclass_classification_data.txt`` located next to this
    script, and the ONNX ``prediction`` output is compared with Spark's to
    5 decimals.
    """
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_multiclass_classification_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)

    lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(data)

    feature_count = data.first()[1].size
    # NOTE(review): the batch dimension is declared as 1 although the whole
    # dataset is fed to the model below - confirm the runtime accepts this.
    model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
        ('features', FloatTensorType([1, feature_count]))
    ], spark_session=self.spark)
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneVsRest")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_xgb_classifier_multi_discrete_int_labels(self):
    """Convert an XGBClassifier trained on the integer labels 10, 20, -30.

    The iris targets 0 / 1 / 2 are relabelled in place before training, and
    only the first two feature columns are used.
    """
    iris = load_iris()
    x = iris.data[:, :2]
    y = iris.target
    # Relabel one class at a time, in the same order as the mapping below.
    for old_label, new_label in ((0, 10), (1, 20), (2, -30)):
        y[y == old_label] = new_label

    x_train, x_test, y_train, _ = train_test_split(
        x, y, test_size=0.5, random_state=42)
    xgb = XGBClassifier(n_estimators=3)
    xgb.fit(x_train, y_train)

    initial_types = [('input', FloatTensorType(shape=[None, None]))]
    conv_model = convert_xgboost(xgb, initial_types=initial_types)
    self.assertTrue(conv_model is not None)

    dump_data_and_model(
        x_test.astype("float32"),
        xgb,
        conv_model,
        basename="SklearnXGBClassifierMultiDiscreteIntLabels",
        allow_failure="StrictVersion(onnx.__version__)< StrictVersion('1.3.0')",
    )
def test_model_polynomial_expansion(self):
    """Convert a fitted PCA model and compare its projection with Spark's.

    NOTE: despite the test's name, the model under test is ``PCA(k=2)``
    reading ``features`` and writing ``pca_features``.
    """
    data = self.spark.createDataFrame(
        [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
         (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
         (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )],
        ["features"])
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(data)

    # the input name should match the PCA inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(
        model, 'Sparkml PCA',
        [('features', FloatTensorType([N, feature_count]))])
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().pca_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPCA")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def do_training(self, X, Y, create_onnx=True):
    """Fit an XGBoost classifier on the transformed data, optionally exporting it.

    Args:
        X: raw features; passed through ``self.transform(X, do_fit=True)``
            and then renamed so every column is called by its position.
        Y: training labels handed to ``XGBClassifier.fit``.
        create_onnx: when true, every classifier in ``self.clfs`` is
            converted with ``convert_xgboost`` and the list of converted
            models is passed to ``self.create_onnx('insurance', ...)``.

    Side effects:
        Replaces ``self.clfs`` with a list holding the fitted classifier.
    """
    print('Pre-processing ...')
    X = self.transform(X, do_fit=True)

    # Rename every column to its positional index.
    X = X.rename(
        columns={name: position for position, name in enumerate(X.columns)})

    print('Training with ' + str(len(X.columns)) + ' columns ...')
    self.clfs = []
    clf = xgb.XGBClassifier(n_estimators=1700,
                            nthread=32,
                            max_depth=6,
                            learning_rate=0.024,
                            subsample=0.8,
                            colsample_bytree=0.65)
    clf.fit(X, Y, eval_metric="auc", verbose=True)
    self.clfs.append(clf)

    if create_onnx:
        print('Converting models into ONNX ...')
        feature_count = len(self.pipeline.output_columns)
        onnx_ml_models = [
            convert_xgboost(
                trained,
                initial_types=[
                    ('dense_input', FloatTensorType([None, feature_count]))
                ])
            for trained in self.clfs
        ]
        self.create_onnx('insurance', onnx_ml_models)
def test_xgboost_example_mnist(self):
    """
    Train a simple xgboost model on the ``load_digits`` data and store
    the associated artefacts through ``dump_data_and_model``.
    """
    X, y = load_digits(return_X_y=True)
    # Seed the split so the test trains on the same samples on every run.
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=42)
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    clf = XGBClassifier(objective="multi:softprob", n_jobs=-1)
    clf.fit(X_train, y_train)

    sh = [None, X_train.shape[1]]
    onnx_model = convert_xgboost(
        clf, initial_types=[('input', FloatTensorType(sh))])

    dump_data_and_model(
        X_test.astype(np.float32),
        clf,
        onnx_model,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBoostExample")
def test_xgb_regressor(self):
    """Compare ONNX and XGBoost outputs on ``data_fail_empty.csv``.

    NOTE(review): despite the test's name, the estimator fitted here is an
    ``XGBClassifier`` with a ``binary:logistic`` objective.

    Both the predicted labels and the predicted probabilities of the
    converted model are compared with the original model's.
    """
    this = os.path.dirname(__file__)
    df = pandas.read_csv(os.path.join(this, "data_fail_empty.csv"))
    X, y = df.drop('y', axis=1), df['y']
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    clr = XGBClassifier(max_delta_step=0,
                        tree_method='hist',
                        n_estimators=100,
                        booster='gbtree',
                        objective='binary:logistic',
                        eval_metric='logloss',
                        learning_rate=0.1,
                        gamma=10,
                        max_depth=7,
                        min_child_weight=50,
                        subsample=0.75,
                        colsample_bytree=0.75,
                        random_state=42,
                        verbosity=0)
    clr.fit(X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=40)

    # Take the feature count from the data rather than hard-coding it, so
    # the declared input width always matches the loaded file.
    initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
    onx = convert_xgboost(clr, initial_types=initial_type)

    # Reference values are computed before the frame is turned into float32.
    expected = clr.predict(X_test), clr.predict_proba(X_test)
    sess = InferenceSession(onx.SerializeToString())
    X_test = X_test.values.astype(np.float32)
    got = sess.run(None, {'float_input': X_test})
    assert_almost_equal(expected[1], got[1])
    assert_almost_equal(expected[0], got[0])
def test_convert_svmc_linear_raw_multi(self):
    """Convert a linear-kernel libsvm SVC trained without probabilities.

    The last five iris targets are overwritten with label 3 before
    training, and only the first two feature columns are used.
    """
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[-5:] = 3

    param = svmutil.svm_parameter()
    param.svm_type = SVC
    param.kernel_type = svmutil.LINEAR
    param.eps = 1
    param.probability = 0
    if noprint:
        param.print_func = noprint

    problem = svmutil.svm_problem(y, X.tolist())
    libsvm_model = svmutil.svm_train(problem, param)

    initial_types = [('input', FloatTensorType(shape=['None', 2]))]
    node = convert(libsvm_model, "LibSvmNuSvmcMultiRaw", initial_types)
    self.assertTrue(node is not None)

    # Two samples taken from each of four slices of the data set.
    sample_slices = [X[:2], X[60:62], X[110:112], X[147:149]]
    X2 = numpy.vstack(sample_slices)
    dump_data_and_model(
        X2.astype(numpy.float32),
        SkAPICl(libsvm_model),
        node,
        basename="LibSvmSvmcRaw-Dec3",
        verbose=False,
        allow_failure="StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3')")
def test_model_binarizer(self):
    """Convert a Binarizer and compare its ONNX output with Spark's.

    The transformer reads column ``feature`` and writes ``binarized``;
    results are compared to 5 decimals.
    """
    import numpy

    data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                      ["id", "feature"])
    model = Binarizer(inputCol='feature', outputCol='binarized')

    # the input name should match the Binarizer inputCol
    # NOTE(review): the batch dimension is declared as 1 although three
    # rows are fed to the model below - confirm the runtime accepts this.
    model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                 [('feature', FloatTensorType([1, 1]))])
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    expected = predicted.select("binarized").toPandas().values.astype(
        numpy.float32)
    data_np = data.select('feature').toPandas().values.astype(
        numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBinarizer")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['binarized'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_convert_nusvmc(self):
    """Convert an RBF-kernel libsvm NuSVC trained with probabilities.

    Iris class 2 is merged into class 1, so the problem is binary; only
    the first two feature columns are used.
    """
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[y == 2] = 1

    param = svmutil.svm_parameter()
    param.svm_type = NuSVC
    param.kernel_type = svmutil.RBF
    param.eps = 1
    param.probability = 1
    if noprint:
        param.print_func = noprint

    problem = svmutil.svm_problem(y, X.tolist())
    libsvm_model = svmutil.svm_train(problem, param)

    initial_types = [('input', FloatTensorType(shape=['None', 'None']))]
    node = convert(libsvm_model, "LibSvmNuSvmc", initial_types)
    self.assertTrue(node is not None)

    dump_data_and_model(
        X[:5].astype(numpy.float32),
        SkAPIClProba2(libsvm_model),
        node,
        basename="LibSvmNuSvmc-Dec2",
        allow_failure="StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3')")
def test_convert_svmc_raw(self):
    """Convert an RBF-kernel libsvm SVC trained without probabilities.

    Iris class 2 is merged into class 1, so the problem is binary; only
    the first two feature columns are used.
    """
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[y == 2] = 1

    param = svmutil.svm_parameter()
    param.svm_type = SVC
    param.kernel_type = svmutil.RBF
    param.eps = 1
    param.probability = 0
    if noprint:
        param.print_func = noprint

    problem = svmutil.svm_problem(y, X.tolist())
    libsvm_model = svmutil.svm_train(problem, param)

    # known svm runtime dimension error in ONNX Runtime
    initial_types = [('input', FloatTensorType(shape=['None', 'None']))]
    node = convert(libsvm_model, "LibSvmSvmcRaw", initial_types)
    self.assertTrue(node is not None)

    dump_data_and_model(
        X[:5].astype(numpy.float32),
        SkAPICl(libsvm_model),
        node,
        basename="LibSvmSvmcRaw",
        allow_failure="StrictVersion(onnxruntime.__version__) < StrictVersion('0.5.0')")
def test_standard_scaler(self):
    """Convert a fitted StandardScaler and compare its output with Spark's.

    The scaler reads ``features`` and writes ``scaled_features``; results
    are compared to 5 decimals.
    """
    data = self.spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]), ),
        (1, Vectors.dense([2.0, 1.1, 1.0]), ),
        (2, Vectors.dense([3.0, 10.1, 3.0]), ),
    ], ["id", "features"])
    scaler = StandardScaler(inputCol='features',
                            outputCol='scaled_features')
    model = scaler.fit(data)

    # the input names must match the inputCol(s) above
    # NOTE(review): the batch dimension is declared as 1 although three
    # rows are fed to the model below - confirm the runtime accepts this.
    model_onnx = convert_sparkml(model, 'Sparkml StandardScaler',
                                 [('features', FloatTensorType([1, 3]))])
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlStandardScaler")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_element_wise_product(self):
    """Convert an ElementwiseProduct and compare its output with Spark's.

    The transformer multiplies ``features`` by the scaling vector
    ``[1.0, 2.0, 3.0]`` and writes ``eprod``; results are compared to
    5 decimals.
    """
    data = self.spark.createDataFrame(
        [(Vectors.dense([2.0, 1.0, 3.0]), )], ["features"])
    model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                               inputCol="features",
                               outputCol="eprod")
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ElementwiseProduct',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().eprod.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(
                numpy.float32)
    ]
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlElementwiseProduct")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['eprod'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_dct(self):
    """Convert a forward DCT transformer and compare its output with Spark's.

    The transformer reads ``vec`` and writes ``resultVec``; results are
    compared to 5 decimals.
    """
    data = self.spark.createDataFrame(
        [(Vectors.dense([5.0, 8.0, 6.0]), )], ["vec"])
    model = DCT(inverse=False, inputCol="vec", outputCol="resultVec")

    # the input name should match the DCT inputCol
    feature_count = data.first()[0].size
    N = data.count()
    model_onnx = convert_sparkml(
        model, 'Sparkml DCT',
        [('vec', FloatTensorType([N, feature_count]))])
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().resultVec.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().vec.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDCT")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['resultVec'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_xgboost_classifier_i5450(self):
    """Smoke-test conversion and inference of a shallow multi:softmax model.

    A two-tree, depth-1 ``XGBClassifier`` is fitted on iris, converted and
    run once through ONNX Runtime on a single hand-written sample. The
    booster is then dumped to ``dump.raw.txt`` and the model is checked
    through ``dump_data_and_model``.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=10)
    clr = XGBClassifier(objective="multi:softmax",
                        max_depth=1,
                        n_estimators=2)
    clr.fit(X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=40)

    initial_type = [('float_input', FloatTensorType([None, 4]))]
    onx = convert_xgboost(clr, initial_types=initial_type)

    sess = InferenceSession(onx.SerializeToString())
    input_name = sess.get_inputs()[0].name
    second_output_name = sess.get_outputs()[1].name
    predict_array = np.array([1., 20., 466., 0.]).reshape(
        (1, -1)).astype(np.float32)

    # Both calls only verify that inference runs on this sample; their
    # results were never compared, so they are no longer bound to names.
    sess.run([second_output_name], {input_name: predict_array})
    clr.predict_proba(predict_array)

    bst = clr.get_booster()
    # NOTE(review): writes into the current working directory.
    bst.dump_model('dump.raw.txt')

    dump_data_and_model(
        X_test.astype(np.float32) + 1e-5,
        clr,
        onx,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBClassifierIris")
def test_lightgbm_booster_multi_classifier(self):
    """Convert a 3-class LightGBM booster and check the ONNX output names.

    After the usual ``dump_data_and_model`` comparison, the converted model
    is loaded in ONNX Runtime (when available) and its outputs must be
    named ``label`` and ``probabilities``.
    """
    X = numpy.array([[0, 1], [1, 1], [2, 0], [1, 2], [-1, 2], [1, -2]],
                    dtype=numpy.float32)
    y = [0, 1, 0, 1, 2, 2]
    params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'n_estimators': 3,
        'min_child_samples': 1,
        'num_class': 3,
    }
    model = lightgbm.train(params, lightgbm.Dataset(X, label=y))

    model_onnx, prefix = convert_model(
        model, 'tree-based classifier',
        [('input', FloatTensorType([None, 2]))])
    dump_data_and_model(
        X,
        model,
        model_onnx,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename=prefix + "BoosterBin" + model.__class__.__name__)

    try:
        from onnxruntime import InferenceSession
    except ImportError:
        # onnxruntime not installed (python 2.7)
        return

    sess = InferenceSession(model_onnx.SerializeToString())
    names = [out.name for out in sess.get_outputs()]
    assert names == ['label', 'probabilities']
def test_xgb_regressor(self):
    """Convert a default XGBRegressor fitted on the diabetes data set."""
    diabetes = load_diabetes()
    x, y = diabetes.data, diabetes.target
    x_train, x_test, y_train, _ = train_test_split(
        x, y, test_size=0.5, random_state=42)

    xgb = XGBRegressor()
    xgb.fit(x_train, y_train)

    initial_types = [('input', FloatTensorType(shape=[None, None]))]
    conv_model = convert_xgboost(xgb, initial_types=initial_types)
    self.assertTrue(conv_model is not None)

    dump_data_and_model(
        x_test.astype("float32"),
        xgb,
        conv_model,
        basename="SklearnXGBRegressor-Dec3",
        allow_failure="StrictVersion(onnx.__version__)< StrictVersion('1.3.0')",
    )
def test_model_linear_regression_basic(self):
    """Convert a weighted LinearRegression and compare predictions.

    The model is fitted on two rows with the ``normal`` solver and the
    ``weight`` column as sample weights; the ONNX ``prediction`` output is
    compared with Spark's to 5 decimals.
    """
    rows = [
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
    ]
    data = self.spark.createDataFrame(rows,
                                      ["label", "weight", "features"])
    model = LinearRegression(maxIter=5, regParam=0.0, solver="normal",
                             weightCol="weight").fit(data)

    # the name of the input is 'features'
    initial_types = [('features', FloatTensorType([None, model.numFeatures]))]
    model_onnx = convert_sparkml(model, 'sparkml LinearRegressorBasic',
                                 initial_types)
    self.assertTrue(model_onnx is not None)

    def to_series(vector):
        return pandas.Series(vector.toArray())

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(to_series).values.astype(
        numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlLinearRegressor_Basic")
    output, _ = run_onnx_model(['prediction'], data_np, paths[-1])
    compare_results(expected, output, decimal=5)
def _test_scaler_converter(self, model):
    """Fit ``model`` and return predictions from both converted models.

    Args:
        model: an unfitted estimator; it is fitted here on a fixed 4x3
            float32 matrix and converted with ``convert_sklearn``.

    Returns:
        A tuple ``(onnx_ml_pred, onnx_pred)``: the first output of the
        ONNX-ML model run through ONNX Runtime, and the result of
        ``transform`` on the model returned by ``convert``.
    """
    X = np.array([[0.0, 0.0, 3.0], [1.0, -1.0, 0.0], [0.0, 2.0, 1.0],
                  [1.0, 0.0, -2.0]],
                 dtype=np.float32)

    # Keep the blanket "ignore" filter local to this helper so it does not
    # silence warnings for everything that runs afterwards.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        model.fit(X)

        # Create ONNX-ML model
        onnx_ml_model = convert_sklearn(
            model,
            initial_types=[("float_input",
                            FloatTensorType([None, X.shape[1]]))])

        # Create ONNX model by calling converter
        onnx_model = convert(onnx_ml_model, "onnx", X)

        # Get the predictions for the ONNX-ML model
        session = ort.InferenceSession(onnx_ml_model.SerializeToString())
        output_names = [out.name for out in session.get_outputs()]
        inputs = {session.get_inputs()[0].name: X}
        onnx_ml_pred = session.run(output_names, inputs)[0]

        # Get the predictions for the ONNX model
        onnx_pred = onnx_model.transform(X)

    return onnx_ml_pred, onnx_pred
def test_gbt_classifier(self):
    """Convert a GBTClassifier and compare labels and probabilities.

    The label column is first indexed with a ``StringIndexer``; the ONNX
    ``prediction`` and ``probability`` outputs are compared with Spark's
    to 5 decimals.
    """
    raw_data = self.spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(raw_data)
    data = si_model.transform(raw_data)

    gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
    model = gbt.fit(data)

    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model,
        'Sparkml GBT Classifier',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)

    # Collect the Spark result once and read both columns from it.
    predicted_pd = predicted.toPandas()
    expected = [
        predicted_pd.prediction.values.astype(numpy.float32),
        predicted_pd.probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(
                numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTClassifier")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_decision_tree_regressor(self):
    """Convert a DecisionTreeRegressor and compare predictions with Spark.

    The model is fitted on three two-feature samples; the ONNX
    ``prediction`` output is compared with Spark's to 5 decimals.
    """
    features = numpy.array([[0, 1], [1, 1], [2, 0]], dtype=numpy.float32)
    labels = [100, -10, 50]
    # One (label, dense vector) pair per sample.
    pairs = [(label, Vectors.dense(row))
             for label, row in zip(labels, features)]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(pairs),
        schema=["label", "features"])

    model = DecisionTreeRegressor(labelCol="label",
                                  featuresCol="features").fit(data)

    feature_count = data.select('features').first()[0].size
    initial_types = [('features', FloatTensorType([None, feature_count]))]
    model_onnx = convert_sparkml(model, 'Sparkml Decision Tree Regressor',
                                 initial_types, spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    def to_series(vector):
        return pandas.Series(vector.toArray())

    # run the model
    data_np = data.toPandas().features.apply(to_series).values.astype(
        numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeRegressor")
    output, _ = run_onnx_model(['prediction'], data_np, paths[-1])
    compare_results(expected, output, decimal=5)
def test_convert_svmc(self):
    """Convert an RBF-kernel libsvm SVC trained with probabilities.

    Iris class 2 is merged into class 1, so the problem is binary; only
    the first two feature columns are used.
    """
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[y == 2] = 1

    param = svmutil.svm_parameter()
    param.svm_type = SVC
    param.kernel_type = svmutil.RBF
    param.eps = 1
    param.probability = 1
    if noprint:
        param.print_func = noprint

    problem = svmutil.svm_problem(y, X.tolist())
    libsvm_model = svmutil.svm_train(problem, param)

    initial_types = [('input', FloatTensorType())]
    node = convert(libsvm_model, "LibSvmSvmc", initial_types)
    self.assertTrue(node is not None)

    dump_data_and_model(X[:5].astype(numpy.float32),
                        SkAPIClProba2(libsvm_model),
                        node,
                        basename="LibSvmSvmc-Dec2")
def do_training(self, X, Y, create_onnx=True):
    """Fit a logistic regression on the transformed data, optionally exporting it.

    Args:
        X: raw features; passed through ``self.transform(X, do_fit=True)``.
        Y: training labels handed to ``LogisticRegression.fit``.
        create_onnx: when true, every classifier in ``self.clfs`` is
            converted with ``convert_sklearn`` (with the ``zipmap`` option
            disabled) and the list of converted models is passed to
            ``self.create_onnx('categorical-encoding', ...)``.

    Side effects:
        Replaces ``self.clfs`` with a list holding the fitted classifier.
    """
    print('Pre-processing ...')
    X = self.transform(X, do_fit=True)

    print('Training ...')
    C = 0.12
    self.clfs = []
    clf = LogisticRegression(C=C,
                             solver='lbfgs',
                             max_iter=1000,
                             verbose=1,
                             n_jobs=32)
    clf.fit(X, Y)
    self.clfs.append(clf)

    if create_onnx:
        print('Converting models into ONNX ...')
        feature_count = len(self.pipeline.output_columns)
        onnx_ml_models = [
            convert_sklearn(
                trained,
                initial_types=[
                    ('dense_input', FloatTensorType([None, feature_count]))
                ],
                options={type(trained): {'zipmap': False}})
            for trained in self.clfs
        ]
        self.create_onnx('categorical-encoding', onnx_ml_models)
def test_gbt_regressor(self):
    """Convert a GBTRegressor and compare its predictions with Spark's.

    The ONNX ``prediction`` output is compared to 5 decimals.
    """
    data = self.spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
    model = gbt.fit(data)

    feature_count = data.first()[1].size
    # NOTE(review): the batch dimension is declared as 1 although two rows
    # are fed to the model below - confirm the runtime accepts this.
    model_onnx = convert_sparkml(
        model,
        'Sparkml GBTRegressor',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTRegressor")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_generalized_linear_regression(self):
    """Convert a regression model fitted on the sample libsvm data.

    NOTE(review): the estimator fitted here is ``LinearRegression`` (with
    elastic-net regularisation), not a generalized linear regression -
    confirm this is intended.

    The ONNX ``prediction`` output is compared with Spark's to 5 decimals.
    """
    script_path = os.path.abspath(inspect.getfile(inspect.currentframe()))
    input_path = os.path.join(os.path.dirname(script_path), "data",
                              "sample_linear_regression_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)
    model = LinearRegression(maxIter=10, regParam=0.3,
                             elasticNetParam=0.8).fit(data)

    # the name of the input is 'features'
    initial_types = [('features', FloatTensorType([None, model.numFeatures]))]
    model_onnx = convert_sparkml(model,
                                 'sparkml GeneralizedLinearRegression',
                                 initial_types)
    self.assertTrue(model_onnx is not None)

    def to_series(vector):
        return pandas.Series(vector.toArray())

    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(to_series).values.astype(
        numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGeneralizedLinearRegression")
    output, _ = run_onnx_model(['prediction'], data_np, paths[-1])
    compare_results(expected, output, decimal=5)
def test_bucketizer(self):
    """Convert a Bucketizer and compare its bucket indices with Spark's.

    The transformer splits ``features`` at 0.5 and 1.4 and writes
    ``buckets``; results are compared to 5 decimals.
    """
    values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
    data = self.spark.createDataFrame(values, ["features"])
    model = Bucketizer(splits=[-float("inf"), 0.5, 1.4, float("inf")],
                       inputCol="features",
                       outputCol="buckets")

    # Number of fields in the first selected row.
    feature_count = len(data.select('features').first())
    # NOTE(review): the batch dimension is declared as 1 although four rows
    # are fed to the model below - confirm the runtime accepts this.
    model_onnx = convert_sparkml(
        model, 'Sparkml Bucketizer',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.setHandleInvalid("error").transform(data)
    expected = predicted.select("buckets").toPandas().values.astype(
        numpy.float32)
    data_np = [data.toPandas().values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBucketizer")
    # Take the last returned path (as the sibling tests do) instead of
    # relying on the helper returning exactly four entries.
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['buckets'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)