def test_element_wise_product(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([2.0, 1.0, 3.0]), )], ["features"])
    model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                               inputCol="features", outputCol="eprod")
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ElementwiseProduct',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().eprod.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlElementwiseProduct")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['eprod'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_xgboost_booster_classifier_reg(self):
    x, y = make_classification(n_classes=2, n_features=5, n_samples=100,
                               random_state=42, n_informative=3)
    # shift the labels off the integers so the booster behaves as a regressor
    y = y.astype(np.float32) + 0.567
    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.5,
                                                   random_state=42)
    data = DMatrix(x_train, label=y_train)
    model = train({'objective': 'reg:squarederror',
                   'n_estimators': 3,
                   'min_child_samples': 1}, data)
    model_onnx = convert_xgboost(
        model, 'tree-based regressor',
        [('input', FloatTensorType([None, x.shape[1]]))])
    dump_data_and_model(
        x_test.astype(np.float32), model, model_onnx,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBBoosterReg")
def test_gbt_classifier(self):
    raw_data = self.spark.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], []))],
        ["label", "features"])
    string_indexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = string_indexer.fit(raw_data)
    data = si_model.transform(raw_data)
    gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
    model = gbt.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml GBT Classifier',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlGBTClassifier")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_xgboost_example_mnist(self):
    """
    Train a simple xgboost model on the scikit-learn digits dataset
    and store the associated artefacts.
    """
    X, y = load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    X_train = X_train.reshape((X_train.shape[0], -1))
    X_test = X_test.reshape((X_test.shape[0], -1))
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    clf = XGBClassifier(objective="multi:softprob", n_jobs=-1)
    clf.fit(X_train, y_train)
    sh = [None, X_train.shape[1]]
    onnx_model = convert_xgboost(
        clf, initial_types=[('input', FloatTensorType(sh))])
    dump_data_and_model(
        X_test.astype(np.float32), clf, onnx_model,
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
        basename="XGBoostExample")
def _test_one_class_classification_core(self, model):
    X = [[0., 1.], [1., 1.], [2., 0.]]
    y = [1, 1, 1]
    model.fit(X, y)
    model_onnx = convert_sklearn(model, 'tree-based classifier',
                                 [('input', FloatTensorType([1, 2]))])
    self.assertTrue(model_onnx is not None)
def test_xgb_regressor(self):
    diabetes = load_diabetes()
    x = diabetes.data
    y = diabetes.target
    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.5,
                                                   random_state=42)
    xgb = XGBRegressor()
    xgb.fit(x_train, y_train)
    conv_model = convert_xgboost(
        xgb,
        initial_types=[('input', FloatTensorType(shape=[None, None]))])
    self.assertTrue(conv_model is not None)
    dump_data_and_model(
        x_test.astype("float32"), xgb, conv_model,
        basename="SklearnXGBRegressor-Dec3",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.3.0')",
    )
def test_aft_regression_survival(self):
    data = self.spark.createDataFrame(
        [(1.0, Vectors.dense(1.0), 1.0),
         (1e-40, Vectors.sparse(1, [], []), 0.0)],
        ["label", "features", "censor"])
    aft = AFTSurvivalRegression()
    model = aft.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml AFTSurvivalRegression',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlAFTSurvivalRegression")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_maxabs_scaler(self):
    data = self.spark.createDataFrame(
        [(0, Vectors.dense([1.0, 0.1, -1.0])),
         (1, Vectors.dense([2.0, 1.1, 1.0])),
         (2, Vectors.dense([3.0, 10.1, 3.0]))],
        ["id", "features"])
    scaler = MaxAbsScaler(inputCol='features', outputCol='scaled_features')
    model = scaler.fit(data)
    # the input names must match the inputCol(s) above
    model_onnx = convert_sparkml(
        model, 'Sparkml MaxAbsScaler',
        [('features', FloatTensorType([None, 3]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().scaled_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlMaxAbsScaler")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['scaled_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_convert_nusvmc(self):
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[y == 2] = 1
    prob = svmutil.svm_problem(y, X.tolist())
    param = svmutil.svm_parameter()
    param.svm_type = NuSVC
    param.kernel_type = svmutil.RBF
    param.eps = 1
    param.probability = 1
    if noprint:
        param.print_func = noprint
    libsvm_model = svmutil.svm_train(prob, param)
    node = convert(libsvm_model, "LibSvmNuSvmc",
                   [('input', FloatTensorType(shape=['None', 'None']))])
    self.assertTrue(node is not None)
    dump_data_and_model(
        X[:5].astype(numpy.float32), SkAPIClProba2(libsvm_model), node,
        basename="LibSvmNuSvmc-Dec2",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.1.3')")
def test_convert_svmc_linear_raw_multi(self):
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[-5:] = 3
    prob = svmutil.svm_problem(y, X.tolist())
    param = svmutil.svm_parameter()
    param.svm_type = SVC
    param.kernel_type = svmutil.LINEAR
    param.eps = 1
    param.probability = 0
    if noprint:
        param.print_func = noprint
    libsvm_model = svmutil.svm_train(prob, param)
    node = convert(libsvm_model, "LibSvmNuSvmcMultiRaw",
                   [('input', FloatTensorType(shape=['None', 2]))])
    self.assertTrue(node is not None)
    # two samples from each of the four classes
    X2 = numpy.vstack([X[:2], X[60:62], X[110:112], X[147:149]])
    dump_data_and_model(
        X2.astype(numpy.float32), SkAPICl(libsvm_model), node,
        basename="LibSvmSvmcRaw-Dec3", verbose=False,
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.1.3')")
def test_chi_sq_selector(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
         (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
         (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
        ["features", "label"])
    selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
    model = selector.fit(data)
    # the input name must match the features column above
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ChiSqSelector',
        [('features', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().selectedFeatures.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlChiSqSelector")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_one_vs_rest(self):
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_multiclass_classification_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)
    lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(data)
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml OneVsRest',
        [('features', FloatTensorType([None, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneVsRest")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def convert(self, model, data, args, model_name):
    from skl2onnx import convert_sklearn
    from onnxmltools.convert.common.data_types import FloatTensorType
    self.configure(data, model, args)
    with Timer() as t:
        batch = min(len(data.X_test), self.params["batch_size"])
        remainder = len(data.X_test) % batch
        initial_type = [("input",
                         FloatTensorType([batch, self.params["input_size"]]))]
        self.model = convert_sklearn(model, initial_types=initial_type)
        if remainder > 0:
            # a second model with a smaller batch dimension handles the
            # rows left over after the full batches
            initial_type = [("input",
                             FloatTensorType([remainder,
                                              self.params["input_size"]]))]
            self.remainder_model = convert_sklearn(
                model, initial_types=initial_type, target_opset=11)
    return t.interval
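# A minimal sketch (not part of the class above) of how the two fixed-shape
# models produced by convert() could be used at scoring time: full batches go
# through the batch-sized model and the leftover rows through the remainder
# model. The helper name `score_in_batches` is hypothetical; only the
# onnxruntime calls are standard API.
def score_in_batches(model_onnx, remainder_onnx, X, batch):
    import numpy as np
    import onnxruntime as rt
    sess = rt.InferenceSession(model_onnx.SerializeToString())
    rem_sess = (rt.InferenceSession(remainder_onnx.SerializeToString())
                if remainder_onnx is not None else None)
    outputs = []
    n_full = (len(X) // batch) * batch
    for start in range(0, n_full, batch):
        # each chunk matches the fixed batch dimension of the first model
        chunk = X[start:start + batch].astype(np.float32)
        outputs.append(sess.run(None, {"input": chunk})[0])
    if n_full < len(X) and rem_sess is not None:
        # the tail matches the remainder model's batch dimension
        tail = X[n_full:].astype(np.float32)
        outputs.append(rem_sess.run(None, {"input": tail})[0])
    return np.concatenate(outputs, axis=0)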
def test_model_pca(self):
    data = self.spark.createDataFrame(
        [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
         (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
         (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )], ["features"])
    pca = PCA(k=2, inputCol="features", outputCol="pca_features")
    model = pca.fit(data)
    # the input name must match the inputCol above
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml PCA',
        [('features', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().pca_features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPCA")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['pca_features'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_tree_one_class_classification(self):
    features = [[0., 1.], [1., 1.], [2., 0.]]
    features = numpy.array(features, dtype=numpy.float32)
    labels = [1, 1, 1]
    dd = [(labels[i], Vectors.dense(features[i]))
          for i in range(len(labels))]
    data = self.spark.createDataFrame(
        self.spark.sparkContext.parallelize(dd),
        schema=["label", "features"])
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    model = dt.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml Decision Tree One Class',
        [('features', FloatTensorType([None, 2]))],
        spark_session=self.spark)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().prediction.values.astype(numpy.float32),
        predicted.toPandas().probability.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlDecisionTreeBinaryClass")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['prediction', 'probability'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_onehot_encoder(self):
    encoder = OneHotEncoderEstimator(inputCols=['index'],
                                     outputCols=['indexVec'])
    data = self.spark.createDataFrame(
        [(0.0, ), (1.0, ), (2.0, ), (2.0, ), (0.0, ), (2.0, )], ['index'])
    model = encoder.fit(data)
    model_onnx = convert_sparkml(model, 'Sparkml OneHotEncoder',
                                 [('index', FloatTensorType([1, 1]))])
    self.assertTrue(model_onnx is not None)
    self.assertTrue(model_onnx.graph.node is not None)
    # run the model
    predicted = model.transform(data)
    data_np = data.select("index").toPandas().values.astype(numpy.float32)
    predicted_np = predicted.select("indexVec").toPandas().indexVec.apply(
        lambda x: x.toArray().tolist()).values
    # the estimator drops the last category; re-append it so the expected
    # vectors match the full one-hot output of the ONNX model
    expected = numpy.asarray(
        [x + [0] if numpy.amax(x) == 1 else x + [1] for x in predicted_np])
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneHotEncoder")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexVec'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def do_training(self, X, Y, create_onnx=True):
    print('Pre-processing ...')
    X = self.transform(X, do_fit=True)
    # rename columns to positional indices
    new_columns = {}
    for i, c in enumerate(X.columns):
        new_columns[c] = i
    X = X.rename(columns=new_columns)
    print('Training with ' + str(len(X.columns)) + ' columns ...')
    self.clfs = []
    clf = xgb.XGBClassifier(n_estimators=1700, nthread=32, max_depth=6,
                            learning_rate=0.024, subsample=0.8,
                            colsample_bytree=0.65)
    this_model = clf.fit(X, Y)
    self.clfs.append(clf)
    if create_onnx:
        print('Converting models into ONNX ...')
        onnx_ml_models = []
        for i, clf in enumerate(self.clfs):
            initial_type = [
                ('dense_input',
                 FloatTensorType([None, len(self.pipeline.output_columns)]))
            ]
            onnx_ml_models.append(
                convert_xgboost(clf, initial_types=initial_type))
        self.create_onnx('mental_health', onnx_ml_models)
def test_vector_slicer(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
         (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
         (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
    model = VectorSlicer(inputCol="features", outputCol="sliced",
                         indices=[1, 4])
    feature_count = data.first()[0].array.size
    model_onnx = convert_sparkml(
        model, 'Sparkml VectorSlicer',
        [('features', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().sliced.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorSlicer")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['sliced'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_model_vector_indexer_single(self):
    vi = VectorIndexer(maxCategories=3, inputCol="a", outputCol="indexed")
    data = self.spark.createDataFrame([(Vectors.dense([-1.0]), ),
                                       (Vectors.dense([0.0]), ),
                                       (Vectors.dense([0.0]), )], ["a"])
    model = vi.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml VectorIndexer Single',
        [('a', FloatTensorType([None, model.numFeatures]))],
        target_opset=9)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().indexed.apply(
        lambda x: pandas.Series(x.toArray())).values
    data_np = data.toPandas().a.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorIndexerSingle")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['indexed'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_convert_svmc(self):
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[y == 2] = 1
    prob = svmutil.svm_problem(y, X.tolist())
    param = svmutil.svm_parameter()
    param.svm_type = SVC
    param.kernel_type = svmutil.RBF
    param.eps = 1
    param.probability = 1
    if noprint:
        param.print_func = noprint
    libsvm_model = svmutil.svm_train(prob, param)
    node = convert(libsvm_model, "LibSvmSvmc",
                   [('input', FloatTensorType())])
    self.assertTrue(node is not None)
    dump_data_and_model(X[:5].astype(numpy.float32),
                        SkAPIClProba2(libsvm_model), node,
                        basename="LibSvmSvmc-Dec2")
def test_model_polynomial_expansion(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([1.2, 3.2, 1.3, -5.6]), ),
         (Vectors.dense([4.3, -3.2, 5.7, 1.0]), ),
         (Vectors.dense([0, 3.2, 4.7, -8.9]), )], ["dense"])
    model = PolynomialExpansion(degree=2, inputCol="dense",
                                outputCol="expanded")
    # the input name must match the inputCol above
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml PolynomialExpansion',
        [('dense', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().expanded.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().dense.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlPolynomialExpansion")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['expanded'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_convert_svmc_raw(self):
    iris = load_iris()
    X = iris.data[:, :2]
    y = iris.target
    y[y == 2] = 1
    prob = svmutil.svm_problem(y, X.tolist())
    param = svmutil.svm_parameter()
    param.svm_type = SVC
    param.kernel_type = svmutil.RBF
    param.eps = 1
    param.probability = 0
    if noprint:
        param.print_func = noprint
    libsvm_model = svmutil.svm_train(prob, param)
    # known svm runtime dimension error in ONNX Runtime
    node = convert(libsvm_model, "LibSvmSvmcRaw",
                   [('input', FloatTensorType(shape=['None', 'None']))])
    self.assertTrue(node is not None)
    dump_data_and_model(
        X[:5].astype(numpy.float32), SkAPICl(libsvm_model), node,
        basename="LibSvmSvmcRaw",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      " < StrictVersion('0.5.0')")
def test_xgb_classifier_multi_discrete_int_labels(self):
    iris = load_iris()
    x = iris.data[:, :2]
    y = iris.target
    y[y == 0] = 10
    y[y == 1] = 20
    y[y == 2] = -30
    x_train, x_test, y_train, _ = train_test_split(x, y, test_size=0.5,
                                                   random_state=42)
    xgb = XGBClassifier(n_estimators=3)
    xgb.fit(x_train, y_train)
    conv_model = convert_xgboost(
        xgb,
        initial_types=[('input', FloatTensorType(shape=[None, None]))])
    self.assertTrue(conv_model is not None)
    dump_data_and_model(
        x_test.astype("float32"), xgb, conv_model,
        basename="SklearnXGBClassifierMultiDiscreteIntLabels",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.3.0')",
    )
def test_model_binarizer(self):
    data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                      ["id", "feature"])
    model = Binarizer(inputCol='feature', outputCol='binarized')
    # the input name must match the inputCol above
    model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                 [('feature', FloatTensorType([None, 1]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = predicted.select("binarized").toPandas().values.astype(
        numpy.float32)
    data_np = data.select('feature').toPandas().values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlBinarizer")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['binarized'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def calculate_sparkml_scaler_output_shapes(operator):
    check_input_and_output_numbers(operator, output_count_range=1)
    check_input_and_output_types(
        operator, good_input_types=[FloatTensorType, Int64TensorType])
    # a scaler preserves the input shape; the output is always a float tensor
    input_shape = copy.deepcopy(operator.inputs[0].type.shape)
    operator.outputs[0].type = FloatTensorType(input_shape)
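# A minimal sketch of how a shape calculator like the one above pairs with a
# converter at registration time, following the update_registered_converter
# pattern used later in this file. `MyScaler` and `convert_my_scaler` are
# hypothetical placeholders, not a real library API.
from skl2onnx import update_registered_converter

class MyScaler:
    """Hypothetical stand-in for a custom scaler model class."""

def convert_my_scaler(scope, operator, container):
    # A real converter would emit the scaler's arithmetic; Identity is the
    # simplest shape-preserving graph and keeps this sketch self-contained.
    container.add_node('Identity', operator.inputs[0].full_name,
                       operator.outputs[0].full_name,
                       name=scope.get_unique_operator_name('Identity'))

update_registered_converter(MyScaler, 'MyScaler',
                            calculate_sparkml_scaler_output_shapes,
                            convert_my_scaler)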
def test_xgboost_classifier_i5450(self):
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=10)
    clr = XGBClassifier(objective="multi:softmax", max_depth=1,
                        n_estimators=2)
    clr.fit(X_train, y_train, eval_set=[(X_test, y_test)],
            early_stopping_rounds=40)
    initial_type = [('float_input', FloatTensorType([None, 4]))]
    onx = convert_xgboost(clr, initial_types=initial_type)
    sess = InferenceSession(onx.SerializeToString())
    input_name = sess.get_inputs()[0].name
    label_name = sess.get_outputs()[1].name
    predict_list = [1., 20., 466., 0.]
    predict_array = np.array(predict_list).reshape((1, -1)).astype(
        np.float32)
    pred_onx = sess.run([label_name], {input_name: predict_array})[0]
    pred_xgboost = clr.predict_proba(predict_array)
    bst = clr.get_booster()
    bst.dump_model('dump.raw.txt')
    dump_data_and_model(
        X_test.astype(np.float32) + 1e-5, clr, onx,
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.3.0')",
        basename="XGBClassifierIris")
def test_random_forest_regression(self):
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)
    # truncate the features to five columns
    feature_count = 5
    self.spark.udf.register(
        "truncateFeatures",
        lambda x: SparseVector(feature_count, range(0, feature_count),
                               x.toArray()[125:130]),
        VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")
    label_indexer = StringIndexer(inputCol="label",
                                  outputCol="indexedLabel")
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures",
                                    maxCategories=10,
                                    handleInvalid='error')
    rf = RandomForestRegressor(labelCol="indexedLabel",
                               featuresCol="indexedFeatures", numTrees=10)
    pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
    model = pipeline.fit(data)
    model_onnx = convert_sparkml(
        model, 'Sparkml RandomForest Regressor',
        [('label', StringTensorType([1, 1])),
         ('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data.limit(1))
    data_np = {
        'label': data.limit(1).toPandas().label.values,
        'features': data.limit(1).toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(
                numpy.float32)
    }
    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlRandomForestRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def common_test_xgboost_10_skl(self, missing, replace=False):
    this = os.path.abspath(os.path.dirname(__file__))
    data = os.path.join(this, "data_fail.csv")
    data = pandas.read_csv(data)
    for col in data:
        # fill missing values with a dtype-appropriate default
        dtype = data[col].dtype
        if dtype in ['float64', 'float32']:
            data[col].fillna(0., inplace=True)
        elif dtype in ['int64']:
            data[col].fillna(0, inplace=True)
        elif dtype in ['O']:
            data[col].fillna('N/A', inplace=True)
    data['pclass'] = data['pclass'] * float(1)
    full_df = data.drop('survived', axis=1)
    full_labels = data['survived']
    train_df, test_df, train_labels, test_labels = train_test_split(
        full_df, full_labels, test_size=.2, random_state=11)
    col_transformer = self._column_tranformer_fitted_from_df(full_df)
    param_distributions = {
        "colsample_bytree": 0.5,
        "gamma": 0.2,
        'learning_rate': 0.3,
        'max_depth': 2,
        'min_child_weight': 1.,
        'n_estimators': 1,
        'missing': missing,
    }
    regressor = XGBRegressor(verbose=0, objective='reg:squarederror',
                             **param_distributions)
    regressor.fit(col_transformer.transform(train_df), train_labels)
    model = Pipeline(steps=[('preprocessor', col_transformer),
                            ('regressor', regressor)])
    update_registered_converter(XGBRegressor, 'XGBRegressor',
                                calculate_linear_regressor_output_shapes,
                                convert_xgb)
    # convert and score only the last pipeline step
    input_xgb = model.steps[0][-1].transform(test_df[:5]).astype(np.float32)
    if replace:
        input_xgb[input_xgb[:, :] == missing] = np.nan
    onnx_last = convert_sklearn(
        model.steps[1][-1],
        initial_types=[
            ('X', FloatTensorType(shape=[None, input_xgb.shape[1]]))
        ],
        target_opset=get_opset_number_from_onnx())
    session = rt.InferenceSession(onnx_last.SerializeToString())
    pred_skl = model.steps[1][-1].predict(input_xgb).ravel()
    pred_onx = session.run(None, {'X': input_xgb})[0].ravel()
    assert_almost_equal(pred_skl, pred_onx)
def test_max_abs_scaler(self):
    model = MaxAbsScaler()
    data = [[0., 0., 3.], [1., 1., 0.], [0., 2., 1.], [1., 0., 2.]]
    model.fit(data)
    model_onnx = convert_sklearn(model, 'scaler',
                                 [('input', FloatTensorType([1, 3]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(numpy.array(data, dtype=numpy.float32), model,
                        model_onnx, basename="SklearnMaxAbsScaler")
def test_xgboost_unpickle_06(self):
    # Unpickle a model trained with an old version of xgboost.
    this = os.path.dirname(__file__)
    with open(os.path.join(this, "xgboost10day.pickle.dat"), "rb") as f:
        xgb = pickle.load(f)
    conv_model = convert_xgboost(
        xgb,
        initial_types=[('features', FloatTensorType([1, 10000]))])
    assert conv_model is not None
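# For reference, the scoring step that helpers such as run_onnx_model and
# dump_data_and_model wrap reduces to the following minimal sketch. The
# "model.onnx" path and the random input are assumptions for illustration;
# the (1, 10000) shape mirrors the fixed input declared just above.
def _score_saved_model_sketch():
    import numpy as np
    import onnxruntime as rt
    sess = rt.InferenceSession("model.onnx")  # hypothetical saved model path
    input_name = sess.get_inputs()[0].name
    x = np.random.rand(1, 10000).astype(np.float32)
    # run(None, ...) returns a list with one array per model output
    return sess.run(None, {input_name: x})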