Code example #1
    def test_model_pipeline_2_stage(self):
        import inspect
        import os
        import numpy
        import pandas
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(
            *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(
                StringIndexer(inputCol=col,
                              outputCol=col + '_index',
                              handleInvalid='skip'))
            stages.append(
                OneHotEncoderEstimator(inputCols=[col + '_index'],
                                       outputCols=[col + '_vec']))

        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(
            model, 'Sparkml Pipeline',
            [('workclass', StringTensorType([1, 1])),
             ('education', StringTensorType([1, 1])),
             ('marital_status', StringTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status':
            test_data.select('marital_status').toPandas().values
        }
        predicted_np = [
            predicted.toPandas().workclass_vec.apply(
                lambda x: pandas.Series(x.toArray())).values,
            predicted.toPandas().education_vec.apply(
                lambda x: pandas.Series(x.toArray())).values,
            predicted.toPandas().marital_status_vec.apply(
                lambda x: pandas.Series(x.toArray())).values
        ]
        expected = [
            numpy.asarray([expand_one_hot_vec(x) for x in row])
            for row in predicted_np
        ]
        dump_data_and_sparkml_model(data_np,
                                    expected,
                                    model,
                                    model_onnx,
                                    basename="SparkmlPipeline_2Stage")
Code example #2
    def test_model_pipeline_3_stage(self):
        import inspect
        import os
        import numpy
        import pandas
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(
            *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(
                StringIndexer(inputCol=col,
                              outputCol=col + '_index',
                              handleInvalid='skip'))
            # dropLast must be disabled here; otherwise, once the vectors are
            # assembled together (below), the features cannot be expanded
            # without difficulty
            stages.append(
                OneHotEncoderEstimator(inputCols=[col + '_index'],
                                       outputCols=[col + '_vec'],
                                       dropLast=False))

        stages.append(
            VectorAssembler(inputCols=[c + '_vec' for c in cols],
                            outputCol='features'))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(
            model, 'Sparkml Pipeline',
            [('workclass', StringTensorType([1, 1])),
             ('education', StringTensorType([1, 1])),
             ('marital_status', StringTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status':
            test_data.select('marital_status').toPandas().values
        }
        predicted_np = predicted.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values
        dump_data_and_sparkml_model(data_np,
                                    predicted_np,
                                    model,
                                    model_onnx,
                                    basename="SparkmlPipeline_3Stage")
Code example #3
    def test_combine_inputs_with_string(self):
        from sklearn.preprocessing import LabelEncoder

        model = LabelEncoder()
        model.fit(['a', 'b', 'b', 'a', 'c'])

        model_onnx = convert_sklearn(model, 'pipeline',
                                     [('input1', StringTensorType([1, 1])),
                                      ('input2', StringTensorType([1, 4]))])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(len(model_onnx.graph.node[-1].output) == 1)
Code example #4
    def test_label_encoder_converter(self):
        model = LabelEncoder()
        model.fit(['str3', 'str2', 'str0', 'str1', 'str3'])

        model_onnx = convert_sklearn(model, 'scikit-learn label encoder',
                                     [('input', StringTensorType([1, 1]))])
        self.assertTrue(model_onnx.graph.node is not None)
Code example #5
    def test_stop_words_remover(self):
        data = self.spark.createDataFrame([(["a", "b", "c"], )], ["text"])
        model = StopWordsRemover(inputCol="text",
                                 outputCol="words",
                                 stopWords=["b"])

        feature_count = len(data.columns)
        model_onnx = convert_sparkml(
            model, 'Sparkml StopWordsRemover',
            [('text', StringTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().words.values
        data_np = data.toPandas().text.values
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlStopWordsRemover")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code example #6
 def test_model_string_indexer(self):
     indexer = StringIndexer(inputCol='cat1',
                             outputCol='cat1_index',
                             handleInvalid='skip')
     data = self.spark.createDataFrame([("a", ), ("b", ), ("c", ), ("a", ),
                                        ("a", ), ("c", )], ['cat1'])
     model = indexer.fit(data)
     # the input name should match StringIndexer's inputCol
     model_onnx = convert_sparkml(model, 'Sparkml StringIndexer',
                                  [('cat1', StringTensorType([1, 1]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     predicted = model.transform(data)
     expected = predicted.select("cat1_index").toPandas().values
     data_np = data.select('cat1').toPandas().values
     paths = save_data_models(data_np,
                              expected,
                              model,
                              model_onnx,
                              basename="SparkmlStringIndexer")
     onnx_model_path = paths[3]
     output, output_shapes = run_onnx_model(['cat1_index'], data_np,
                                            onnx_model_path)
     compare_results(expected, output, decimal=5)
Code example #7
    def test_random_forrest_regression(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "sample_libsvm_data.txt")
        original_data = self.spark.read.format("libsvm").load(input_path)
        #
        # truncate the features
        #
        feature_count = 5
        self.spark.udf.register(
            "truncateFeatures",
            lambda x: SparseVector(feature_count, range(0, feature_count),
                                   x.toArray()[125:130]), VectorUDT())
        data = original_data.selectExpr(
            "cast(label as string) as label",
            "truncateFeatures(features) as features")
        label_indexer = StringIndexer(inputCol="label",
                                      outputCol="indexedLabel")
        feature_indexer = VectorIndexer(inputCol="features",
                                        outputCol="indexedFeatures",
                                        maxCategories=10,
                                        handleInvalid='error')

        rf = RandomForestRegressor(labelCol="indexedLabel",
                                   featuresCol="indexedFeatures",
                                   numTrees=10)
        pipeline = Pipeline(stages=[label_indexer, feature_indexer, rf])
        model = pipeline.fit(data)
        model_onnx = convert_sparkml(
            model,
            'Sparkml RandomForest Regressor',
            [('label', StringTensorType([1, 1])),
             ('features', FloatTensorType([1, feature_count]))],
            spark_session=self.spark)
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        data_np = {
            'label':
            data.limit(1).toPandas().label.values,
            'features':
            data.limit(1).toPandas().features.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        }
        expected = [
            predicted.toPandas().indexedLabel.values.astype(numpy.int64),
            predicted.toPandas().prediction.values.astype(numpy.float32)
        ]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlRandomForestRegressor")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                               data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
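
The truncateFeatures UDF above is easy to misread: it keeps only features 125 through 129 of each row as a new 5-dimensional vector. A standalone sketch of the same slice, assuming the stock sample_libsvm_data.txt layout with 692 features per row:

from pyspark.ml.linalg import SparseVector, Vectors

full = Vectors.dense(range(692))  # stand-in for one row's 692 features
truncated = SparseVector(5, range(0, 5), full.toArray()[125:130])
print(truncated)  # (5,[0,1,2,3,4],[125.0,126.0,127.0,128.0,129.0])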
Code example #8
    def test_model_pipeline_4_stage(self):
        this_script_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data",
                                  "AdultCensusIncomeOriginal.csv")
        full_data = self.spark.read.format('csv')\
            .options(header='true', inferschema='true').load(input_path)
        cols = ['workclass', 'education', 'marital_status']
        training_data, test_data = full_data.select(
            'income', *cols).limit(1000).randomSplit([0.9, 0.1], seed=1)

        stages = []
        for col in cols:
            stages.append(
                StringIndexer(inputCol=col,
                              outputCol=col + '_index',
                              handleInvalid='skip'))
            stages.append(
                OneHotEncoderEstimator(inputCols=[col + '_index'],
                                       outputCols=[col + '_vec'],
                                       dropLast=False))

        stages.append(
            VectorAssembler(inputCols=[c + '_vec' for c in cols],
                            outputCol='features'))
        stages.append(
            StringIndexer(inputCol='income',
                          outputCol='label',
                          handleInvalid='skip'))
        stages.append(LogisticRegression(maxIter=100, tol=0.0001))
        pipeline = Pipeline(stages=stages)

        model = pipeline.fit(training_data)
        model_onnx = convert_sparkml(model, 'Sparkml Pipeline', [
            ('income', StringTensorType([1, 1])),
            ('workclass', StringTensorType([1, 1])),
            ('education', StringTensorType([1, 1])),
            ('marital_status', StringTensorType([1, 1]))
        ])
        self.assertTrue(model_onnx is not None)
        self.assertTrue(model_onnx.graph.node is not None)
        # run the model
        predicted = model.transform(test_data)
        data_np = {
            'income': test_data.select('income').toPandas().values,
            'workclass': test_data.select('workclass').toPandas().values,
            'education': test_data.select('education').toPandas().values,
            'marital_status':
            test_data.select('marital_status').toPandas().values
        }
        expected = [
            predicted.toPandas().label.values.astype(numpy.float32),
            predicted.toPandas().prediction.values.astype(numpy.float32),
            predicted.toPandas().probability.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        ]
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlPipeline_4Stage")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(
            ['label', 'prediction', 'probability'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
Code example #9
 def test_one_hot_encoder_mixed_string_int(self):
     # scikit-learn's categorical_features keyword will be removed in 0.22
     # (this test will start failing then).
     data = [["0.4", "0.2", 3], ["1.4", "1.2", 0], ["0.2", "2.2", 1]]
     model = OneHotEncoder(categories='auto')
     model.fit(data)
     inputs = [('input1', StringTensorType([1, 2])),
               ('input2', Int64TensorType([1, 1]))]
     model_onnx = convert_sklearn(model,
                                  'one-hot encoder mixed-type inputs',
                                  inputs)
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(data, model, model_onnx,
                         basename="SklearnOneHotEncoderStringInt64",
                         allow_failure=True)
Code example #10
 def test_model_label_encoder(self):
     model = LabelEncoder()
     data = ['str3', 'str2', 'str0', 'str1', 'str3']
     model.fit(data)
     model_onnx = convert_sklearn(model, 'scikit-learn label encoder',
                                  [('input', StringTensorType([1, 1]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     dump_data_and_model(numpy.array(data),
                         model,
                         model_onnx,
                         basename="SklearnLabelEncoder")
Code example #11
 def test_model_tfidf_vectorizer13(self):
     corpus = [
         'This is the first document.',
         'This document is the second document.',
         'And this is the third one.',
         'Is this the first document?',
     ]
     vect = TfidfVectorizer(ngram_range=(1, 3))
     vect.fit(corpus)
     pred = vect.transform(corpus)
     model_onnx = convert_sklearn(vect, 'scikit-learn tfidf vectorizer',
                                  [('input', StringTensorType([1, 1]))])
     self.assertTrue(model_onnx is not None)
Code example #12
 def test_model_string_indexer(self):
     indexer = StringIndexer(inputCol='cat1',
                             outputCol='cat1_index',
                             handleInvalid='skip')
     data = self.spark.createDataFrame([("a",), ("b",), ("c",), ("a",),
                                        ("a",), ("c",)], ['cat1'])
     model = indexer.fit(data)
     # the input name should match StringIndexer's inputCol
     model_onnx = convert_sparkml(model, 'Sparkml StringIndexer',
                                  [('cat1', StringTensorType([1, 1]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     predicted = model.transform(data)
     predicted_np = predicted.select("cat1_index").toPandas().values
     data_np = data.select('cat1').toPandas().values
     dump_data_and_sparkml_model(data_np, predicted_np, model, model_onnx,
                                 basename="SparkmlStringIndexer")
Code example #13
def getTensorTypeFromSpark(sparktype):
    if sparktype == 'StringType':
        return StringTensorType([1, 1])
    elif sparktype in ('DecimalType', 'DoubleType', 'FloatType', 'LongType',
                       'IntegerType', 'ShortType', 'ByteType', 'BooleanType'):
        return FloatTensorType([1, 1])
    else:
        raise TypeError("Cannot map this type to ONNX types: " + sparktype)
Code example #14
    def test_tokenizer(self):
        data = self.spark.createDataFrame([("a b c",)], ["text"])
        model = Tokenizer(inputCol='text', outputCol='words')
        predicted = model.transform(data)

        model_onnx = convert_sparkml(model, 'Sparkml Tokenizer', [
            ('text', StringTensorType([None]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        expected = predicted.toPandas().words.apply(pandas.Series).values
        data_np = data.toPandas().text.values.reshape([-1])
        paths = save_data_models(data_np, expected, model, model_onnx,
                                 basename="SparkmlTokenizer")
        onnx_model_path = paths[-1]
        output, output_shapes = run_onnx_model(['words'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Code example #15
 def test_model_dict_vectorizer(self):
     model = DictVectorizer()
     data = [{'amy': 1., 'chin': 200.}, {'nice': 3., 'amy': 1.}]
     model.fit_transform(data)
     model_onnx = convert_sklearn(
         model, 'dictionary vectorizer',
         [('input',
           DictionaryType(StringTensorType([1]), FloatTensorType([1])))])
     self.assertTrue(model_onnx is not None)
     dump_data_and_model(
         data,
         model,
         model_onnx,
         basename="SklearnDictVectorizer-OneOff-SkipDim1",
         allow_failure=
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.3') or StrictVersion(onnx.__version__) < StrictVersion('1.3.0')"
     )
Code example #16
    def test_word2vec(self):
        data = self.spark.createDataFrame(
            [("Hi I heard about Spark".split(" "), ),
             ("I wish Java could use case classes".split(" "), ),
             ("Logistic regression models are neat".split(" "), )], ["text"])
        word2Vec = Word2Vec(vectorSize=3,
                            minCount=0,
                            inputCol="text",
                            outputCol="result")
        model = word2Vec.fit(data)
        vectors = model.getVectors()
        vectors.show(100, False)

        result = model.transform(data)
        result.show(100, False)

        # the input name should match Word2Vec's inputCol
        feature_count = len(data.first()[0])
        model_onnx = convert_sparkml(
            model, 'Sparkml Word2Vec',
            [('text', StringTensorType([None, feature_count]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.transform(data.limit(1))
        expected = predicted.toPandas().result.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.limit(1).toPandas().text.values
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlWord2Vec")
        onnx_model_path = paths[-1]
        data_np = numpy.array(data_np[0]).reshape((1, -1))
        output, output_shapes = run_onnx_model(['result'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
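
Word2VecModel.transform averages the vectors of the words in each row, which is why the declared input shape is [None, feature_count] and the output has vectorSize columns. A hedged sketch of the same computation done by hand, reusing vectors and data from the test above:

import numpy

# Look up each word's embedding, then average them for the first sentence;
# this should match the first row of the 'result' column.
lookup = {row['word']: row['vector'].toArray() for row in vectors.collect()}
sentence = data.first()[0]
manual_avg = numpy.mean([lookup[w] for w in sentence], axis=0)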
Code example #17
def calculate_sparkml_stop_words_remover_output_shapes(operator):
    check_input_and_output_numbers(operator, output_count_range=1)
    check_input_and_output_types(operator, good_input_types=[StringTensorType])
    input_shape = copy.deepcopy(operator.inputs[0].type.shape)
    operator.outputs[0].type = StringTensorType(input_shape)
Code example #18
def calculate_sparkml_tokenizer_output_shapes(operator):
    check_input_and_output_numbers(operator, output_count_range=1)
    check_input_and_output_types(operator,
                                 good_input_types=[StringTensorType])
    operator.outputs[0].type = StringTensorType()
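
A small sketch contrasting the two calculators above, type construction only (assuming onnxmltools' data_types module, where an omitted shape is stored as None): the stop-words remover carries the input shape through unchanged, while the tokenizer declares no shape at all, because the number of emitted tokens depends on the input text.

from onnxmltools.convert.common.data_types import StringTensorType

remover_output = StringTensorType([1, 3])  # mirrors a [1, 3] string input
tokenizer_output = StringTensorType()      # token count is data-dependent
print(remover_output.shape, tokenizer_output.shape)  # [1, 3] None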