def test_python_transformer_pipeline_persistence(self):
    """
    Pipeline[MockUnaryTransformer, Binarizer]
    """
    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.range(0, 10).toDF("input")
        tf = MockUnaryTransformer(shiftVal=2).setInputCol("input").setOutputCol("shiftedInput")
        tf2 = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
        pl = Pipeline(stages=[tf, tf2])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
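# The test above relies on MockUnaryTransformer, a Python-only Transformer defined in
# the suite's test utilities. A minimal sketch of such a transformer, assuming it only
# adds a constant shift to a numeric input column; the name, param, and behaviour are
# inferred from the test's usage and may differ from the real helper.
from pyspark.ml import UnaryTransformer
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.types import DoubleType, NumericType


class MockUnaryTransformer(UnaryTransformer, DefaultParamsReadable, DefaultParamsWritable):
    shift = Param(
        Params._dummy(),
        "shift",
        "The amount by which to shift data in a DataFrame",
        typeConverter=TypeConverters.toFloat,
    )

    def __init__(self, shiftVal=1):
        super(MockUnaryTransformer, self).__init__()
        self._setDefault(shift=1)
        self._set(shift=shiftVal)

    def getShift(self):
        return self.getOrDefault(self.shift)

    def createTransformFunc(self):
        shift = self.getShift()
        return lambda x: x + shift

    def outputDataType(self):
        return DoubleType()

    def validateInputType(self, inputType):
        # Accept any numeric input column in this sketch.
        if not isinstance(inputType, NumericType):
            raise TypeError("Bad input type: %s. Requires a numeric column." % inputType)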
def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
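# Both persistence tests above call self._compare_pipelines, a helper defined elsewhere
# on the test case. A minimal sketch of such a helper, assuming it only needs to walk
# Pipeline/PipelineModel stages recursively and check that the loaded copy has the same
# structure and stage UIDs; the real helper may compare params more thoroughly.
def _compare_pipelines(self, pl1, pl2):
    self.assertEqual(type(pl1), type(pl2))
    if isinstance(pl1, Pipeline):
        stages1, stages2 = pl1.getStages(), pl2.getStages()
    elif isinstance(pl1, PipelineModel):
        stages1, stages2 = pl1.stages, pl2.stages
    else:
        # Leaf stage (e.g. HashingTF, Binarizer): the UID must survive the round-trip.
        self.assertEqual(pl1.uid, pl2.uid)
        return
    self.assertEqual(len(stages1), len(stages2))
    for s1, s2 in zip(stages1, stages2):
        self._compare_pipelines(s1, s2)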
def test_pipeline(self, bag):
    from pyspark.ml.pipeline import Pipeline

    # create and save and load
    pth = "/tmp/spatial-join"
    new_p = Pipeline().setStages([bag["transformer"]])
    new_p.write().overwrite().save(pth)
    saved_p = Pipeline.load(pth)

    # check transformations
    inp = bag["input"]
    exp = bag["expected"]
    check(new_p.fit(inp), inp, exp)
    check(saved_p.fit(inp), inp, exp)
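# check and the bag fixture used above are defined elsewhere in this suite. A minimal
# sketch of what check is assumed to do: transform the input with the fitted
# PipelineModel and compare the result against the expected DataFrame. The comparison
# below is a simple collected-row equality and assumes deterministic row order; the
# real helper may be more lenient.
def check(model, input_df, expected_df):
    actual = model.transform(input_df)
    assert actual.collect() == expected_df.collect()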