def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    temp_path = tempfile.mkdtemp()

    try:
        df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
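# The test above relies on a _compare_pipelines helper defined elsewhere in the
# test class. A minimal, hypothetical sketch of such a helper (not the actual
# PySpark implementation): recurse through nested stages and check that uids and
# explicitly set Param values survive the save/load round trip.
def _compare_pipelines(self, m1, m2):
    self.assertEqual(type(m1), type(m2))
    if isinstance(m1, Pipeline):
        stages1, stages2 = m1.getStages(), m2.getStages()
    elif isinstance(m1, PipelineModel):
        stages1, stages2 = m1.stages, m2.stages
    else:
        # Leaf stage: uid and defined Param values should be preserved.
        self.assertEqual(m1.uid, m2.uid)
        for p in m1.params:
            if m1.isDefined(p):
                self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p))
        return
    self.assertEqual(len(stages1), len(stages2))
    for s1, s2 in zip(stages1, stages2):
        self._compare_pipelines(s1, s2)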
def test_python_transformer_pipeline_persistence(self):
    """
    Pipeline[MockUnaryTransformer, Binarizer]
    """
    temp_path = tempfile.mkdtemp()

    try:
        df = self.spark.range(0, 10).toDF("input")
        tf = MockUnaryTransformer(shiftVal=2).setInputCol("input").setOutputCol("shiftedInput")
        tf2 = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
        pl = Pipeline(stages=[tf, tf2])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
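# MockUnaryTransformer is a test-only stage defined elsewhere in the suite; a
# minimal, hypothetical sketch of what it could look like: a UnaryTransformer
# that adds a constant shift to a numeric column and is persistable through the
# default Params reader/writer.
from pyspark.ml import UnaryTransformer
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.types import DoubleType, NumericType


class MockUnaryTransformer(UnaryTransformer, DefaultParamsReadable, DefaultParamsWritable):
    shiftVal = Param(Params._dummy(), "shiftVal", "amount added to the input column",
                     typeConverter=TypeConverters.toFloat)

    def __init__(self, shiftVal=1.0):
        super(MockUnaryTransformer, self).__init__()
        self._set(shiftVal=shiftVal)

    def createTransformFunc(self):
        shift = self.getOrDefault(self.shiftVal)
        return lambda value: value + shift

    def outputDataType(self):
        return DoubleType()

    def validateInputType(self, inputType):
        # Permissive for this sketch: any numeric input column is accepted.
        if not isinstance(inputType, NumericType):
            raise TypeError("MockUnaryTransformer requires a numeric input column")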
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
from sparkflow.graph_utils import build_graph
from sparkflow.tensorflow_async import SparkAsyncDL

if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[8]') \
        .config('spark.driver.memory', '2g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('examples/mnist_train.csv').orderBy(rand())

    # cnn_model is the graph-construction function handed to build_graph (see the sketch below)
    mg = build_graph(cnn_model)
    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')
    encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)

    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOptimizer='adam',
        miniBatchSize=300,
        miniStochasticIters=-1,
        shufflePerIter=True,
        iters=20,
        tfLearningRate=.0001,
        predictionCol='predicted',
        labelCol='labels',
        verbose=1)

    p = Pipeline(stages=[va, encoded, spark_model]).fit(df)
    p.save("cnn")
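# cnn_model is not shown in the snippet above; in the actual script it would be
# defined before the __main__ block. A minimal, hypothetical sketch for 784-pixel
# MNIST rows (TensorFlow 1.x style, as SparkFlow expects): the placeholder names
# 'x' and 'y' match tfInput='x:0' and tfLabel='y:0', and the function returns the loss.
import tensorflow as tf


def cnn_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    images = tf.reshape(x, [-1, 28, 28, 1])
    conv1 = tf.layers.conv2d(images, filters=32, kernel_size=3, activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=2)
    flat = tf.layers.flatten(pool1)
    dense = tf.layers.dense(flat, 128, activation=tf.nn.relu)
    logits = tf.layers.dense(dense, 10)
    out = tf.argmax(logits, 1, name='out')  # prediction tensor, i.e. 'out:0'
    loss = tf.losses.softmax_cross_entropy(y, logits)
    return loss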
import torch
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline
from sparktorch import SparkTorch, serialize_torch_obj

# UNet and soft_dice_loss are defined elsewhere in this example; `spark` and
# data_train_path are assumed to be an active SparkSession and the CSV path.
network = UNet(1)

torch_obj = serialize_torch_obj(
    model=network,
    criterion=soft_dice_loss,
    optimizer=torch.optim.Adam,
    lr=0.0001
)

spark_model = SparkTorch(
    inputCol='features',
    labelCol='labels',
    predictionCol='predictions',
    torchObj=torch_obj,
    iters=10,
    verbose=1
)

print("Ran successfully")

data_train = spark.read.option("inferSchema", "true") \
    .option("maxColumns", 64 * 64 * 4).csv(data_train_path)

features_size = 64 * 64 * 3
va1 = VectorAssembler(inputCols=data_train.columns[:features_size], outputCol='features')
va2 = VectorAssembler(inputCols=data_train.columns[features_size:], outputCol='labels')

p = Pipeline(stages=[va1, va2, spark_model]).fit(data_train)
p.save('unet')
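# For completeness, a saved SparkTorch pipeline can be reloaded for inference.
# A minimal sketch, assuming sparktorch's PysparkPipelineWrapper is available
# (as in its documented examples):
from pyspark.ml.pipeline import PipelineModel
from sparktorch import PysparkPipelineWrapper

loaded_pipeline = PysparkPipelineWrapper.unwrap(PipelineModel.load('unet'))
predictions = loaded_pipeline.transform(data_train)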
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.pipeline import Pipeline, PipelineModel
from sparkflow.pipeline_util import PysparkPipelineWrapper

# Continuation of a SparkAsyncDL example: mg, vector_assembler, encoder, df and
# adam_config are assumed to be set up as in the earlier snippet, and the leading
# constructor argument (inputCol) is assumed to match it as well.
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfOptimizer='adam',
    miniBatchSize=300,
    miniStochasticIters=1,
    shufflePerIter=True,
    iters=50,
    predictionCol='predicted',
    labelCol='labels',
    partitions=4,
    verbose=1,
    optimizerOptions=adam_config)

# Create and save the Pipeline
p = Pipeline(stages=[vector_assembler, encoder, spark_model]).fit(df)
p.save('simple_dnn')

# Example of loading the pipeline
loaded_pipeline = PysparkPipelineWrapper.unwrap(PipelineModel.load('simple_dnn'))

# Run predictions and evaluation
predictions = loaded_pipeline.transform(df)
evaluator = MulticlassClassificationEvaluator(
    labelCol="_c0", predictionCol="predicted", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
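# adam_config above is assumed to have been built earlier in the script. A
# minimal sketch, assuming sparkflow exposes a build_adam_config helper in
# graph_utils (as in its documented examples); the values here are illustrative.
from sparkflow.graph_utils import build_adam_config

adam_config = build_adam_config(learning_rate=0.0001, beta1=0.9, beta2=0.999)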