from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder


def preprocess(data, data_all, num_ints, num_cats, encoder_pipeline=None):
    # Index category strings
    string_indexers = [StringIndexer(inputCol=str(col), outputCol='%d_idx' % col)
                       for col in range(1 + num_ints, 1 + num_ints + num_cats)]

    # One-hot encode category features
    encoders = [OneHotEncoder(dropLast=True, inputCol='%d_idx' % col, outputCol='%d_cat' % col)
                for col in range(1 + num_ints, 1 + num_ints + num_cats)]

    # Build and fit the encoder pipeline on the full dataset, unless an
    # already-fitted pipeline was passed in
    if not encoder_pipeline:
        encoder_pipeline = Pipeline(stages=string_indexers + encoders).fit(data_all)

    results = encoder_pipeline.transform(data)
    return results, encoder_pipeline
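The second return value lets the same fitted encoders be reused on a held-out split, so test categories are indexed consistently with training. A rough usage sketch follows; the DataFrame names, the union step, and the 13/26 feature counts are assumptions for illustration, not values from the original pipeline.

# Hypothetical usage sketch: train_df, test_df, and the feature counts are placeholders.
data_all = train_df.union(test_df)                        # let the indexers see every category
train_enc, fitted = preprocess(train_df, data_all, num_ints=13, num_cats=26)
test_enc, _ = preprocess(test_df, data_all, num_ints=13, num_cats=26,
                         encoder_pipeline=fitted)         # reuse the already-fitted encoders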
def test_save_pipeline(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=20,
        partitions=2,
        predictionCol='predicted',
        labelCol='label'
    )
    p = Pipeline(stages=[spark_model]).fit(processed)
    p.write().overwrite().save('example_pipeline')
    p = PysparkPipelineWrapper.unwrap(
        PipelineModel.load('example_pipeline'))
    data = p.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(data)
    self.assertTrue(nb_errors < len(data))
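The same pattern applies outside the test suite: the pipeline is saved with the standard MLlib writer, but after PipelineModel.load the stages must be passed through PysparkPipelineWrapper.unwrap before transform can be called, exactly as the test does. A minimal sketch, assuming the usual sparkflow import path, a pipeline previously saved to 'example_pipeline', and a new_df DataFrame with the same 'features' column:

from pyspark.ml.pipeline import PipelineModel
from sparkflow.pipeline_util import PysparkPipelineWrapper

# Reload a previously saved SparkFlow pipeline and score new data.
# 'example_pipeline' and new_df are placeholders for illustration.
loaded = PysparkPipelineWrapper.unwrap(PipelineModel.load('example_pipeline'))
scored = loaded.transform(new_df).select('predicted')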
encoder = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)

spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfOptimizer='adam',
    miniBatchSize=300,
    miniStochasticIters=1,
    shufflePerIter=True,
    iters=50,
    predictionCol='predicted',
    labelCol='labels',
    partitions=4,
    verbose=1,
    optimizerOptions=adam_config
)

# Create the Pipeline and fit it on the training data
p = Pipeline(stages=[vector_assembler, encoder, spark_model]).fit(df)

# Load the held-out test set
test_df = spark.read.option("inferSchema", "true").csv(
    'gs://kubernetes-spark-project-bucket/mnist_test.csv').orderBy(rand())

# Run predictions and evaluation
predictions = p.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="_c0",
                                              predictionCol="predicted",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test accuracy = %g" % accuracy)
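MulticlassClassificationEvaluator reports a single metric per call, so if more than accuracy is wanted the same predictions DataFrame can simply be re-evaluated with a different metricName. A small sketch reusing the objects above:

# Optional: an additional metric on the same predictions (sketch; reuses the objects above).
f1_evaluator = MulticlassClassificationEvaluator(labelCol="_c0",
                                                 predictionCol="predicted",
                                                 metricName="f1")
print("Test F1 = %g" % f1_evaluator.evaluate(predictions))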
)

# Setup features
vector_assembler = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')

# Demonstration of some options. Not all are required.
# Note: this uses the barrier execution mode, which is sensitive to the number of partitions
spark_model = SparkTorch(
    inputCol='features',
    labelCol='_c0',
    predictionCol='predictions',
    torchObj=torch_obj,
    iters=100,
    verbose=1,
    miniBatch=128,
    earlyStopPatience=20,
    validationPct=0.2
)

# Create the Pipeline and fit it on the training data
p = Pipeline(stages=[vector_assembler, spark_model]).fit(df)

# Run predictions and evaluation on the training data
predictions = p.transform(df).persist()

evaluator = MulticlassClassificationEvaluator(
    labelCol="_c0", predictionCol="predictions", metricName="accuracy")

accuracy = evaluator.evaluate(predictions)
print("Train accuracy = %g" % accuracy)
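The stray closing parenthesis at the top of this listing ends a torch_obj = serialize_torch_obj(...) call whose beginning is not shown in the excerpt. For orientation only, here is a minimal sketch of how SparkTorch packages a model this way; the network architecture, loss, and learning rate below are assumptions, not the configuration used above.

import torch
import torch.nn as nn
from sparktorch import serialize_torch_obj

# Hypothetical network for 784-pixel MNIST rows; not the model from the excerpt above.
network = nn.Sequential(
    nn.Linear(784, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)

# serialize_torch_obj bundles the model, loss, and optimizer settings for SparkTorch.
torch_obj = serialize_torch_obj(
    model=network,
    criterion=nn.CrossEntropyLoss(),
    optimizer=torch.optim.Adam,
    lr=0.001
)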