def test_pipeline(self, bag):
    from pyspark.ml.pipeline import Pipeline

    # create and save and load
    pth = "/tmp/spatial-join"
    new_p = Pipeline().setStages([bag["transformer"]])
    new_p.write().overwrite().save(pth)
    saved_p = Pipeline.load(pth)

    # check transformations
    inp = bag["input"]
    exp = bag["expected"]
    check(new_p.fit(inp), inp, exp)
    check(saved_p.fit(inp), inp, exp)
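The `check` helper is not shown in this test. A minimal sketch of what it might look like, assuming it only transforms the input with the fitted PipelineModel and compares against the expected DataFrame (the name and behavior here are hypothetical):

def check(model, inp, exp):
    # Hypothetical helper: transform the input with the fitted PipelineModel and
    # assert the output matches the expected DataFrame regardless of row order.
    got = model.transform(inp)
    assert got.count() == exp.count()
    assert got.exceptAll(exp).count() == 0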
def test_save_pipeline(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=20,
        partitions=2,
        predictionCol='predicted',
        labelCol='label')

    p = Pipeline(stages=[spark_model]).fit(processed)
    p.write().overwrite().save('example_pipeline')
    p = PysparkPipelineWrapper.unwrap(PipelineModel.load('example_pipeline'))
    data = p.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(data)
    self.assertTrue(nb_errors < len(data))
mg = build_graph(small_model)

# Assemble and one hot encode
va = VectorAssembler(inputCols=final_df.columns[1:151], outputCol='features')
encoded = OneHotEncoder(inputCol='result', outputCol='labels', dropLast=False)
adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfLearningRate=.001,
    iters=20,
    predictionCol='predicted',
    labelCol='labels',
    verbose=1,
    optimizerOptions=adam_config)

ckptpath = os.path.join(ckptdir, task)
print('save model in', ckptpath)
p = Pipeline(stages=[va, encoded, spark_model]).fit(final_df)
p.write().overwrite().save(ckptpath)
print('=== task all done ===')
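The snippet above stops at saving the checkpoint. A minimal sketch of reloading it for inference, assuming the same `ckptpath` and `final_df` and the `PysparkPipelineWrapper` used in the other snippets:

# Reload the saved checkpoint and unwrap the sparkflow stages before scoring
loaded = PysparkPipelineWrapper.unwrap(PipelineModel.load(ckptpath))
loaded.transform(final_df).select('predicted', 'labels').show(10)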
    outputCol='features')

# Demonstration of some options. Not all are required
# Note: This uses the barrier execution mode, which is sensitive to the number of partitions
spark_model = SparkTorch(
    inputCol='features',
    labelCol='_c0',
    predictionCol='predictions',
    torchObj=torch_obj,
    iters=50,
    verbose=1,
    validationPct=0.2,
    miniBatch=128)

# Create and save the Pipeline
p = Pipeline(stages=[vector_assembler, spark_model]).fit(df)
p.write().overwrite().save('simple_cnn')

# Example of loading the pipeline
loaded_pipeline = PysparkPipelineWrapper.unwrap(PipelineModel.load('simple_cnn'))

# Run predictions and evaluation
predictions = loaded_pipeline.transform(df).persist()
evaluator = MulticlassClassificationEvaluator(
    labelCol="_c0", predictionCol="predictions", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Train accuracy = %g" % accuracy)
for f in df.columns:  # assumed loop: cast columns to int, keep the class label as a string
    df = df.withColumn(f, df[f].cast('int'))
    if f == 'class':
        df = df.withColumn(f, df[f].cast('string'))
df = df.dropna()

train, test = df.randomSplit([0.8, 0.2], seed=0)
class_index = StringIndexer(inputCol='class', outputCol='label')
vector = VectorAssembler(inputCols=feature_cols, outputCol='feature')
model = LinearSVC(featuresCol='feature', labelCol='label')
pipeline = Pipeline(stages=[class_index, vector, model])
pipeline = pipeline.fit(train)

# Save the fitted pipeline to MODEL_SAVE_PATH, then load it back
if os.path.exists(MODEL_SAVE_PATH):
    shutil.rmtree(MODEL_SAVE_PATH)
pipeline.write().overwrite().save(MODEL_SAVE_PATH)  # pipeline.save('/to/path')
load_pipeline = PipelineModel.load(MODEL_SAVE_PATH)

# Evaluate on the held-out split
test_predict = load_pipeline.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderPR'}))

origin_test_df = df.select(feature_cols)
predict_df = load_pipeline.transform(origin_test_df)
predict_df.show(20)
def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 10)
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss

df = spark.read.option("inferSchema", "true").csv('mnist_train.csv')
mg = build_graph(small_model)

# Assemble and one hot encode
va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')
encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)

spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfLearningRate=.001,
    iters=20,
    predictionCol='predicted',
    labelCol='labels',
    verbose=1)

p = Pipeline(stages=[va, encoded, spark_model]).fit(df)
p.write().overwrite().save("location")
    tfOutput='out:0',
    tfLearningRate=.001,
    iters=10,
    predictionCol='predicted',
    labelCol='labels',
    verbose=1)

if __name__ == "__main__":
    # Pipeline definition
    # pipe = [va, encoded, spark_model]

    # Train the model
    try:
        import time
        start_time = time.time()
        p = Pipeline(stages=[va, encoded, spark_model]).fit(df)
        print("--- %s seconds ---" % (time.time() - start_time))
        p.write().save("dnn_model")
    except Exception as e:
        print("Error --> ", e)

    # Save the model
    # p.write().overwrite().save("dnn_model")
    # exec(open("dnn_model.py").read())