Example #1
    def test_pipeline(self, bag):
        from pyspark.ml.pipeline import Pipeline
        # create the pipeline, save it, and load it back
        pth = "/tmp/spatial-join"
        new_p = Pipeline().setStages([bag["transformer"]])
        new_p.write().overwrite().save(pth)
        saved_p = Pipeline.load(pth)

        # check transformations
        inp = bag["input"]
        exp = bag["expected"]
        check(new_p.fit(inp), inp, exp)
        check(saved_p.fit(inp), inp, exp)
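The check helper and the bag fixture come from the surrounding test suite and are not shown above. A minimal sketch of what a check-style helper could look like, assuming it simply runs the fitted model on the input and compares the collected rows with the expected output:

# Hypothetical helper (not part of the original suite): run the fitted
# PipelineModel on the input DataFrame and compare rows with the expected output.
def check(model, input_df, expected_df):
    result = model.transform(input_df)
    assert result.collect() == expected_df.collect()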
Example #2
    def test_save_pipeline(self):
        processed = self.generate_random_data()
        mg = build_graph(SparkFlowTests.create_random_model)
        spark_model = SparkAsyncDL(inputCol='features',
                                   tensorflowGraph=mg,
                                   tfInput='x:0',
                                   tfLabel='y:0',
                                   tfOutput='outer/Sigmoid:0',
                                   tfOptimizer='adam',
                                   tfLearningRate=.1,
                                   iters=20,
                                   partitions=2,
                                   predictionCol='predicted',
                                   labelCol='label')
        p = Pipeline(stages=[spark_model]).fit(processed)
        p.write().overwrite().save('example_pipeline')
        p = PysparkPipelineWrapper.unwrap(
            PipelineModel.load('example_pipeline'))
        data = p.transform(processed).take(10)
        nb_errors = SparkFlowTests.calculate_errors(data)
        self.assertTrue(nb_errors < len(data))
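create_random_model is a graph-building function defined elsewhere in the test class and passed to build_graph. A minimal sketch of what it might look like, mirroring the small_model pattern shown in a later example; the 10-feature input and layer sizes are assumptions:

import tensorflow as tf

def create_random_model():
    # Placeholders matching tfInput='x:0' and tfLabel='y:0' above; the input width is assumed
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    hidden = tf.layers.dense(x, 20, activation=tf.nn.relu)
    # name='outer' plus a sigmoid activation yields the 'outer/Sigmoid:0' tensor used as tfOutput
    out = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid, name='outer')
    loss = tf.losses.mean_squared_error(y, out)
    return loss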
Example #3
    mg = build_graph(small_model)
    # Assemble the features and one-hot encode the label
    va = VectorAssembler(inputCols=final_df.columns[1:151],
                         outputCol='features')
    encoded = OneHotEncoder(inputCol='result',
                            outputCol='labels',
                            dropLast=False)
    adam_config = build_adam_config(learning_rate=0.001,
                                    beta1=0.9,
                                    beta2=0.999)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='out:0',
                               tfLearningRate=.001,
                               iters=20,
                               predictionCol='predicted',
                               labelCol='labels',
                               verbose=1,
                               optimizerOptions=adam_config)

    ckptpath = os.path.join(ckptdir, task)
    print('save model in %s' % ckptpath)
    p = Pipeline(stages=[va, encoded, spark_model]).fit(final_df)
    p.write().overwrite().save(ckptpath)

    print('===task all done===')
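To reuse the checkpointed pipeline later, it can be reloaded with the same unwrap pattern used in the other examples. A minimal sketch, assuming PysparkPipelineWrapper and PipelineModel are imported as in those examples:

    loaded = PysparkPipelineWrapper.unwrap(PipelineModel.load(ckptpath))
    predictions = loaded.transform(final_df)
    predictions.select('labels', 'predicted').show(10)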
Example #4
    vector_assembler = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')  # inputCols assumed; the original line is truncated (MNIST-style features, as in a later example)

    # Demonstration of some options. Not all are required
    # Note: This uses the barrier execution mode, which is sensitive to the number of partitions
    spark_model = SparkTorch(inputCol='features',
                             labelCol='_c0',
                             predictionCol='predictions',
                             torchObj=torch_obj,
                             iters=50,
                             verbose=1,
                             validationPct=0.2,
                             miniBatch=128)

    # Create and save the Pipeline
    p = Pipeline(stages=[vector_assembler, spark_model]).fit(df)
    p.write().overwrite().save('simple_cnn')

    # Example of loading the pipeline
    loaded_pipeline = PysparkPipelineWrapper.unwrap(
        PipelineModel.load('simple_cnn'))

    # Run predictions and evaluation
    predictions = loaded_pipeline.transform(df).persist()

    evaluator = MulticlassClassificationEvaluator(labelCol="_c0",
                                                  predictionCol="predictions",
                                                  metricName="accuracy")

    accuracy = evaluator.evaluate(predictions)
    print("Train accuracy = %g" % accuracy)
# Cast feature columns to int and keep the 'class' label as a string (loop header assumed; the snippet starts mid-loop)
for f in df.columns:
    if f != 'class':
        df = df.withColumn(f, df[f].cast('int'))
    if f == 'class':
        df = df.withColumn(f, df[f].cast('string'))
df = df.dropna()

train, test = df.randomSplit([0.8, 0.2], seed=0)

class_index = StringIndexer(inputCol='class', outputCol='label')
vector = VectorAssembler(inputCols=feature_cols, outputCol='feature')
model = LinearSVC(featuresCol='feature', labelCol='label')
pipeline = Pipeline(stages=[class_index, vector, model])

pipeline = pipeline.fit(train)
if os.path.exists(MODEL_SAVE_PATH):
    shutil.rmtree(MODEL_SAVE_PATH)
pipeline.write().overwrite().save(MODEL_SAVE_PATH)  # or simply pipeline.save('/to/path')

load_pipeline = PipelineModel.load(MODEL_SAVE_PATH)
test_predict = load_pipeline.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='label')

print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderPR'}))

origin_test_df = df.select(feature_cols)

predict_df = load_pipeline.transform(origin_test_df)
predict_df.show(20)
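If an accuracy figure is wanted in addition to the ROC/PR metrics, the standard multiclass evaluator can be applied to the same labeled predictions. A short sketch; the column names assume the defaults produced by LinearSVC above:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                                  predictionCol='prediction',
                                                  metricName='accuracy')
print(acc_evaluator.evaluate(test_predict))
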
def small_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 10)
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss


df = spark.read.option("inferSchema", "true").csv('mnist_train.csv')
mg = build_graph(small_model)
# Assemble and one hot encode
va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')
encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)

spark_model = SparkAsyncDL(inputCol='features',
                           tensorflowGraph=mg,
                           tfInput='x:0',
                           tfLabel='y:0',
                           tfOutput='out:0',
                           tfLearningRate=.001,
                           iters=20,
                           predictionCol='predicted',
                           labelCol='labels',
                           verbose=1)

p = Pipeline(stages=[va, encoded, spark_model]).fit(df)
p.write().overwrite().save("location")
# Leading arguments assumed to match the earlier SparkAsyncDL examples (the snippet starts mid-call)
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfLearningRate=.001,
    iters=10,
    predictionCol='predicted',
    labelCol='labels',
    verbose=1)



if __name__ == "__main__":

    # Pipeline definition
    # pipe = [va, encoded, spark_model]

    # Train the model
    try:
        import time
        start_time = time.time()
        p = Pipeline(stages=[va, encoded, spark_model]).fit(df)
        print("--- %s seconds ---" % (time.time() - start_time))

        p.write().save("dnn_model")
    except Exception as e:
        print("Error -->", e)

    # Save the model (overwrite variant)
    # p.write().overwrite().save("dnn_model")


    # exec(open("dnn_model.py").read())