Example #1
0
def preprocess(data, data_all, num_ints, num_cats, encoder_pipeline=None):
    """Index and one-hot encode the categorical columns of ``data``.

    The categorical columns are the ones named ``str(1 + num_ints)`` through
    ``str(num_ints + num_cats)``. Each such column ``c`` is string-indexed
    into ``'<c>_idx'`` and then one-hot encoded into ``'<c>_cat'``.

    Args:
        data: DataFrame to transform.
        data_all: DataFrame the encoding pipeline is fitted on. Only used
            when ``encoder_pipeline`` is not supplied.
        num_ints: Offset of the first categorical column minus one
            (presumably the number of integer feature columns that precede
            the categorical ones -- confirm against the caller).
        num_cats: Number of categorical columns to encode.
        encoder_pipeline: Optional already-fitted pipeline to reuse. When
            given, no new stages are built and nothing is fitted.

    Returns:
        A ``(transformed_data, encoder_pipeline)`` tuple, where
        ``encoder_pipeline`` is either the one passed in or the newly
        fitted one, so callers can reuse it on further data.
    """
    if encoder_pipeline is None:
        # Only build the stages when a pipeline actually has to be fitted;
        # they are pure overhead when a fitted pipeline is reused.
        cat_cols = range(1 + num_ints, 1 + num_ints + num_cats)

        # Index category strings
        string_indexers = [StringIndexer(inputCol=str(col), outputCol='%d_idx' % col)
                           for col in cat_cols]

        # One hot encode category features
        encoders = [OneHotEncoder(dropLast=True, inputCol='%d_idx' % col, outputCol='%d_cat' % col)
                    for col in cat_cols]

        # Build and fit pipeline
        encoder_pipeline = Pipeline(stages=string_indexers + encoders).fit(data_all)

    results = encoder_pipeline.transform(data)
    return results, encoder_pipeline
Example #2
0
 def test_save_pipeline(self):
     """Fit a pipeline, save it, reload it, and check the reloaded predictions.

     The pipeline is written to ``'example_pipeline'`` (overwriting any
     previous copy), loaded back and unwrapped, then used to transform the
     same data it was fitted on. The test passes when fewer than all of the
     first ten rows are counted as errors.
     """
     save_path = 'example_pipeline'

     training_frame = self.generate_random_data()
     graph = build_graph(SparkFlowTests.create_random_model)
     estimator = SparkAsyncDL(
         inputCol='features',
         tensorflowGraph=graph,
         tfInput='x:0',
         tfLabel='y:0',
         tfOutput='outer/Sigmoid:0',
         tfOptimizer='adam',
         tfLearningRate=.1,
         iters=20,
         partitions=2,
         predictionCol='predicted',
         labelCol='label',
     )

     # Fit, then round-trip the fitted model through disk.
     fitted = Pipeline(stages=[estimator]).fit(training_frame)
     fitted.write().overwrite().save(save_path)
     loaded = PipelineModel.load(save_path)
     restored = PysparkPipelineWrapper.unwrap(loaded)

     # Score a small sample with the restored model.
     sample = restored.transform(training_frame).take(10)
     nb_errors = SparkFlowTests.calculate_errors(sample)
     self.assertTrue(nb_errors < len(sample))
    # NOTE(review): this is the tail of a function whose beginning is not
    # visible here; `mg`, `adam_config`, `vector_assembler`, `df` and `spark`
    # are expected to be defined earlier in that function.

    # One-hot encode the `_c0` column into `labels`; dropLast=False keeps a
    # slot for every category instead of dropping the last one.
    encoder = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='out:0',
                               tfOptimizer='adam',
                               miniBatchSize=300,
                               miniStochasticIters=1,
                               shufflePerIter=True,
                               iters=50,
                               predictionCol='predicted',
                               labelCol='labels',
                               partitions=4,
                               verbose=1,
                               optimizerOptions=adam_config)

    # Build and fit the pipeline (it is not saved in the visible code)
    p = Pipeline(stages=[vector_assembler, encoder, spark_model]).fit(df)

    # Load the test set with an inferred schema and order its rows by rand()
    test_df = spark.read.option("inferSchema", "true").csv(
        'gs://kubernetes-spark-project-bucket/mnist_test.csv').orderBy(rand())

    # Run predictions and evaluation. The evaluator compares against the raw
    # `_c0` column, not the one-hot `labels` column the model is trained on.
    predictions = p.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="_c0",
                                                  predictionCol="predicted",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    # The metric is accuracy, so label it as such rather than as an error.
    print("Test accuracy = %g" % (accuracy))
Example #4
0
    )

    # Setup features
    # Columns at positions 1..784 are assembled into a single `features`
    # vector. Presumably position 0 is the `_c0` label column used below and
    # the 784 values are pixel data -- TODO confirm against how `df` is read.
    vector_assembler = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')

    # Demonstration of some options. Not all are required
    # Note: This uses the barrier execution mode, which is sensitive to the number of partitions
    # NOTE(review): `torch_obj` is defined earlier in this function, outside
    # the visible part of the file.
    spark_model = SparkTorch(
        inputCol='features',
        labelCol='_c0',
        predictionCol='predictions',
        torchObj=torch_obj,
        iters=100,
        verbose=1,
        miniBatch=128,
        earlyStopPatience=20,
        validationPct=0.2
    )

    # Build and fit the pipeline (nothing is saved in the visible code)
    p = Pipeline(stages=[vector_assembler, spark_model]).fit(df)

    # Run predictions and evaluation
    # The model is scored on the same `df` it was fitted on, hence the
    # "Train accuracy" label below. The transformed frame is persisted and is
    # not unpersisted anywhere in the visible code.
    predictions = p.transform(df).persist()
    evaluator = MulticlassClassificationEvaluator(
        labelCol="_c0", predictionCol="predictions", metricName="accuracy")

    accuracy = evaluator.evaluate(predictions)
    print("Train accuracy = %g" % accuracy)