Python Pipeline.transform Examples

Programming Language: Python

Namespace/Package Name: pyspark.ml.pipeline

Class/Type: Pipeline

Method/Function: transform

Examples at hotexamples.com: 4

Python Pipeline.transform - 4 examples found. These are the top-rated real-world Python examples of pyspark.ml.pipeline.Pipeline.transform, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Pipeline(30)

fit(30)

write(7)

save(5)

transform(4)

load(3)

setStages(3)

explainParams(2)

getStages(1)

Example #1

Show file

def preprocess(data, data_all, num_ints, num_cats, encoder_pipeline=None):
    """String-index and one-hot encode the categorical columns of ``data``.

    The categorical columns are the ones named ``str(i)`` for every ``i`` in
    ``range(1 + num_ints, 1 + num_ints + num_cats)``. Each is indexed into
    ``"<i>_idx"`` and that index is one-hot encoded (``dropLast=True``) into
    ``"<i>_cat"``.

    Args:
        data: Dataset handed to the pipeline's ``transform``.
        data_all: Dataset used to fit a new pipeline. Only read when
            ``encoder_pipeline`` is ``None``.
        num_ints: Offset of the categorical block; the first categorical
            column is named ``str(1 + num_ints)``. Presumably the number of
            integer feature columns -- confirm against the caller.
        num_cats: Number of consecutive categorical columns to encode.
        encoder_pipeline: An already-fitted pipeline to reuse. When ``None``
            (the default) a new one is fitted on ``data_all``.

    Returns:
        A ``(results, encoder_pipeline)`` tuple: the transformed ``data`` and
        the pipeline that produced it, so callers can reuse the same fitted
        pipeline on other datasets.
    """
    # Compare against None explicitly: a supplied pipeline must never be
    # discarded and refit just because the object happens to evaluate falsy.
    if encoder_pipeline is None:
        cat_cols = range(1 + num_ints, 1 + num_ints + num_cats)

        # Index category strings
        string_indexers = [
            StringIndexer(inputCol=str(col), outputCol='%d_idx' % col)
            for col in cat_cols
        ]

        # One hot encode category features
        encoders = [
            OneHotEncoder(dropLast=True, inputCol='%d_idx' % col, outputCol='%d_cat' % col)
            for col in cat_cols
        ]

        # Indexers must all run before the encoders that consume their output.
        encoder_pipeline = Pipeline(stages=string_indexers + encoders).fit(data_all)

    results = encoder_pipeline.transform(data)
    return results, encoder_pipeline

Example #2

Show file

 def test_save_pipeline(self):
     """Fit a single-stage pipeline, save it, reload it, and score with the reloaded copy.

     Passes when the number of errors reported for the first 10 predicted
     rows is smaller than the number of rows taken.
     """
     save_path = 'example_pipeline'
     frame = self.generate_random_data()
     graph = build_graph(SparkFlowTests.create_random_model)

     estimator_options = dict(
         inputCol='features',
         tensorflowGraph=graph,
         tfInput='x:0',
         tfLabel='y:0',
         tfOutput='outer/Sigmoid:0',
         tfOptimizer='adam',
         tfLearningRate=.1,
         iters=20,
         partitions=2,
         predictionCol='predicted',
         labelCol='label',
     )
     estimator = SparkAsyncDL(**estimator_options)

     # Fit, then write to disk, replacing anything already at save_path.
     fitted = Pipeline(stages=[estimator]).fit(frame)
     fitted.write().overwrite().save(save_path)

     # Predictions come from the reloaded model, not the in-memory one.
     reloaded = PipelineModel.load(save_path)
     restored = PysparkPipelineWrapper.unwrap(reloaded)

     rows = restored.transform(frame).take(10)
     error_count = SparkFlowTests.calculate_errors(rows)
     self.assertTrue(error_count < len(rows))

Example #3

Show file

File: kubernetes-spark-example.py Project: nethrekolli/Processes-and-Tools-for-Scalable-ML-Lifecycle

    # One-hot encode the raw label column '_c0' into 'labels', which the model
    # trains against (dropLast=False keeps one slot per category).
    encoder = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='out:0',
                               tfOptimizer='adam',
                               miniBatchSize=300,
                               miniStochasticIters=1,
                               shufflePerIter=True,
                               iters=50,
                               predictionCol='predicted',
                               labelCol='labels',
                               partitions=4,
                               verbose=1,
                               optimizerOptions=adam_config)

    # Build and fit the Pipeline (the fitted model is not saved in this excerpt)
    p = Pipeline(stages=[vector_assembler, encoder, spark_model]).fit(df)

    # Load the held-out test set in random row order
    test_df = spark.read.option("inferSchema", "true").csv(
        'gs://kubernetes-spark-project-bucket/mnist_test.csv').orderBy(rand())

    # Run predictions and evaluation. The evaluator compares the raw label
    # column '_c0' (not the one-hot 'labels') with the 'predicted' column.
    predictions = p.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(labelCol="_c0",
                                                  predictionCol="predicted",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    # The metric is accuracy, so the error rate is its complement; printing
    # the raw accuracy under the "Test Error" label would report the opposite.
    print("Test Error = %g" % (1.0 - accuracy))

Example #4

Show file

    )

    # Setup features: combine the columns at positions 1..784 of df into a
    # single 'features' vector column (column 0, '_c0', is used as the label below)
    vector_assembler = VectorAssembler(inputCols=df.columns[1:785], outputCol='features')

    # Demonstration of some options. Not all are required
    # Note: This uses the barrier execution mode, which is sensitive to the number of partitions
    spark_model = SparkTorch(
        inputCol='features',
        labelCol='_c0',
        predictionCol='predictions',
        torchObj=torch_obj,
        iters=100,
        verbose=1,
        miniBatch=128,
        earlyStopPatience=20,
        validationPct=0.2
    )

    # Build and fit the Pipeline (the fitted model is not saved in this excerpt)
    p = Pipeline(stages=[vector_assembler, spark_model]).fit(df)

    # Run predictions and evaluation on the same DataFrame the pipeline was
    # fitted on, so the metric below is a training accuracy.
    # NOTE(review): the persisted predictions are never unpersisted in the
    # visible code -- confirm whether that happens later.
    predictions = p.transform(df).persist()
    evaluator = MulticlassClassificationEvaluator(
        labelCol="_c0", predictionCol="predictions", metricName="accuracy")

    accuracy = evaluator.evaluate(predictions)
    print("Train accuracy = %g" % accuracy)