def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage,
                       data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol()
    ])

    ## Create the pipeline by defining all the stages
    pipeline = Pipeline(stages=[
        tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner
    ])

    ## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores
    ## the pipeline to a local file in the current directory. Where HDFS & Hadoop are available, it stores the pipeline
    ## in the HDFS home directory of the current user. Absolute paths can be used as well. The same holds for the model
    ## import/export below.
    pipelinePath = "file://" + os.path.abspath("../build/pipeline")
    pipeline.write().overwrite().save(pipelinePath)
    loaded_pipeline = Pipeline.load(pipelinePath)

    ## Train the pipeline model
    modelPath = "file://" + os.path.abspath("../build/model")
    model = loaded_pipeline.fit(data)
    model.write().overwrite().save(modelPath)
    return PipelineModel.load(modelPath)
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)
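## (With numFeatures=1 << 10, each token hashes into one of 1024 buckets,
## so "wordToIndex" is a 1024-dimensional sparse term-frequency vector;
## distinct words may collide in the same bucket.)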

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)
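## (minDocFreq=4 zeroes out the IDF weight of any hash bucket that appears
## in fewer than 4 training documents, effectively dropping rare terms.)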

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300, # 5 minutes
                   predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[
    tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner
])

## Load the data and train the pipeline model
data = load()
model = pipeline.fit(data)
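## The trainPipelineModel helper defined above fits the same stages while
## also exercising pipeline and model export/import; an equivalent
## alternative to the inline fit (a sketch using the names defined above):
# model = trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer,
#                            automl, data)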

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold=0.5):
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"])  # single-row DataFrame from a one-element tuple
    prediction = model.transform(smsTextDF)