def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage, data):
    """Assemble a Spark ML pipeline from the given stages, verify it survives a
    save/load round-trip, fit it on ``data``, round-trip the fitted model the
    same way, and return the re-loaded ``PipelineModel``.

    :param idf: fitted-stage producing TF-IDF features (output column is pruned).
    :param hashingTF: term-frequency hashing stage (output column is pruned).
    :param stopWordsRemover: stop-word removal stage (output column is pruned).
    :param tokenizer: tokenization stage (output column is pruned).
    :param algoStage: the estimator/algorithm stage placed after featurization.
    :param data: training DataFrame passed to ``Pipeline.fit``.
    :return: the ``PipelineModel`` loaded back from disk after training.
    """
    # Drop every intermediate feature column so the final model's output
    # carries only the prediction-related columns.
    pruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol(),
    ])

    # Wire all stages, in execution order, into a single pipeline.
    textPipeline = Pipeline(
        stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, pruner])

    # Exercise pipeline export/import. On systems without HDFS & Hadoop this
    # stores the pipeline as a local file in the current directory; with HDFS
    # available it goes to the current user's HDFS home directory. Absolute
    # paths work as well. The same holds for the model export/import below.
    pipelinePath = "file://" + os.path.abspath("../build/pipeline")
    textPipeline.write().overwrite().save(pipelinePath)
    reloadedPipeline = Pipeline.load(pipelinePath)

    # Train using the re-loaded pipeline, then round-trip the fitted model too.
    modelPath = "file://" + os.path.abspath("../build/model")
    fittedModel = reloadedPipeline.fit(data)
    fittedModel.write().overwrite().save(modelPath)
    return PipelineModel.load(modelPath)
outputCol="wordToIndex", numFeatures=1 << 10)  # tail of the HashingTF(...) constructor opened above; 2^10 hash buckets

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300,  # 5 minutes
                   predictionCol="label")

## Remove all helper columns so only the prediction-related output remains
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages, in execution order
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner])

## Train the pipeline model
data = load()
model = pipeline.fit(data)

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold = 0.5):
    # Wrap the single message into a one-row DataFrame with a "text" column
    # and run it through the fitted pipeline model.
    # NOTE(review): hamThreshold is presumably the probability cutoff applied
    # to the prediction further down — that code is outside this view; confirm.
    smsTextDF = spark.createDataFrame([(smsText,)], ["text"])  # create one element tuple
    prediction = model.transform(smsTextDF)