def main(inputs):
    """Fit MLP colour classifiers on RGB and LAB features and report F1 scores.

    The CSV at ``inputs`` is read with ``colour_schema`` and split 75/25 into
    training and validation sets. One pipeline classifies directly from the
    R, G, B columns; the other first runs the SQL statement returned by
    ``rgb2lab_query`` and classifies from labL, labA, labB. Each fitted model
    is plotted with ``plot_predictions`` and its validation score is printed.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)
    splits = colours.randomSplit([0.75, 0.25])
    train = splits[0].cache()
    validation = splits[1].cache()

    # Stages and evaluator shared by both pipelines.
    label_indexer = StringIndexer(inputCol='word', outputCol='label')
    mlp = MultilayerPerceptronClassifier(layers=[3, 30, 11])
    f1_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='label', metricName='f1')

    # RGB colours -> word
    rgb_features = VectorAssembler(inputCols=['R', 'G', 'B'],
                                   outputCol='features')
    rgb_model = Pipeline(stages=[rgb_features, label_indexer, mlp]).fit(train)
    score = f1_evaluator.evaluate(rgb_model.transform(validation))
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))

    # RGB colours -> LAB colours -> word
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['word']))
    lab_features = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                   outputCol='features')
    lab_model = Pipeline(
        stages=[to_lab, lab_features, label_indexer, mlp]).fit(train)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    lab_score = f1_evaluator.evaluate(lab_model.transform(validation))
    print('Validation score for LAB model:', lab_score)
def main(inputs):
    """Train and score MLP colour classifiers on RGB and LAB features.

    Reads the CSV at ``inputs`` using ``colour_schema``, keeps 25% of the rows
    for validation, and fits two pipelines that share one label indexer and
    one classifier configuration: the first uses the R, G, B columns directly,
    the second runs the ``rgb2lab_query`` SQL statement first and uses the
    labL, labA, labB columns. Each model is scored with the evaluator's
    default metric, plotted, and its validation score printed.
    """
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Components shared by both pipelines. The classifier is built once; the
    # original constructed an identical second instance that replaced this one.
    word_indexer = StringIndexer(inputCol="word", outputCol="labelCol",
                                 handleInvalid='error')
    classifier = MultilayerPerceptronClassifier(
        maxIter=400, layers=[3, 30, 11], blockSize=1, seed=123,
        labelCol="labelCol")
    evaluator = MulticlassClassificationEvaluator(labelCol="labelCol",
                                                  predictionCol="prediction")

    # Pipeline: RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    predictions = rgb_model.transform(validation)
    score = evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))

    # Pipeline: RGB colours -> LAB colours -> word
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=["word"])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol="features")
    lab_pipeline = Pipeline(
        stages=[sqlTrans, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)

    predictions_lab = lab_model.transform(validation)
    score_lab = evaluator.evaluate(predictions_lab)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for LAB model: %g' % (score_lab, ))
def main(inputs):
    """Fit four colour classifiers and return their test scores.

    The CSV at ``inputs`` (with a header row) is read using ``colour_schema``
    and split 75/25 into training and test sets. A random forest and a
    multilayer perceptron are each trained on RGB features and on the
    lL/lA/lB columns produced by the ``rgb2lab_query`` statement.

    Returns:
        dict mapping each model label ('RGB-forest', 'LAB-forest', 'RGB-MLP',
        'LAB-MLP') to the score from the evaluator's default metric.
    """
    colours = spark.read.csv(inputs, header=True, schema=colour_schema)
    train, test = colours.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()

    # Feature preparation stages.
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['labelword']))
    label_indexer = StringIndexer(inputCol="labelword", outputCol="labelCol",
                                  handleInvalid='error')
    rgb_features = VectorAssembler(inputCols=['R', 'G', 'B'],
                                   outputCol="features")
    lab_features = VectorAssembler(inputCols=['lL', 'lA', 'lB'],
                                   outputCol="features")

    # Classifiers.
    random_forest = RandomForestClassifier(numTrees=22, maxDepth=10,
                                           labelCol="labelCol", seed=42)
    perceptron = MultilayerPerceptronClassifier(
        maxIter=400, layers=[3, 16, 11], blockSize=1, seed=123,
        labelCol="labelCol")

    rgb_prefix = [label_indexer, rgb_features]
    lab_prefix = [to_lab, label_indexer, lab_features]
    candidates = [
        ('RGB-forest', Pipeline(stages=rgb_prefix + [random_forest])),
        ('LAB-forest', Pipeline(stages=lab_prefix + [random_forest])),
        ('RGB-MLP', Pipeline(stages=rgb_prefix + [perceptron])),
        ('LAB-MLP', Pipeline(stages=lab_prefix + [perceptron])),
    ]

    scorer = MulticlassClassificationEvaluator(labelCol="labelCol",
                                               predictionCol="prediction")

    results = {}
    for name, candidate in candidates:
        fitted = candidate.fit(train)
        # Visual representation of the predictions this model makes.
        plot_predictions(fitted, name)
        results[name] = scorer.evaluate(fitted.transform(test))
    return results
def main(inputs):
    """Train RGB and LAB perceptron classifiers and print validation scores.

    Reads the CSV at ``inputs`` with ``colour_schema``, sizes the network's
    output layer from the number of distinct ``word`` values, trains on 75% of
    the rows, and evaluates both models on the remaining 25% with the
    evaluator's default metric. Both models are also plotted.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)

    # Output layer size: one unit per distinct colour word.
    n_labels = colours.select('word').distinct().count()

    # 75% training / 25% validation.
    parts = colours.randomSplit([0.75, 0.25])
    train = parts[0].cache()
    validation = parts[1].cache()

    # Stages shared by both pipelines.
    label_indexer = StringIndexer(inputCol="word", outputCol="label",
                                  handleInvalid="error")
    perceptron = MultilayerPerceptronClassifier(layers=[3, 250, n_labels])

    # Feature stages for each colour space.
    rgb_features = VectorAssembler(inputCols=["R", "G", "B"],
                                   outputCol="features")
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['word']))
    lab_features = VectorAssembler(inputCols=["labL", "labA", "labB"],
                                   outputCol="features")

    # Fit both pipelines on the training set.
    rgb_model = Pipeline(
        stages=[rgb_features, label_indexer, perceptron]).fit(train)
    lab_model = Pipeline(
        stages=[to_lab, lab_features, label_indexer, perceptron]).fit(train)

    # Score both models on the validation set.
    scorer = MulticlassClassificationEvaluator(predictionCol="prediction")
    score_rgb = scorer.evaluate(rgb_model.transform(validation))
    score_lab = scorer.evaluate(lab_model.transform(validation))

    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')

    print('Validation score for RGB model: %g' % (score_rgb, ))
    print('Validation score for LAB model: %g' % (score_lab, ))
def main(inputs):
    """Train RGB and LAB perceptron classifiers and print their accuracy.

    Reads the CSV at ``inputs`` with ``colour_schema`` and splits it 75/25
    into training and validation sets. Both pipelines are fitted on the same
    training set and scored on the same validation set, so the two accuracy
    figures are directly comparable.
    """
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])  # TODO: use a seed here
    train = train.cache()
    validation = validation.cache()

    # Pipeline: RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    word_indexer = StringIndexer(inputCol="word", outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(
        featuresCol="features", labelCol="target", layers=[3, 25, 25])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # Score the RGB model on the validation data.
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation,
                               {evaluator.metricName: "accuracy"})
    print('Validation score for RGB model: %g' % (score, ))
    # Pass the raw string column, matching the LAB call below.
    # NOTE(review): presumably plot_predictions expects the input label
    # column name rather than the indexer output - confirm against its
    # definition.
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    # Pipeline: RGB colours -> LAB colours -> word
    # The SQLTransformer is a pipeline stage, so the pipeline must be fitted
    # on the raw RGB training data. The original pre-transformed the data and
    # then ran the same conversion again inside the pipeline, on a separate
    # random split.
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lrgb_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                     outputCol="LAB")
    lword_indexer = StringIndexer(inputCol="word", outputCol="labTarget",
                                  handleInvalid="error",
                                  stringOrderType="frequencyDesc")
    lclassifier = MultilayerPerceptronClassifier(
        featuresCol="LAB", labelCol="labTarget", layers=[3, 25, 25])
    lrgb_pipeline = Pipeline(
        stages=[sqlTrans, lrgb_assembler, lword_indexer, lclassifier])
    lrgb_model = lrgb_pipeline.fit(train)

    lrgb_validation = lrgb_model.transform(validation)
    # show() prints the rows itself; wrapping it in print() also printed
    # its return value.
    lrgb_validation.show()

    lab_evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol="labTarget")
    lscore = lab_evaluator.evaluate(lrgb_validation,
                                    {lab_evaluator.metricName: "accuracy"})
    print('Validation score for LAB model: %g' % (lscore, ))
    plot_predictions(lrgb_model, 'LAB', labelCol='word')
def main(inputs):
    """Train RGB and LAB perceptron classifiers and print validation scores.

    Reads the CSV at ``inputs`` with ``colour_schema`` and splits it 75/25
    into training and validation sets. Both pipelines are fitted on the same
    raw training data; the LAB pipeline performs the colour-space conversion
    itself through its ``SQLTransformer`` stage.
    """
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Stages and evaluator shared by both pipelines.
    word_indexer = StringIndexer(inputCol='word', outputCol='new_word')
    classifier = MultilayerPerceptronClassifier(labelCol="new_word",
                                                layers=[3, 30, 11])
    vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol='new_word')

    # Pipeline: RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol='features')
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    rgb_validation = rgb_model.transform(validation)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    score = vali_evaluator.evaluate(rgb_validation)
    print('Validation score for RGB model: %g' % (score, ))

    # Pipeline: RGB colours -> LAB colours -> word
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformer = SQLTransformer(statement=rgb_to_lab_query)
    new_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')
    new_pipeline = Pipeline(
        stages=[sql_transformer, new_assembler, word_indexer, classifier])
    # Fit on the raw training data: sql_transformer is already the first
    # pipeline stage, so transforming beforehand would apply the conversion
    # twice during training but only once at validation time.
    new_model = new_pipeline.fit(train)

    new_validation = new_model.transform(validation)
    new_score = vali_evaluator.evaluate(new_validation)
    # Reported once; the original repeated this line three times.
    print('Validation score for LAB model:', new_score)
    plot_predictions(new_model, 'LAB', labelCol="word")
def main(inputs):
    """Fit forest and perceptron classifiers on RGB and LAB features.

    Reads the CSV at ``inputs`` (with a header row) using ``colour_schema``,
    splits it 80/20 into training and test sets, then fits, plots and scores
    four pipelines. Each label is printed next to its score from the
    evaluator's default metric.
    """
    colours = spark.read.csv(inputs, header=True, schema=colour_schema)
    # Output layer size for the perceptron: one unit per distinct label.
    n_labels = colours.select('labelword').distinct().count()

    # Feature preparation stages.
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['labelword']))
    rgb_features = VectorAssembler(inputCols=["R", "G", "B"],
                                   outputCol="features")
    lab_features = VectorAssembler(inputCols=["lL", "lA", "lB"],
                                   outputCol="features")
    label_indexer = StringIndexer(inputCol="labelword",
                                  outputCol="color_index",
                                  handleInvalid='error')

    # Classifiers.
    forest = RandomForestClassifier(numTrees=25, maxDepth=20,
                                    labelCol="color_index", seed=42)
    perceptron = MultilayerPerceptronClassifier(
        labelCol="color_index", maxIter=100, layers=[3, 250, n_labels])

    rgb_prefix = [rgb_features, label_indexer]
    lab_prefix = [to_lab, lab_features, label_indexer]
    candidates = [
        ('RGB-forest', Pipeline(stages=rgb_prefix + [forest])),
        ('LAB-forest', Pipeline(stages=lab_prefix + [forest])),
        ('RGB-MLP', Pipeline(stages=rgb_prefix + [perceptron])),
        ('LAB-MLP', Pipeline(stages=lab_prefix + [perceptron])),
    ]

    scorer = MulticlassClassificationEvaluator(predictionCol="prediction",
                                               labelCol='color_index')

    # Split data into training and testing sets.
    parts = colours.randomSplit([0.8, 0.2])
    train = parts[0].cache()
    test = parts[1].cache()

    for name, candidate in candidates:
        fitted = candidate.fit(train)
        # Visual representation of the predictions this model makes.
        plot_predictions(fitted, name)
        print(name, scorer.evaluate(fitted.transform(test)))
def main(inputs):
    """Load and split the colour data; the pipelines are not implemented yet.

    Reads the CSV at ``inputs`` with ``colour_schema``, caches a 75/25
    train/validation split, and builds the RGB-to-LAB query. No model is
    trained or scored; the remaining steps are recorded as TODO notes.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)
    train, validation = (
        part.cache() for part in colours.randomSplit([0.75, 0.25]))

    # TODO: create a pipeline to predict RGB colours -> word
    # rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    # rgb_model = rgb_pipeline.fit(train)

    # TODO: create an evaluator and score the validation data
    # plot_predictions(rgb_model, 'RGB', labelCol='word')
    # print('Validation score for RGB model: %g' % (score, ))

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=[])
def main(inputs):
    """Compare MLPC and LogReg classifiers on RGB and LAB colour features.

    Reads the CSV at ``inputs`` with ``colour_schema``, makes a seeded 75/25
    train/validation split, and delegates training and scoring to
    ``rgb_classify`` and ``lab_classify``. The four resulting scores are
    printed at the end.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)
    parts = colours.randomSplit([0.75, 0.25], seed=42)
    train = parts[0].cache()
    validation = parts[1].cache()

    # RGB feature space.
    rgb_mlpc = rgb_classify("MLPC", train, validation, "MLPC")
    rgb_logreg = rgb_classify("LogReg", train, validation, "LogReg")

    # LAB feature space, using the query built by rgb2lab_query.
    lab_query = rgb2lab_query(passthrough_columns=['word'])
    lab_mlpc = lab_classify("MLPC", train, validation, lab_query, "MLPC")
    lab_logreg = lab_classify("LogReg", train, validation, lab_query,
                              "LogReg")

    print('Accuracy for RGB model using MultilayerPerceptronClassifier: %g' % (rgb_mlpc, ))
    print('Accuracy for RGB model using LogisticRegression: %g' % (rgb_logreg, ))
    print('Accuracy for LAB model using MultilayerPerceptronClassifier: %g' % (lab_mlpc, ))
    print('Accuracy for LAB model using LogisticRegression: %g' % (lab_logreg, ))
def main(inputs):
    """Train RGB and LAB perceptron classifiers and print their accuracy.

    Reads the CSV at ``inputs`` with ``colour_schema`` and makes a seeded
    75/25 train/validation split. Both pipelines share the same label
    indexer, classifier and evaluator; they differ only in how the feature
    vector is built. Each model's validation accuracy is printed and its
    predictions are plotted.
    """
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25], seed=110)
    train = train.cache()
    validation = validation.cache()

    # Components shared by both pipelines.
    word_indexer = StringIndexer(inputCol="word", outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(
        maxIter=100, featuresCol="features", labelCol="target",
        layers=[3, 25, 25], seed=120)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")

    # Evaluating RGB colour space
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation,
                               {evaluator.metricName: "accuracy"})
    print('Validation score for RGB model: %g' % (score, ))
    # Pass the raw string column, matching the LAB call below.
    # NOTE(review): presumably plot_predictions expects the input label
    # column name rather than the indexer output - confirm against its
    # definition.
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    # Evaluating LAB colour space
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol="features")
    lab_pipeline = Pipeline(
        stages=[sqlTrans, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    lab_validation = lab_model.transform(validation)
    labscore = evaluator.evaluate(lab_validation,
                                  {evaluator.metricName: "accuracy"})
    print('Validation score for LAB model: %g' % (labscore, ))
    plot_predictions(lab_model, 'LAB', labelCol='word')
def main(inputs):
    """Train RGB and LAB perceptron classifiers and report their F1 scores.

    Reads the CSV at ``inputs`` with ``colour_schema``, trains both pipelines
    on 75% of the rows, shows the validation predictions of each, plots both
    models, and prints the two validation scores.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)
    parts = colours.randomSplit([0.75, 0.25])
    train = parts[0].cache()
    validation = parts[1].cache()

    # Stages shared by both pipelines.
    label_indexer = StringIndexer(inputCol='word', outputCol='indexed')
    perceptron = MultilayerPerceptronClassifier(labelCol='indexed',
                                                layers=[3, 30, 11])

    # Feature stages: raw RGB, or the columns produced by the LAB query.
    rgb_features = VectorAssembler(inputCols=['R', 'G', 'B'],
                                   outputCol='features')
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['word']))
    lab_features = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                   outputCol='features')

    rgb_model = Pipeline(
        stages=[rgb_features, label_indexer, perceptron]).fit(train)
    lab_model = Pipeline(
        stages=[to_lab, lab_features, label_indexer, perceptron]).fit(train)

    rgb_predictions = rgb_model.transform(validation)
    lab_predictions = lab_model.transform(validation)
    rgb_predictions.show()
    lab_predictions.show()

    # Both models are scored with the same F1 evaluator configuration.
    f1_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='indexed', metricName='f1')
    score = f1_evaluator.evaluate(rgb_predictions)
    lab_score = f1_evaluator.evaluate(lab_predictions)

    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    print('Validation score for LAB model:', lab_score)
def main(inputs):
    """Fit forest and perceptron classifiers on RGB and LAB features.

    Reads the CSV at ``inputs`` (with a header row) using ``colour_schema``,
    makes a seeded 80/20 train/test split, then fits, plots and scores four
    pipelines. Each label is printed next to its score from the evaluator's
    default metric.
    """
    colours = spark.read.csv(inputs, header=True, schema=colour_schema)

    # Feature preparation stages.
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['labelword']))
    label_indexer = StringIndexer(inputCol="labelword", outputCol="indexed",
                                  handleInvalid='error')
    rgb_features = VectorAssembler(inputCols=["R", "G", "B"],
                                   outputCol="features")
    lab_features = VectorAssembler(inputCols=["lL", "lA", "lB"],
                                   outputCol="features")

    scorer = MulticlassClassificationEvaluator(predictionCol="prediction",
                                               labelCol="indexed")

    # Split data into training and testing sets.
    parts = colours.randomSplit([0.8, 0.2], seed=1234)
    train = parts[0].cache()
    test = parts[1].cache()

    # Classifiers.
    forest = RandomForestClassifier(featuresCol="features", numTrees=30,
                                    labelCol="indexed", seed=42)
    perceptron = MultilayerPerceptronClassifier(
        featuresCol="features", labelCol="indexed", layers=[3, 90, 90, 11])

    rgb_prefix = [label_indexer, rgb_features]
    lab_prefix = [to_lab, label_indexer, lab_features]
    candidates = [
        ('RGB-forest', Pipeline(stages=rgb_prefix + [forest])),
        ('RGB-MLP', Pipeline(stages=rgb_prefix + [perceptron])),
        ('LAB-forest', Pipeline(stages=lab_prefix + [forest])),
        ('LAB-MLP', Pipeline(stages=lab_prefix + [perceptron])),
    ]

    for name, candidate in candidates:
        fitted = candidate.fit(train)
        test_predictions = fitted.transform(test)
        # Visual representation of the predictions this model makes.
        plot_predictions(fitted, name)
        print(name, scorer.evaluate(test_predictions))
def main(inputs):
    """Train RGB and LAB perceptron classifiers and print validation scores.

    Reads the CSV at ``inputs`` with ``colour_schema`` and trains on 75% of
    the rows. The network's output layer is sized from the number of distinct
    ``word`` values in the full dataset. Both models are plotted before the
    two scores (evaluator's default metric) are printed.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)
    parts = colours.randomSplit([0.75, 0.25])
    train = parts[0].cache()
    validation = parts[1].cache()

    # Stages and evaluator shared by both pipelines.
    label_indexer = StringIndexer(inputCol="word", outputCol="label",
                                  handleInvalid='error')
    n_labels = colours.select('word').distinct().count()
    perceptron = MultilayerPerceptronClassifier(layers=[3, 300, n_labels])
    scorer = MulticlassClassificationEvaluator(predictionCol="prediction")

    # RGB colours -> word
    rgb_features = VectorAssembler(inputCols=["R", "G", "B"],
                                   outputCol="features")
    rgb_model = Pipeline(
        stages=[rgb_features, label_indexer, perceptron]).fit(train)
    rgb_score = scorer.evaluate(rgb_model.transform(validation))
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    # RGB colours -> LAB colours -> word
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['word']))
    lab_features = VectorAssembler(inputCols=["labL", "labA", "labB"],
                                   outputCol="features")
    lab_model = Pipeline(
        stages=[to_lab, lab_features, label_indexer, perceptron]).fit(train)
    lab_score = scorer.evaluate(lab_model.transform(validation))
    plot_predictions(lab_model, 'LAB', labelCol='word')

    print('Validation score for RGB model: %g' % (rgb_score, ))
    print('Validation score for LAB model: %g' % (lab_score, ))
def main(inputs):
    """Fit RGB and LAB perceptron classifiers and return validation scores.

    Reads the CSV at ``inputs`` with ``colour_schema``, trains both pipelines
    on 75% of the rows, scores them on the remaining 25% with the evaluator's
    default metric, and plots each model.

    Returns:
        dict mapping 'RGB' and 'LAB' to the corresponding validation score.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)
    lab_statement = rgb2lab_query(passthrough_columns=['word'])

    parts = colours.randomSplit([0.75, 0.25])
    train = parts[0].cache()
    validation = parts[1].cache()

    # Pipeline stages.
    rgb_features = VectorAssembler(inputCols=["R", "G", "B"],
                                   outputCol="features")
    lab_features = VectorAssembler(inputCols=["labL", "labA", "labB"],
                                   outputCol="features")
    label_indexer = StringIndexer(inputCol="word", outputCol="label",
                                  handleInvalid="error")
    perceptron = MultilayerPerceptronClassifier(layers=[3, 30, 11])
    to_lab = SQLTransformer(statement=lab_statement)

    rgb_pipeline = Pipeline(stages=[rgb_features, label_indexer, perceptron])
    lab_pipeline = Pipeline(
        stages=[to_lab, lab_features, label_indexer, perceptron])

    scorer = MulticlassClassificationEvaluator(predictionCol="prediction")

    results = {}
    for name, candidate in (('RGB', rgb_pipeline), ('LAB', lab_pipeline)):
        fitted = candidate.fit(train)
        results[name] = scorer.evaluate(fitted.transform(validation))
        plot_predictions(fitted, name, labelCol='word')
    return results
def main(inputs):
    """Train RGB and LAB perceptron classifiers and print their accuracy.

    Reads the CSV at ``inputs`` with ``colour_schema`` and trains on 75% of
    the rows. Along the way it shows the training data and each model's
    validation predictions, plots each model, and prints each validation
    accuracy.
    """
    colours = spark.read.csv(inputs, schema=colour_schema)
    parts = colours.randomSplit([0.75, 0.25])
    train = parts[0].cache()
    train.show()
    validation = parts[1].cache()

    # Stages shared by both pipelines.
    label_indexer = StringIndexer(inputCol="word", outputCol="label")
    # Layers were [3, 25, 25], but updated -GB
    perceptron = MultilayerPerceptronClassifier(layers=[3, 30, 11])

    # RGB colours -> word
    rgb_features = VectorAssembler(inputCols=["R", "G", "B"],
                                   outputCol="features")
    rgb_model = Pipeline(
        stages=[rgb_features, label_indexer, perceptron]).fit(train)
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    rgb_predictions = rgb_model.transform(validation)
    rgb_predictions.show()
    rgb_accuracy = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='label', metricName='accuracy')
    rgb_score = rgb_accuracy.evaluate(rgb_predictions)
    print('Validation score for RGB model:', rgb_score)

    # RGB colours -> LAB colours -> word
    to_lab = SQLTransformer(
        statement=rgb2lab_query(passthrough_columns=['word']))
    lab_features = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                   outputCol='features')
    lab_model = Pipeline(
        stages=[to_lab, lab_features, label_indexer, perceptron]).fit(train)
    plot_predictions(lab_model, 'LAB', labelCol='word')

    lab_predictions = lab_model.transform(validation)
    lab_predictions.show()
    lab_accuracy = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='label', metricName='accuracy')
    lab_score = lab_accuracy.evaluate(lab_predictions)
    print('Validation score for LAB model:', lab_score)
def main(inputs): data = spark.read.csv(inputs, header=True, schema=colour_schema) lab_query = rgb2lab_query(passthrough_columns=['labelword']) # TODO: actually build the components for the pipelines, and the pipelines. #indexer = #rgb_assembler = models = [ #('RGB-forest', Pipeline(stages=[])), #('LAB-forest', Pipeline(stages=[])), #('RGB-MLP', Pipeline(stages=[])), #('LAB-MLP', Pipeline(stages=[])), ] # TODO: need an evaluator #evaluator = # TODO: split data into training and testing #train, test = train = train.cache() test = test.cache() for label, pipeline in models: