def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    
    # TODO: create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R','G','B'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='label')
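    # layers: 3 input features (R, G, B), one hidden layer of 30 units, 11 output classes (one per colour word)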
    classifier = MultilayerPerceptronClassifier(layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # TODO: create an evaluator and score the validation data
    predictions = rgb_model.transform(validation)
    rgb_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='f1')
    score = rgb_evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])

    
    # TODO: create a pipeline RGB colours -> LAB colours -> word; train and evaluate.
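    # The SQLTransformer applies rgb_to_lab_query to produce labL/labA/labB columns before assembling features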
    lab = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL','labA','labB'], outputCol='features')
    lab_pipeline = Pipeline(stages=[lab, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    lab_predictions = lab_model.transform(validation)
    lab_score = rgb_evaluator.evaluate(lab_predictions)
    print('Validation score for LAB model:', lab_score)
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    
    word_indexer = StringIndexer(inputCol = "word", outputCol = "labelCol", handleInvalid = 'error')
    classifier = MultilayerPerceptronClassifier(maxIter = 400, layers = [3, 30, 11], blockSize = 1, seed = 123, labelCol = "labelCol")
    # TODO: create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols = ['R', 'G', 'B'], outputCol = "features")
    
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)
    
    # TODO: create an evaluator and score the validation data
    evaluator = MulticlassClassificationEvaluator(labelCol = "labelCol" , predictionCol = "prediction")
    
    predictions = rgb_model.transform(validation)
    score = evaluator.evaluate(predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=["word"])
    sqlTrans = SQLTransformer(statement = rgb_to_lab_query)
    # TODO: create a pipeline RGB colours -> LAB colours -> word; train and evaluate.
    lab_assembler = VectorAssembler(inputCols = ['labL', 'labA', 'labB'], outputCol = "features")
    lab_pipeline = Pipeline(stages=[sqlTrans,lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)

    predictions_lab = lab_model.transform(validation)
    score_lab = evaluator.evaluate(predictions_lab)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for LAB model: %g' % (score_lab, ))
Example #3
def main(inputs):
    data = spark.read.csv(inputs, header=True, schema=colour_schema)
    lab_query = rgb2lab_query(passthrough_columns=['labelword'])

    # TODO: actually build the components for the pipelines, and the pipelines.
    indexer = StringIndexer(inputCol="labelword",
                            outputCol="labelCol",
                            handleInvalid='error')

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    lab_assembler = VectorAssembler(inputCols=['lL', 'lA', 'lB'],
                                    outputCol="features")

    forest = RandomForestClassifier(numTrees=22,
                                    maxDepth=10,
                                    labelCol="labelCol",
                                    seed=42)
    mlp = MultilayerPerceptronClassifier(maxIter=400,
                                         layers=[3, 16, 11],
                                         blockSize=1,
                                         seed=123,
                                         labelCol="labelCol")

    sqlTrans = SQLTransformer(statement=lab_query)

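    # Four candidate pipelines: {RGB, LAB} features crossed with {random forest, MLP} classifiers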
    models = [
        ('RGB-forest', Pipeline(stages=[indexer, rgb_assembler, forest])),
        ('LAB-forest',
         Pipeline(stages=[sqlTrans, indexer, lab_assembler, forest])),
        ('RGB-MLP', Pipeline(stages=[indexer, rgb_assembler, mlp])),
        ('LAB-MLP', Pipeline(stages=[sqlTrans, indexer, lab_assembler, mlp])),
    ]

    # TODO: need an evaluator
    evaluator = MulticlassClassificationEvaluator(labelCol="labelCol",
                                                  predictionCol="prediction")

    # TODO: split data into training and testing
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()
    score_dict = dict()
    for label, pipeline in models:
        # TODO: fit the pipeline to create a model
        model = pipeline.fit(train)

        # Output a visual representation of the predictions we're
        # making: uncomment when you have a model working
        plot_predictions(model, label)

        # TODO: predict on the test data
        predictions = model.transform(test)

        # calculate a score
        score = evaluator.evaluate(predictions)
        score_dict[label] = score
    return score_dict
def main(inputs):

    # Read the CSV File
    df = spark.read.csv(inputs, schema=colour_schema)

    # Total label count
    label_num = df.select('word').distinct().count()

    # Split the dataset. Make 75% as training set and the remaining 25% as validation set
    train, validation = df.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Creating pipeline
    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"],
                                    outputCol="features")
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="label",
                                 handleInvalid="error")
    classifier_mpc = MultilayerPerceptronClassifier(layers=[3, 250, label_num])

    # Transformer for the lab pipeline
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=["labL", "labA", "labB"],
                                    outputCol="features")

    # TODO: create a pipeline to predict RGB colours -> word; train and evaluate.

    #  pipeline to predict RGB colours
    rgb_pipeline = Pipeline(
        stages=[rgb_assembler, word_indexer, classifier_mpc])
    lab_pipeline = Pipeline(
        stages=[sqlTrans, lab_assembler, word_indexer, classifier_mpc])

    # Train the model
    rgb_model = rgb_pipeline.fit(train)
    lab_model = lab_pipeline.fit(train)

    # Transform the validation set
    predictions_rgb = rgb_model.transform(validation)
    predictions_lab = lab_model.transform(validation)

    # TODO: create an evaluator and score the validation data

    # Create a Multiclass Classification Evaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

    # Evaluate it on validation data
    score_rgb = evaluator.evaluate(predictions_rgb)
    score_lab = evaluator.evaluate(predictions_lab)

    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')

    # Print the validation scores
    print('Validation score for RGB model: %g' % (score_rgb, ))
    print('Validation score for LAB model: %g' % (score_lab, ))
Example #5
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])  #use seed here
    train = train.cache()
    validation = validation.cache()

    #creating a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    #dataframe1 = rgb_assembler.transform(data)
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(featuresCol="features",
                                                labelCol="target",
                                                layers=[3, 25, 25])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    #creating an evaluator and score the validation data
    #model_train = rgb_model.transform(train)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation,
                               {evaluator.metricName: "accuracy"})

    print('Validation score for RGB model: %g' % (score, ))
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    # creating a pipeline to predict RGB colours -> word; train and evaluate.
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lrgb_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                     outputCol="LAB")
    lword_indexer = StringIndexer(inputCol="word",
                                  outputCol="labTarget",
                                  handleInvalid="error",
                                  stringOrderType="frequencyDesc")
    lclassifier = MultilayerPerceptronClassifier(featuresCol="LAB",
                                                 labelCol="labTarget",
                                                 layers=[3, 25, 25])
    lrgb_pipeline = Pipeline(
        stages=[sqlTrans, lrgb_assembler, lword_indexer, lclassifier])
    lrgb_model = lrgb_pipeline.fit(train)
    #lmodel_train = lrgb_model.transform(ltrain)
    lrgb_validation = lrgb_model.transform(validation)
    lrgb_validation.show()
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="labTarget")
    lscore = evaluator.evaluate(lrgb_validation,
                                {evaluator.metricName: "accuracy"})

    print('Validation score for LAB model: %g' % (lscore, ))
    plot_predictions(lrgb_model, 'LAB', labelCol='word')
Example #6
def main(inputs):

    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # TODO: create a pipeline to predict RGB colours -> word

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol='features')
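    # index the colour word into a numeric label column for the classifier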
    word_indexer = StringIndexer(inputCol='word', outputCol='new_word')
    classifier = MultilayerPerceptronClassifier(labelCol="new_word",
                                                layers=[3, 30, 11])
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # TODO: create an evaluator and score the validation data

    rgb_validation = rgb_model.transform(validation)
    # rgb_validation.show()
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol="prediction", labelCol='new_word')
    score = vali_evaluator.evaluate(rgb_validation)
    print('Validation score for RGB model: %g' % (score, ))

    # TODO: create a pipeline RGB colours -> LAB colours -> word; train and evaluate.

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformer = SQLTransformer(statement=rgb_to_lab_query)

    new_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')
    new_pipeline = Pipeline(
        stages=[sql_transformer, new_assembler, word_indexer, classifier])
    new_model = new_pipeline.fit(train)
    new_validation = new_model.transform(validation)

    #new_validation.show()

    new_vali_evaluator = MulticlassClassificationEvaluator(
        predictionCol='prediction', labelCol='new_word')
    new_score = new_vali_evaluator.evaluate(new_validation)
    print('Validation score for LAB model:', new_score)

    plot_predictions(new_model, 'LAB', labelCol="word")
def main(inputs):
    data = spark.read.csv(inputs, header=True, schema=colour_schema)
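    # number of distinct colour words, used to size the MLP output layer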
    numlabels = data.select('labelword').distinct().count()
    lab_query = rgb2lab_query(passthrough_columns=['labelword'])

    sqlTrans = SQLTransformer(statement=lab_query)
    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"],
                                    outputCol="features")
    lab_assembler = VectorAssembler(inputCols=["lL", "lA", "lB"],
                                    outputCol="features")
    indexer = StringIndexer(inputCol="labelword",
                            outputCol="color_index",
                            handleInvalid='error')

    rf = RandomForestClassifier(numTrees=25,
                                maxDepth=20,
                                labelCol="color_index",
                                seed=42)
    mlp = MultilayerPerceptronClassifier(labelCol="color_index",
                                         maxIter=100,
                                         layers=[3, 250, numlabels])

    models = [('RGB-forest', Pipeline(stages=[rgb_assembler, indexer, rf])),
              ('LAB-forest',
               Pipeline(stages=[sqlTrans, lab_assembler, indexer, rf])),
              ('RGB-MLP', Pipeline(stages=[rgb_assembler, indexer, mlp])),
              ('LAB-MLP',
               Pipeline(stages=[sqlTrans, lab_assembler, indexer, mlp]))]

    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol='color_index')

    # split data into training and testing
    train, test = data.randomSplit([0.8, 0.2])
    train = train.cache()
    test = test.cache()

    for label, pipeline in models:
        model = pipeline.fit(train)

        # Output a visual representation of the predictions we're
        # making: uncomment when you have a model working
        plot_predictions(model, label)

        predictions = model.transform(test)
        # calculate a score
        score = evaluator.evaluate(predictions)
        print(label, score)
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    
    # TODO: create a pipeline to predict RGB colours -> word
    
    # rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    # rgb_model = rgb_pipeline.fit(train)
    
    # TODO: create an evaluator and score the validation data
    
    # plot_predictions(rgb_model, 'RGB', labelCol='word')
    # print('Validation score for RGB model: %g' % (score, ))
    
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=[])
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25],seed=42)
    train = train.cache()
    validation = validation.cache()

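    # rgb_classify / lab_classify: helper functions (presumably defined elsewhere in this script) that build, fit, and score a pipeline for the named classifier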
    score_mpc = rgb_classify("MLPC",train,validation,"MLPC")
    score_Log = rgb_classify("LogReg",train,validation,"LogReg")

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    score_lab_mpc = lab_classify("MLPC",train,validation,rgb_to_lab_query,"MLPC")
    score_lab_Log = lab_classify("LogReg",train,validation,rgb_to_lab_query,"LogReg")

    print('Accuracy for RGB model using MultilayerPerceptronClassifier: %g' % (score_mpc, ))
    print('Accuracy for RGB model using LogisticRegression: %g' % (score_Log, ))
    print('Accuracy for LAB model using MultilayerPerceptronClassifier: %g' % (score_lab_mpc, ))
    print('Accuracy for LAB model using LogisticRegression: %g' % (score_lab_Log, ))
Example #10
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25],
                                         seed=110)  #use seed here
    train = train.cache()
    validation = validation.cache()

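    # The indexer, classifier, and evaluator are shared by both the RGB and LAB pipelines below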
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="target",
                                 handleInvalid="error",
                                 stringOrderType="frequencyDesc")
    classifier = MultilayerPerceptronClassifier(maxIter=100,
                                                featuresCol="features",
                                                labelCol="target",
                                                layers=[3, 25, 25],
                                                seed=120)
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="target")

    #Evaluating RGB color space
    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol="features")
    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)
    rgb_validation = rgb_model.transform(validation)
    score = evaluator.evaluate(rgb_validation,
                               {evaluator.metricName: "accuracy"})

    print('Validation score for RGB model: %g' % (score, ))
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    #Evaluating LAB color space
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol="features")
    lab_pipeline = Pipeline(
        stages=[sqlTrans, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    lab_validation = lab_model.transform(validation)
    labscore = evaluator.evaluate(lab_validation,
                                  {evaluator.metricName: "accuracy"})

    print('Validation score for LAB model: %g' % (labscore, ))
    plot_predictions(lab_model, 'LAB', labelCol='word')
Example #11
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    #To convert R,G,B to LabCIE
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformed = SQLTransformer(statement=rgb_to_lab_query)

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'],
                                    outputCol='features')
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'],
                                    outputCol='features')

    word_indexer = StringIndexer(inputCol='word', outputCol='indexed')
    classifier = MultilayerPerceptronClassifier(labelCol='indexed',
                                                layers=[3, 30, 11])

    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    lab_pipeline = Pipeline(
        stages=[sql_transformed, lab_assembler, word_indexer, classifier])

    rgb_model = rgb_pipeline.fit(train)
    lab_model = lab_pipeline.fit(train)

    prediction = rgb_model.transform(validation)
    prediction_lab = lab_model.transform(validation)
    prediction.show()
    prediction_lab.show()

    #Testing the model
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='indexed',
                                                  metricName='f1')
    score = evaluator.evaluate(prediction)
    lab_score = evaluator.evaluate(prediction_lab)
    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    print('Validation score for LAB model:', lab_score)
Example #12
def main(inputs):
    data = spark.read.csv(inputs, header=True, schema=colour_schema)
    lab_query = rgb2lab_query(passthrough_columns=['labelword'])
    sqlTrans = SQLTransformer(statement=lab_query)
    #data=sqlTrans.transform(data)
    #data.show()
    # TODO: actually build the components for the pipelines, and the pipelines.
    indexer = StringIndexer(inputCol="labelword", outputCol="indexed", handleInvalid='error')
    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"], outputCol="features")
    lab_assembler = VectorAssembler(inputCols=["lL", "lA", "lB"], outputCol="features")
    # TODO: need an evaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexed")
    # TODO: split data into training and testing
    train, test = data.randomSplit([0.8, 0.2], seed=1234)
    train = train.cache()
    test = test.cache()
    rf = RandomForestClassifier(featuresCol="features", numTrees=30, labelCol="indexed", seed=42)
    mlp = MultilayerPerceptronClassifier(featuresCol="features", labelCol="indexed", layers=[3, 90, 90, 11])
    models = [
        ('RGB-forest', Pipeline(stages=[indexer, rgb_assembler, rf])),
        ('RGB-MLP', Pipeline(stages=[indexer, rgb_assembler, mlp])),
        ('LAB-forest', Pipeline(stages=[sqlTrans, indexer, lab_assembler, rf])),
        ('LAB-MLP', Pipeline(stages=[sqlTrans, indexer, lab_assembler, mlp])),
    ]
    for label, pipeline in models:
        # TODO: fit the pipeline to create a model
        model = pipeline.fit(train)
        # predict on the test data
        prediction = model.transform(test)
        # Output a visual representation of the predictions we're
        # making: uncomment when you have a model working
        plot_predictions(model, label)

        # calculate a score
        score = evaluator.evaluate(prediction)
        print(label, score)
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # TODO: create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"],
                                    outputCol="features")
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="label",
                                 handleInvalid='error')
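    # output layer sized to the number of distinct colour words in the data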
    classifier = MultilayerPerceptronClassifier(
        layers=[3, 300, data.select('word').distinct().count()])

    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)

    # TODO: create an evaluator and score the validation
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    rgb_predictions = rgb_model.transform(validation)
    rgb_score = evaluator.evaluate(rgb_predictions)
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    # TODO: create a pipeline RGB colours -> LAB colours -> word; train and evaluate.
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    lab_assembler = VectorAssembler(inputCols=["labL", "labA", "labB"],
                                    outputCol="features")
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)
    lab_pipeline = Pipeline(
        stages=[sqlTrans, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    lab_predictions = lab_model.transform(validation)
    lab_score = evaluator.evaluate(lab_predictions)
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for RGB model: %g' % (rgb_score, ))
    print('Validation score for LAB model: %g' % (lab_score, ))
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # TODO: create a pipeline to predict RGB colours -> word
    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"],
                                    outputCol="features")
    lab_assembler = VectorAssembler(inputCols=["labL", "labA", "labB"],
                                    outputCol="features")
    word_indexer = StringIndexer(inputCol="word",
                                 outputCol="label",
                                 handleInvalid="error")

    classifier_mlp = MultilayerPerceptronClassifier(layers=[3, 30, 11])
    sqlTrans = SQLTransformer(statement=rgb_to_lab_query)

    pipelines = [
        ('RGB',
         Pipeline(stages=[rgb_assembler, word_indexer, classifier_mlp])),
        ('LAB',
         Pipeline(
             stages=[sqlTrans, lab_assembler, word_indexer, classifier_mlp]))
    ]

    # TODO: create an evaluator and score the validation data
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
    score = dict()
    for label, pipeline in pipelines:
        model = pipeline.fit(train)
        predictions = model.transform(validation)
        score[label] = evaluator.evaluate(predictions)
        plot_predictions(model, label, labelCol='word')
    # TODO: create a pipeline to predict RGB colours -> word; train and evaluate.
    return score
Example #15
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    train.show()
    validation = validation.cache()

    rgb_assembler = VectorAssembler(inputCols=["R", "G", "B"], outputCol="features")
    word_indexer = StringIndexer(inputCol="word", outputCol="label")
    classifier = MultilayerPerceptronClassifier(layers=[3, 30, 11]) # was [3, 25, 25], but updated -GB


    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    rgb_model = rgb_pipeline.fit(train)
    plot_predictions(rgb_model, 'RGB', labelCol='word')

    rgb_predictions = rgb_model.transform(validation)
    rgb_predictions.show()
    rgb_score_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    rgb_score = rgb_score_evaluator.evaluate(rgb_predictions)

    print('Validation score for RGB model:', rgb_score)

    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sqlTrans = SQLTransformer(statement = rgb_to_lab_query)
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol='features')
    lab_pipeline = Pipeline(stages=[sqlTrans, lab_assembler, word_indexer, classifier])
    lab_model = lab_pipeline.fit(train)
    plot_predictions(lab_model, 'LAB', labelCol='word')

    lab_predictions = lab_model.transform(validation)
    lab_predictions.show()
    lab_score_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='label', metricName='accuracy')
    lab_score = lab_score_evaluator.evaluate(lab_predictions)

    print('Validation score for LAB model:', lab_score)
def main(inputs):
    data = spark.read.csv(inputs, header=True, schema=colour_schema)
    lab_query = rgb2lab_query(passthrough_columns=['labelword'])

    # TODO: actually build the components for the pipelines, and the pipelines.
    #indexer = 
    #rgb_assembler = 
    
    models = [
        #('RGB-forest', Pipeline(stages=[])),
        #('LAB-forest', Pipeline(stages=[])),
        #('RGB-MLP', Pipeline(stages=[])),
        #('LAB-MLP', Pipeline(stages=[])),
    ]

    # TODO: need an evaluator
    #evaluator = 

    # TODO: split data into training and testing
    #train, test = 
    train = train.cache()
    test = test.cache()

    for label, pipeline in models: