# Grid-search `pipeline` over `paramGrid_MLP` with 3-fold cross-validation and
# report how long the fit took. `pipeline`, `paramGrid_MLP`, `evaluator` and
# `vectorizedData` are expected to be defined earlier in the script.
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid_MLP,
                          evaluator=evaluator,
                          numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
# Show the third stage of the best pipeline found by the search (presumably
# the trained classifier -- confirm against the pipeline definition).
# A parenthesised single-argument print behaves identically on Python 2 and 3.
print(CV_model.bestModel.stages[2])
print('Done on fitting model:%s' % (datetime.datetime.now() - starttime))

# Score the held-out test set with the cross-validated model and inspect the
# resulting predictions. `testing_data`, `CV_model` and `evaluator` are
# expected to be defined earlier in the script.
print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()

# Optional sanity check against the training data (left disabled):
# transformed_data1 = CV_model.transform(vectorizedData)
# print('%s accuracy: %s' % (evaluator.getMetricName(),
#                            evaluator.evaluate(transformed_data1)))
transformed_data = CV_model.transform(vectorized_test_data)
print("Fitting testing data into model...")
# Build the line with %-formatting so the output is the same on Python 2
# and Python 3 (a multi-argument print(...) would print a tuple on Python 2).
print('%s accuracy: %s' % (evaluator.getMetricName(),
                           evaluator.evaluate(transformed_data)))

predictions = transformed_data.select('indexedLabel', 'prediction')
# show() writes the table to stdout itself and returns None, so wrapping it
# in print only added a stray "None" line.
predictions.describe().show()
print(predictions.take(10))
# where() only builds a lazy DataFrame; printing it shows just the schema.
# Call show() so the misclassified rows are actually displayed.
misclassified = predictions.where(
    predictions.prediction != predictions.indexedLabel)
misclassified.show()



# Disabled manual accuracy calculation, kept for reference. Note that
# `lambda (x, v): ...` relies on tuple-parameter unpacking, which is Python 2
# only syntax, so these lines cannot be re-enabled unchanged on Python 3.
#predictAndLabel=valid.map(lambda p: (model.predict(p.features),p.label))
#accuracy = 1.0*predictAndLabel.filter(lambda (x, v): x == v).count()/valid.count()
#accuracy
# Check predictions: fetch the first 10 rows. The result is neither printed
# nor assigned, so it is only visible when run interactively (e.g. as the
# last expression of a notebook cell) -- NOTE(review): confirm intended usage.
predictions.take(10)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Inspect the fitted model `lrModel` and score `predictions`; both are
# expected to be defined earlier in the script.

# Print intercept and coefficients
# %-formatting keeps the output identical on Python 2 and Python 3; the
# one-element tuples stop a sequence-like value from being unpacked by `%`.
print('Model Intercept:  %s' % (lrModel.intercept,))
print('Model weights:  %s' % (lrModel.coefficients,))

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# AUC
# Evaluate once and report the value together with the metric name; the
# original ran the evaluation twice and discarded both results.
metric_value = evaluator.evaluate(predictions)
print('%s: %s' % (evaluator.getMetricName(), metric_value))

# Other metrics
trainingSummary = lrModel.summary
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))


#### Bonus: Visualisation example

# Simple visualisations
# Shell commands (run in a terminal, not in Python):
#   $ sudo yum install gnuplot
#   $ sudo /usr/local/bin/pip install gnuplotlib
#   $ sudo /usr/local/bin/pip install pandas
# Example #3
# 0
# Train a random forest on `orignalData`, score `validationData`, and report
# the evaluator's metric. Both DataFrames are expected to be defined earlier
# in the script; every column except 'quality' is used as a feature.
featureColumns = [i for i in orignalData.columns if i != 'quality']
vectorass = VectorAssembler(inputCols=featureColumns, outputCol="Values")
transformation = vectorass.transform(orignalData)
transformation.cache()

# NOTE(review): the validation feature list is derived independently from
# validationData.columns; this assumes both DataFrames share the same columns
# in the same order -- confirm against the data loading code.
feature = [i for i in validationData.columns if i != 'quality']
assemblerVal = VectorAssembler(inputCols=feature, outputCol="Values")
validationTransformation = assemblerVal.transform(validationData)

forest = RandomForestClassifier(labelCol="quality",
                                featuresCol="Values",
                                numTrees=100)
RFModel = forest.fit(transformation)

predictionData = RFModel.transform(validationTransformation)

print()
evaluation = MulticlassClassificationEvaluator(labelCol="quality",
                                               predictionCol="prediction",
                                               metricName="f1")
# The metric configured above is F1, although the variable and the printed
# labels call it "accuracy"; names and messages are kept for compatibility.
accuracy = evaluation.evaluate(predictionData)
print("Error Testing = %g" % (1.0 - accuracy))

print()

# Reuse the predictions and score computed above instead of transforming and
# evaluating the same validation data a second time (saves a Spark job).
transformed_data = predictionData
print(evaluation.getMetricName(), "Accuracy = ", accuracy)
print()