# Cross-validated model selection for the MLP pipeline, then evaluation on
# held-out test data. Relies on `pipeline`, `paramGrid_MLP`, `evaluator`,
# `vectorizedData`, and `testing_data` defined earlier in the file.
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid_MLP,
                          evaluator=evaluator,
                          numFolds=3)
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
# Stage index 2 of the fitted best pipeline is the classifier chosen by CV
# — assumes the pipeline has the classifier as its third stage; confirm
# against the pipeline definition earlier in the file.
print(CV_model.bestModel.stages[2])
print('Done on fitting model:%s' % (datetime.datetime.now() - starttime))

print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()
transformed_data = CV_model.transform(vectorized_test_data)

print("Fitting testing data into model...")
print(evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data))

predictions = transformed_data.select('indexedLabel', 'prediction')
# describe().show() prints the summary itself and returns None, so it must
# not be wrapped in print() (the original printed a spurious "None").
predictions.describe().show()
print(predictions.take(10))
# Rows where the model disagrees with the indexed label (misclassifications).
print(predictions.where(predictions.prediction != predictions.indexedLabel))
# Evaluation of a fitted logistic-regression model (`lrModel`) against the
# `predictions` DataFrame produced above.

# check predictions — take a small sample for inspection
predictions.take(10)

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Print intercept and coefficients of the fitted model.
print('Model Intercept: ', lrModel.intercept)
print('Model weights: ', lrModel.coefficients)

# Evaluate model — BinaryClassificationEvaluator's default metric is
# areaUnderROC, confirmed by getMetricName() below.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)  # AUC
evaluator.getMetricName()
evaluator.evaluate(predictions)

# Other metrics, taken from the training summary.
trainingSummary = lrModel.summary
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

#### Bonus: Visualisation example
# Simple visualisations.
# NOTE: these are shell commands, not Python — run them in a terminal.
# (The original had them inline, which is a Python syntax error.)
#   $ sudo yum install gnuplot
#   $ sudo /usr/local/bin/pip install gnuplotlib
#   $ sudo /usr/local/bin/pip install pandas
# Random-forest training on `orignalData` and F1 evaluation on
# `validationData`. Both DataFrames are expected to carry a numeric
# 'quality' label column — TODO confirm against where they are loaded.
featureColumns = [i for i in orignalData.columns if i != 'quality']
vectorass = VectorAssembler(inputCols=featureColumns, outputCol="Values")
transformation = vectorass.transform(orignalData)
transformation.cache()  # reused by fit(); avoids recomputing the assembly

feature = [i for i in validationData.columns if i != 'quality']
assemblerVal = VectorAssembler(inputCols=feature, outputCol="Values")
validationTransformation = assemblerVal.transform(validationData)

forest = RandomForestClassifier(labelCol="quality", featuresCol="Values",
                                numTrees=100)
RFModel = forest.fit(transformation)

# Score the validation set ONCE and reuse the result for both reports
# (the original ran the identical transform twice).
predictionData = RFModel.transform(validationTransformation)

print()
evaluation = MulticlassClassificationEvaluator(labelCol="quality",
                                               predictionCol="prediction",
                                               metricName="f1")
accuracy = evaluation.evaluate(predictionData)
print("Error Testing = %g" % (1.0 - accuracy))
print()
# NOTE(review): the metric is F1, not accuracy — the "Accuracy = " label is
# kept byte-for-byte for output compatibility, but is misleading.
print(evaluation.getMetricName(), "Accuracy = ", evaluation.evaluate(predictionData))
print()