def test_confusion_matrix(sdf):
    """Spark-side confusion matrix agrees with sklearn's, at the default
    threshold (.5) and at a custom one (.3)."""
    # Fit a small random-forest pipeline on the NA-filled frame.
    assembler = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'],
                                outputCol='features')
    forest = RandomForestClassifier(featuresCol='features',
                                    labelCol='Survived',
                                    numTrees=20)
    filled = sdf.fillna(0.0)
    fitted = Pipeline(stages=[assembler, forest]).fit(filled)
    predictions = fitted.transform(filled).select('probability', 'Survived')

    bcm = BinaryClassificationMetrics(predictions,
                                      scoreCol='probability',
                                      labelCol='Survived')
    # Pull (score, label) pairs down to the driver for the sklearn reference.
    scored = np.array(predictions.toHandy()
                                 .to_metrics_RDD('probability', 'Survived')
                                 .collect())

    # Default threshold.
    spark_cm = bcm.confusionMatrix().toArray()
    sk_cm = confusion_matrix(scored[:, 1], scored[:, 0] > .5)
    npt.assert_array_almost_equal(spark_cm, sk_cm)

    # Custom threshold.
    spark_cm = bcm.confusionMatrix(.3).toArray()
    sk_cm = confusion_matrix(scored[:, 1], scored[:, 0] > .3)
    npt.assert_array_almost_equal(spark_cm, sk_cm)
def printRddBinaryClassificationMetrics(self, predictions_and_labels):
    """Print kappa, balanced accuracy (BA) and the confusion matrix for an
    RDD of (prediction, label) pairs.

    Fixed: the original used Python 2 ``print`` statements, which are a
    syntax error under Python 3 and inconsistent with the rest of the file;
    it also recomputed ``confusionMatrix().toArray()`` three times — the
    matrix is now computed once and reused.
    """
    metrics = BinaryClassificationMetrics(predictions_and_labels)
    # Compute the confusion matrix once; kappa/BA take an ndarray copy.
    cm = metrics.confusionMatrix().toArray()
    print("KAPPA=" + str(self.computeKappa(np.array(cm))))
    print("BA=" + str(self.computeBA(np.array(cm))))
    # CMstring = ','.join(['%.5f' % num for num in cm])
    print("CM=" + str(cm))
def main():
    """Train a logistic-regression income classifier on a local Spark
    context and report training/test ROC, PR, accuracy and the confusion
    matrix.

    Fixed: the test set was transformed twice (once for hard predictions,
    once for probabilities); the transformed frame is now built once and
    both projections are taken from it.
    """
    start = time.time()
    conf = SparkConf().setMaster("local").setAppName("income")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    income_df = load(sqlContext, csv_path=CSV_PATH)
    features_df = preprocess(data_frame=income_df)

    # 70/30 split with a fixed seed (weights are normalized by Spark).
    train_df, test_df = features_df.randomSplit([7.0, 3.0], 100)

    # Plain (unregularized) logistic regression.
    income_lr = LogisticRegression(featuresCol="features",
                                   labelCol="income_index",
                                   regParam=0.0,
                                   elasticNetParam=0.0,
                                   maxIter=200)
    income_model = income_lr.fit(train_df)

    print("Training:")
    training_summary = income_model.summary
    training_FPR = training_summary.roc.select('FPR').collect()
    training_TPR = training_summary.roc.select('TPR').collect()
    plot_roc(training_FPR, training_TPR, "pic/training_roc.jpg")
    training_recall = training_summary.pr.select('recall').collect()
    training_precision = training_summary.pr.select('precision').collect()
    # Area under ROC curve
    print("Training Area under ROC = %s" % training_summary.areaUnderROC)
    # accuracy
    print("Training Accuracy = %s" % training_summary.accuracy)
    plot_pr(training_recall, training_precision, "pic/training_pr.jpg")

    # evaluation
    print()
    print("Evaluation:")
    # Transform the test set once and derive both projections from it.
    test_pred = income_model.transform(test_df)
    pred_df = test_pred.select("prediction", "income_index")
    raw_pred_df = test_pred.select("probability", "income_index").rdd.map(
        lambda l: (float(l[0][1]), l[1]))
    metrics = BinaryClassificationMetrics(raw_pred_df)
    # Area under ROC curve
    print("Testing Area under ROC = %s" % metrics.areaUnderROC)
    # accuracy
    metrics = MulticlassMetrics(pred_df.rdd)
    print("Testing Accuracy = %s" % metrics.accuracy)
    # confusion matrix
    print("Testing Confusion Matrix:")
    print(metrics.confusionMatrix().toArray())
    print("Total cost %fs" % (time.time() - start))
    print("Done!")
# Save and load model #model.save(sc, "myDecisionTreeClassificationModel") #sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel") # INSTANTIATE METRICS OBJECT metrics = BinaryClassificationMetrics(labelsAndPredictions) # AREA UNDER PRECISION-RECALL CURVE print("Area under PR = %s" % metrics.areaUnderPR) # AREA UNDER ROC CURVE print("Area under ROC = %s" % metrics.areaUnderROC) metrics = MulticlassMetrics(labelsAndPredictions) # OVERALL STATISTICS precision = metrics.precision() recall = metrics.recall() f1Score = metrics.fMeasure() confusion_matrix = metrics.confusionMatrix().toArray() print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) print('Confusion Matrix = %s' % confusion_matrix) # $example off$ # PREDICT ON TEST DATA WITH BEST/FINAL MODEL #predictionAndLabels = oneHotTESTbinary.map(lambda lp: (float(logitBest.predict(lp.features)), lp.label))