Example #1
import numpy as np
import numpy.testing as npt
from sklearn.metrics import confusion_matrix
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from handyspark import *  # patches BinaryClassificationMetrics to accept a
                          # DataFrame and adds toHandy()/to_metrics_RDD

def test_confusion_matrix(sdf):
    assem = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'],
                            outputCol='features')
    rf = RandomForestClassifier(featuresCol='features',
                                labelCol='Survived',
                                numTrees=20)
    pipeline = Pipeline(stages=[assem, rf])
    model = pipeline.fit(sdf.fillna(0.0))
    predictions = model.transform(sdf.fillna(0.0)).select(
        'probability', 'Survived')
    bcm = BinaryClassificationMetrics(predictions,
                                      scoreCol='probability',
                                      labelCol='Survived')

    predictions = predictions.toHandy().to_metrics_RDD('probability',
                                                       'Survived')
    predictions = np.array(predictions.collect())

    scm = bcm.confusionMatrix().toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .5)
    npt.assert_array_almost_equal(scm, pcm)

    scm = bcm.confusionMatrix(.3).toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .3)
    npt.assert_array_almost_equal(scm, pcm)
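
The test relies on HandySpark's patched BinaryClassificationMetrics, which takes a DataFrame and a decision threshold. For reference, a minimal sketch of computing the same thresholded confusion matrix with plain PySpark (no handyspark), assuming an RDD of (score, label) pairs like the one to_metrics_RDD returns:

from pyspark.mllib.evaluation import MulticlassMetrics

def thresholded_confusion_matrix(score_label_rdd, threshold=0.5):
    # Turn scores into hard predictions at the given threshold, then let
    # MulticlassMetrics (which does expose confusionMatrix) tabulate them.
    pred_label = score_label_rdd.map(
        lambda sl: (float(sl[0] > threshold), float(sl[1])))
    return MulticlassMetrics(pred_label).confusionMatrix().toArray()
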
Example #2
def printRddBinaryClassificationMetrics(self, predictions_and_labels):
    # predictions_and_labels: RDD of (prediction, label) float pairs.
    # confusionMatrix() lives on MulticlassMetrics, not on
    # BinaryClassificationMetrics, so build the matrix from the former.
    metrics = MulticlassMetrics(predictions_and_labels)
    cm = np.array(metrics.confusionMatrix().toArray())
    print("KAPPA=" + str(self.computeKappa(cm)))
    print("BA=" + str(self.computeBA(cm)))
    print("CM=" + str(cm))
Example #3
def main():
    start = time.time()
    conf = SparkConf().setMaster("local").setAppName("income")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    income_df = load(sqlContext, csv_path=CSV_PATH)  # project helper (not shown): reads the income CSV into a DataFrame
    # income_df.show()
    # print(income_df.dtypes)
    # print(income_df.count())

    features_df = preprocess(data_frame=income_df)  # project helper (not shown): indexes the label into "income_index" and assembles "features"

    # train/test split (randomSplit normalizes the weights, so [7.0, 3.0]
    # yields a 70/30 split; 100 is the seed)
    train_df, test_df = features_df.randomSplit([7.0, 3.0], 100)

    # logistic regression

    income_lr = LogisticRegression(featuresCol="features",
                                   labelCol="income_index",
                                   regParam=0.0,
                                   elasticNetParam=0.0,
                                   maxIter=200)
    income_model = income_lr.fit(train_df)

    # modeling
    print("Training:")
    training_summary = income_model.summary
    training_FPR = training_summary.roc.select('FPR').collect()
    training_TPR = training_summary.roc.select('TPR').collect()
    plot_roc(training_FPR, training_TPR, "pic/training_roc.jpg")  # plotting helper; sketched after this example

    training_recall = training_summary.pr.select('recall').collect()
    training_precision = training_summary.pr.select('precision').collect()
    # Area under ROC curve
    print("Training Area under ROC = %s" % training_summary.areaUnderROC)
    # accuracy
    print("Training Accuracy = %s" % training_summary.accuracy)
    plot_pr(training_recall, training_precision, "pic/training_pr.jpg")

    # evaluation
    print()
    print("Evaluation:")
    test_pred = income_model.transform(test_df)  # transform once, select twice
    pred_df = test_pred.select("prediction", "income_index")
    raw_pred_df = test_pred.select(
        "probability",
        "income_index").rdd.map(lambda l: (float(l[0][1]), l[1]))
    metrics = BinaryClassificationMetrics(raw_pred_df)
    # Area under ROC curve
    print("Testing Area under ROC = %s" % metrics.areaUnderROC)
    # accuracy
    metrics = MulticlassMetrics(pred_df.rdd)
    print("Testing Accuracy = %s" % metrics.accuracy)

    # confusion matrix
    print("Testing Confusion Matrix:")
    print(metrics.confusionMatrix().toArray())
    print("Total cost %fs" % (time.time() - start))
    print("Done!")
    # Save and load model
    # income_model.save("myLogisticRegressionModel")
    # sameModel = LogisticRegressionModel.load("myLogisticRegressionModel")
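
plot_roc and plot_pr are plotting helpers not shown in this example; a minimal matplotlib sketch matching the calls above (the inputs are lists of Row objects produced by .collect()):

import matplotlib.pyplot as plt

def plot_roc(fpr_rows, tpr_rows, path):
    # Draw the ROC curve from collected FPR/TPR rows and save it to disk.
    plt.figure()
    plt.plot([r['FPR'] for r in fpr_rows], [r['TPR'] for r in tpr_rows])
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.savefig(path)
    plt.close()

def plot_pr(recall_rows, precision_rows, path):
    # Same idea for the precision-recall curve.
    plt.figure()
    plt.plot([r['recall'] for r in recall_rows],
             [r['precision'] for r in precision_rows])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR curve')
    plt.savefig(path)
    plt.close()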

Example #4

# needs: from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics

# INSTANTIATE METRICS OBJECT
metrics = BinaryClassificationMetrics(labelsAndPredictions)

# AREA UNDER PRECISION-RECALL CURVE
print("Area under PR = %s" % metrics.areaUnderPR)

# AREA UNDER ROC CURVE
print("Area under ROC = %s" % metrics.areaUnderROC)
metrics = MulticlassMetrics(labelsAndPredictions)

# OVERALL STATISTICS (the no-argument precision()/recall()/fMeasure()
# were removed in Spark 3; the weighted variants are the closest equivalents)
precision = metrics.weightedPrecision
recall = metrics.weightedRecall
f1Score = metrics.weightedFMeasure()
confusion_matrix = metrics.confusionMatrix().toArray()

print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)
print("Confusion Matrix = %s" % confusion_matrix)

# PREDICT ON TEST DATA WITH BEST/FINAL MODEL
#predictionAndLabels = oneHotTESTbinary.map(lambda lp: (float(logitBest.predict(lp.features)), lp.label))
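
labelsAndPredictions is never built in this fragment; following the commented-out line above, a sketch of how it would be created (logitBest and oneHotTESTbinary are assumed to be a fitted mllib model and an RDD of LabeledPoints from earlier in the original notebook):

# Score each LabeledPoint and pair the prediction with the true label,
# giving the (prediction, label) pairs both metrics classes consume.
labelsAndPredictions = oneHotTESTbinary.map(
    lambda lp: (float(logitBest.predict(lp.features)), lp.label))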