Code Example #1
def evaluate(predictions):
    """
    Evaluation Metrics
    """
    # label to indexedLabel mappings
    # out = sorted(set([(i[0], i[1]) for i in predictions.select(predictions.label, predictions.indexedLabel).collect()]), key=lambda x: x[0])

    print "Predictions"
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and evaluate model
    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # Statistics by class
    # DataFrames have no .map(); go through .rdd and collect the indexed labels the metrics are keyed by
    labels = predictions.rdd.map(lambda lp: lp.indexedLabel).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    # `model` (the fitted pipeline) is assumed to come from the enclosing scope
    treeModel = model.stages[2]
    print(treeModel)  # summary only
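
Note on the aggregate calls used above: in the RDD-based pyspark.mllib.evaluation.MulticlassMetrics API, the no-argument precision(), recall() and fMeasure() all reduce to overall accuracy and, depending on the Spark version, are deprecated or no longer callable without a label. A minimal sketch of the stable replacements, assuming predictionAndLabels is an RDD of (prediction, label) float pairs as in the example:

from pyspark.mllib.evaluation import MulticlassMetrics

def summary_stats(predictionAndLabels):
    # accuracy replaces the old no-argument precision()/recall()/fMeasure()
    metrics = MulticlassMetrics(predictionAndLabels)
    print("Accuracy = %s" % metrics.accuracy)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted F1 = %s" % metrics.weightedFMeasure())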
Code Example #2
def evaluate(labelsAndPredictions, data, labels):
    """
    Evaluation Metrics
    """
    # Instantiate metrics object
    metrics = MulticlassMetrics(labelsAndPredictions)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # Statistics by class
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
Code Example #3
def NaiveBayesEvaluation(TransformedDataset):

    nb = NaiveBayes()
    nb.setLabelCol("LabelIndex")
    nb.setPredictionCol("Label_Prediction")
    training, test = TransformedDataset.randomSplit([0.8, 0.2], seed=11)
    nvModel = nb.fit(training)
    prediction = nvModel.transform(test)

    # selected = prediction.select("body", "LabelIndex", "label", "Label_Prediction")
    # for row in selected.collect():
    #     print(row)

    from pyspark.mllib.evaluation import MulticlassMetrics

    predictionAndLabels = prediction.select(
        "Label_Prediction",
        "LabelIndex").rdd.map(lambda r: (float(r[0]), float(r[1])))

    # predictionAndLabels = test.rdd.map(lambda lp: (float(nvModel.predict(lp.features)), lp.label))
    metrics = MulticlassMetrics(predictionAndLabels)

    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = prediction.rdd.map(lambda lp: lp.label).distinct().collect()
    labelIndices = prediction.rdd.map(
        lambda lp: lp.LabelIndex).distinct().collect()
    labelIndicesPairs = prediction.rdd.map(
        lambda lp: (lp.label, lp.LabelIndex)).distinct().collect()

    print("Labels", labels)
    print("Label Indices", labelIndices)
    print("Label Indice Pairs", labelIndicesPairs)

    for label, labelIndex in sorted(labelIndicesPairs):
        print("\n Class %s precision = %s" %
              (label, metrics.precision(labelIndex)))
        print("Class %s recall = %s" % (label, metrics.recall(labelIndex)))
        print(
            "Class %s F1 Measure = %s" %
            (label, metrics.fMeasure(labelIndex, beta=1.0)), "\n")

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          metrics.weightedFalsePositiveRate)
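
The same summary numbers can be obtained without dropping to RDDs by using the DataFrame-based evaluator. A sketch, assuming the prediction DataFrame with the LabelIndex and Label_Prediction columns produced in Example #3:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="LabelIndex", predictionCol="Label_Prediction")
for metric in ["accuracy", "weightedPrecision", "weightedRecall", "f1"]:
    value = evaluator.evaluate(prediction, {evaluator.metricName: metric})
    print("%s = %s" % (metric, value))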
Code Example #4
def evaluate_predictions(predictions, show=True):
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
    log = {}

    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['auroc'] = evaluator.evaluate(predictions)

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['aupr'] = evaluator.evaluate(predictions)

    # Metrics
    predictionRDD = predictions.select(
        ['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Overall statistics
    log['precision'] = metrics.precision()
    log['recall'] = metrics.recall()
    log['F1 Measure'] = metrics.fMeasure()

    # Statistics by class
    distinctPredictions = collect_tuple(
        predictions.select('prediction').distinct())
    for x in sorted(distinctPredictions):
        log[x] = {}
        log[x]['precision'] = metrics.precision(x)
        log[x]['recall'] = metrics.recall(x)
        log[x]['F1 Measure'] = metrics.fMeasure(x, beta=1.0)

    # Confusion Matrix
    log['cm'] = metrics.confusionMatrix().toArray()
    log['cmpercent'] = cm_percent(log['cm'], predictions.count(), show)

    if show:
        show_predictions(predictions)

        print('Confusion Matrix')
        print(' TP', 'FN\n', 'FP', 'TN')
        print(log['cm'])
        print(' PC', 'FN\n', 'FP', 'PW')
        print(log['cmpercent'])
        print('')
        print("Area under ROC = {}".format(log['auroc']))
        print("Area under AUPR = {}".format(log['aupr']))
        print('\nOverall\nprecision = {}\nrecall = {}\nF1 Measure = {}\n'.
              format(log['precision'], log['recall'], log['F1 Measure']))

        for x in sorted(distinctPredictions):
            print('Label {}\nprecision = {}\nrecall = {}\nF1 Measure = {}\n'.
                  format(x, log[x]['precision'], log[x]['recall'],
                         log[x]['F1 Measure']))

    return log
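
Example #4 calls two helpers, collect_tuple and cm_percent, that are not part of the snippet. The definitions below are hypothetical reconstructions based on how they are used above, not the original code:

def collect_tuple(df):
    # flatten a single-column DataFrame into a tuple of its values
    return tuple(row[0] for row in df.collect())

def cm_percent(cm, n, show=True):
    # express each confusion-matrix cell as a percentage of all predictions
    # (cm is the numpy array returned by confusionMatrix().toArray())
    return 100.0 * cm / float(n)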
Code Example #5
def printMetrics(predictions_and_labels, output_file):
   metrics = MulticlassMetrics(predictions_and_labels)
   output_file.write('Precision of True '+str(metrics.precision(1))+'\n')
   output_file.write('Precision of False '+str(metrics.precision(0))+'\n')
   output_file.write('Recall of True  '+str(metrics.recall(1))+'\n')
   output_file.write('Recall of False   '+str(metrics.recall(0))+'\n')
   output_file.write('F-1 Score         '+str(metrics.fMeasure())+'\n')
   output_file.write('Confusion Matrix\n'+str(metrics.confusionMatrix().toArray())+'\n')

   print('Precision of True '+str(metrics.precision(1)))
   print('Precision of False '+str(metrics.precision(0)))
   print('Recall of True  '+str(metrics.recall(1)))
   print('Recall of False   '+str(metrics.recall(0)))
   print('F-1 Score         '+str(metrics.fMeasure()))
   print('Confusion Matrix\n'+str(metrics.confusionMatrix().toArray()))
Code Example #6
def main(spark, model_file, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to store the serialized model file

    data_file : string, path to the parquet file to load
    '''

    # Load the best logistic regression model
    model = PipelineModel.load(model_file)
    # Load the test dataframe
    test = spark.read.parquet(data_file)

    predictions = model.transform(test)

    predictionAndLabels = predictions.rdd.map(lambda lp:
                                              (lp.prediction, lp.label))
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Overall Stats:")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted F1 Score = %s" % metrics.weightedFMeasure())

    # Statistics by class
    print("Stats by class")

    for (genre, label) in predictions.select('genre',
                                             'label').distinct().collect():
        print("Class %s precision = %s" % (genre, metrics.precision(label)))
        print("Class %s recall = %s" % (genre, metrics.recall(label)))
        print("Class %s F1 Score = %s" %
              (genre, metrics.fMeasure(label, beta=1.0)))
Code Example #7
def multi_clf_performance(name, method, train, test):
    model = method.fit(train)
    prediction = model.transform(test)
    print(f"-----------Performance of {name} on testing set-----------")
    # Compute raw scores on the test set
    predictionAndLabels = prediction.select('prediction', 'label')
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels.rdd)
    # Overall statistics (multi_evaluator is assumed to be a MulticlassClassificationEvaluator defined in the enclosing scope)
    print("----------Summary Stats----------------------")
    print(f"Weighted precision: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'weightedPrecision'})}")
    print(f"Weighted recall: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'weightedRecall'})}")
    print(f"F1 Score: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'f1'})}")
    print(f"Accuracy: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'accuracy'})}")

    # Statistics by class
    print("--------Stats by class----------------------")
    labels = [row.asDict()['label'] for row in test.select('label').distinct().collect()]
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Score = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    #print("--------Weighted Stats----------------------")
    #print("Weighted precision = %s" % metrics.weightedPrecision)
    #print("Weighted recall = %s" % metrics.weightedRecall) 
    #print("Weighted F1 Score = %s" % metrics.weightedFMeasure())
    #print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    print("-----------------------------------------------------------")
Code Example #8
    def calculate_metrics(self, df):
        """

    define your own metrics to evaluate cross validation

    :params:

    df: dataframe containing {aprediction} and {label} columns

    :returns:

    confusion matrix

    """

        # turn gt into label
        preds_and_labels = df.select('prediction',
                                     f.col('label').cast(t.FloatType()))
        metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

        # confusion matrix
        metrics_dict = dict(
            # unweighted measures
            tpr=metrics.truePositiveRate(label=1.0),
            fpr=metrics.falsePositiveRate(label=1.0),
            precision=metrics.precision(label=1.0),
            recall=metrics.recall(label=1.0),
            fMeasure=metrics.fMeasure(label=1.0))

        metrics_dict = {
            k: round(v, 3) if k != "confusion" else v
            for k, v in metrics_dict.items()
        }

        return metrics_dict
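
The rounding step in Example #8 guards a "confusion" key that the dict never actually receives. If the confusion matrix hinted at in the docstring is wanted, a self-contained variant could look like the following sketch (same assumptions as the example, not the original code):

from pyspark.mllib.evaluation import MulticlassMetrics

def calculate_metrics_with_confusion(preds_and_labels):
    # preds_and_labels: DataFrame with float 'prediction' and 'label' columns
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    out = dict(
        tpr=metrics.truePositiveRate(label=1.0),
        fpr=metrics.falsePositiveRate(label=1.0),
        precision=metrics.precision(label=1.0),
        recall=metrics.recall(label=1.0),
        fMeasure=metrics.fMeasure(label=1.0),
        confusion=metrics.confusionMatrix().toArray().tolist())
    return {k: round(v, 3) if k != "confusion" else v for k, v in out.items()}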
Code Example #9
def evaluate(df, labelCols, gettopX=-1, getfirstX=-1):
    labelCols2 = [i + "_pred" for i in labelCols]
    df.cache()

    r_list = {
        i: np.zeros((len(labelCols)))
        for i in ['accuracy', 'precision', 'recall', 'fmeasure']
    }
    for i in xrange(len(labelCols)):
        predandlabels = df.select(labelCols2[i], labelCols[i]).rdd \
                        .map(lambda x: (float(x[labelCols2[i]]), float(x[labelCols[i]])))
        metrics = MulticlassMetrics(predandlabels)

        # print metrics.confusionMatrix()
        r_list['accuracy'][i] = metrics.accuracy
        r_list['precision'][i] = metrics.precision(1.0)
        r_list['recall'][i] = metrics.recall(1.0)
        r_list['fmeasure'][i] = metrics.fMeasure(label=1.0)

    results = {}
    for m, rs in r_list.iteritems():
        results[m] = np.mean(rs)

    for code, num in [('top', gettopX), ('first', getfirstX)]:
        if num <= 0: continue

        if code == 'top':
            idx = np.argsort(np.nan_to_num(r_list['fmeasure']))[-num:]
        elif code == 'first':
            idx = xrange(num)

        for m, rs in r_list.iteritems():
            results['{0}_{1}'.format(m, code)] = np.mean(rs[idx])

    return results
Code Example #10
def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions,
                             {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions,
                              {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # Get RDD of predictions and labels for eval metrics
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())
    # F1
    print("F1 = {}".format(multi_metrics.fMeasure(1.0)))
    # Precision
    print("Precision = {}".format(multi_metrics.precision(1.0)))
    # Recall
    print("Recall = {}".format(multi_metrics.recall(1.0)))
    # FPR
    print("FPR = {}".format(multi_metrics.falsePositiveRate(1.0)))
    # TPR
    print("TPR = {}".format(multi_metrics.truePositiveRate(1.0)))
Code Example #11
File: hw3.py  Project: MrMouse2019/Big-Data-HW
def getF1Score(model, test_df):
    pred = model.transform(test_df)
    pl = pred.select("label", "prediction").rdd.cache()
    metrics = MulticlassMetrics(pl)
    f1score = metrics.fMeasure()
    print("the F1-score of the model is : {}".format(f1score))
    return f1score
Code Example #12
def printMeasurementMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision Result of setosa: ', metrics.precision(1))
    print('Precision Result of versicolor:', metrics.precision(2))
    print('Precision Result of virginica:', metrics.precision(3))
    print('F-1 Score:         ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
Code Example #13
def evaluate(model, word_column="words", vectorizer="w2v"):
    doc2vecs_df = featurize(word_column, vectorizer)
    if type(model) == LinearSVC:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.regParam, [0.1]) \
            .build()
    elif type(model) == GBTClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [50]) \
            .build()
    elif type(model) == RandomForestClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxBins, [100]) \
            .build()
    elif type(model) == MultilayerPerceptronClassifier:
        paramGrid = ParamGridBuilder() \
             .addGrid(model.layers, [[122, 50, 2]]) \
             .build()
        # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]]) \
    elif type(model) == FMClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.stepSize, [.01, .001]) \
            .build()
    print('Evaluating...')
    w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])
    si = StringIndexer(inputCol="LABEL", outputCol="label")
    model_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    classifier_pipeline = Pipeline(stages=[si, model])
    crossval = CrossValidator(estimator=classifier_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=model_evaluator,
                              numFolds=5)
    fit_model = crossval.fit(doc2vecs_df)
    predictions = fit_model.transform(w2v_test_df)
    # predictions.toPandas().to_csv('predictions.csv')
    # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY')
    # predictions.describe()
    summarizer = Summarizer.metrics("mean", "count")
    predictions.select(
        summarizer.summary(predictions.filter(
            predictions.label == 1).pos)).show(truncate=False)
    preds_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print('Confusion Matrix')
    print(metrics.confusionMatrix().toArray())
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    accuracy = model_evaluator.evaluate(predictions)
    trainingSummary = fit_model.bestModel.stages[-1].extractParamMap()
    print(trainingSummary)

    return accuracy
Code Example #14
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print 'Precision of True ', metrics.precision(1)
    print 'Precision of False', metrics.precision(0)
    print 'Recall of True    ', metrics.recall(1)
    print 'Recall of False   ', metrics.recall(0)
    print 'F-1 Score         ', metrics.fMeasure()
    print 'Confusion Matrix\n', metrics.confusionMatrix().toArray()
Code Example #15
File: test_mllib.py  Project: Great1414/pyspark_learn
def printMetrics(result):
    metrics = MulticlassMetrics(result)
    print("\nPrecision of True\n", metrics.precision(1))
    print("\nPrecision of False\n", metrics.precision(0))
    print("\nRecall of True\n", metrics.recall(1))
    print("\nRecall of False\n", metrics.recall(0))
    print("\nF1 score\n", metrics.fMeasure())
    print("\nConfusion Matrix\n", metrics.confusionMatrix().toArray())
Code Example #16
def evaluate(df_prediction):
    evaluator = BinaryClassificationEvaluator()
    rc = evaluator.evaluate(df_prediction, {evaluator.metricName: "areaUnderROC"})
    pr = evaluator.evaluate(df_prediction, {evaluator.metricName: "areaUnderPR"})
    predictionRDD = df_prediction.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)
    f1 = metrics.fMeasure()
    return [rc, pr, f1]
Code Example #17
def displayMetrics(pred):
    ev = MulticlassMetrics(pred.select(["label", "prediction"]).rdd)

    # Overall statistics
    print("Accuracy = %s" % ev.accuracy)
    print("Precision = %s" % ev.precision())
    print("Recall = %s" % ev.recall())
    print("F1 Score = %s" % ev.fMeasure())
Code Example #18
def main(spark, model_file, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to store the serialized model file

    data_file : string, path to the parquet file to load
    '''

    # Load data.
    dataset = spark.read.parquet(data_file)

    # Load  model.
    model = PipelineModel.load(model_file)

    prediction = model.transform(dataset)

    predictionAndLabels = prediction.select(["prediction", "label"]).rdd

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("\n")
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("\n")
    labels = predictionAndLabels.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" %
              (label, metrics.fMeasure(label, beta=1.0)))
Code Example #19
def evaluate(predictionsAndLabels):  # input: RDD of (prediction, label) pairs
    testErr = predictionsAndLabels.filter(
        lambda lp: lp[0] != lp[1]).count() / float(predictionsAndLabels.count())
    metrics = MulticlassMetrics(predictionsAndLabels)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    return testErr, precision, recall, f1Score
Code Example #20
def metrics_basic(data):
    metrics = MulticlassMetrics(data)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
Code Example #21
    def printMetrics(self, preds, prediction="prediction", indexedLabel="indexedLabel"):
        metrics = MulticlassMetrics(preds.select(prediction, indexedLabel).rdd)

        labels = [0, 1]
        for label in sorted(labels):
            try:
                print("Class %s precision = %s" % (label, metrics.precision(label)))
                print("Class %s recall = %s" % (label, metrics.recall(label)))
                print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
            except Exception:
                print("No malicious predictions")
Code Example #22
def classification_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate class-level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: x.test_label[0]).collect())
    print('Class\tPrecision\tRecall\tF-Score')
    for c in sorted(classes):
        print('{}\t{}\t{}\t{}'.format(c, round(metrics.precision(c), 3),
                                      round(metrics.recall(c), 3),
                                      round(metrics.fMeasure(c), 3)))
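
Several of the examples here (#22, #25, #27, #31) rely on a prepare_data helper that is never shown, and the label column name varies between test_label and test_labels. The version below is a hypothetical reconstruction for illustration only, assuming actual_data carries a test_label array column and prediction_data carries a prediction column with matching row order:

def prepare_data(actual_data, prediction_data):
    # pair rows of the two DataFrames by position and return an RDD of
    # (prediction, label) float pairs, as MulticlassMetrics expects
    labels = actual_data.rdd.map(lambda row: float(row.test_label[0])) \
                            .zipWithIndex().map(lambda x: (x[1], x[0]))
    preds = prediction_data.rdd.map(lambda row: float(row.prediction)) \
                               .zipWithIndex().map(lambda x: (x[1], x[0]))
    return preds.join(labels).map(lambda kv: kv[1])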
Code Example #23
def evaluate(predictionAndLabels):
    log = {}

    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['AUROC'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['AUPR'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predictionAndLabels.select(['label', 'prediction']) \
                            .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" % \
            (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, beta=1.0)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s" \
                  % (label, log[label]['precision'],
                    log[label]['recall'], log[label]['F1 Measure']))

    return log
Code Example #24
    def evaluateClassification(self, predictionAndLabels):

        metrics = MulticlassMetrics(predictionAndLabels)
        cm = metrics.confusionMatrix()

        result = {}

        result['Matrix'] = cm.toArray().tolist()
        result['Precision'] = metrics.precision()
        result['Recall'] = metrics.recall()
        result['F1 Score'] = metrics.fMeasure()

        return result
Code Example #25
def overall_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Instantiate metrics object
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall-level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    return sc.parallelize([
        (Vectors.dense(metrics.precision()), Vectors.dense(metrics.recall()),
         Vectors.dense(metrics.fMeasure()))
    ]).toDF(['Precision', 'Recall', 'F-Score'])
Code Example #26
def main():
    sc = SparkContext(appName="BayesClassifer")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels.csv')
    data_cleaned = data.map(lambda line : line.split(","))
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(label, htf.transform(text)))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    #print data
    # Split data aproximately into training (60%) and test (40%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    sameModel = NaiveBayesModel.load(sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel")

    print "----------"
    print sameModel.predict(htf.transform("posts jump in net profit"))

    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    prediction1 = 1.0 * predictionAndLabel1.filter(lambda (x, v): x == v).count() / training.count()
    buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v ==1).count()


    # Instantiate metrics object
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabel)

    # Overall statistics
    precision = metrics.precision()
    precision = normalize(precision)
    recall = metrics.recall()
    recall = normalize(recall)
    f1Score = metrics.fMeasure()
    f1Score = normalize(f1Score)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    '''
    # Statistics by class
    labels = data_hashed.map(lambda lp: lp.label).distinct().collect()

    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    '''
Code Example #27
def classification_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate class-level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: x.test_labels[0]).collect())
    results = [(Vectors.dense(float(c)),
                Vectors.dense(round(metrics.precision(c), 3)),
                Vectors.dense(round(metrics.recall(c), 3)),
                Vectors.dense(round(metrics.fMeasure(c), 3)))
               for c in sorted(classes)]
    return sc.parallelize(results).toDF(
        ['Class', 'Precision', 'Recall', 'F-Score'])
Code Example #28
def validate_tffm(spark, sc, model, test_df, s3_metrics_path, s3_endpoint_path):
    # get predictions
    validation_df = model.transform(test_df)
    
    metricsSchema = StructType() \
        .add("metric", StringType()) \
        .add("value", DoubleType())
    metrics_names = []

    # apply threshold
    def thresholdScore(x):
        retval = 0.0
        if x > 0.5:
            retval = 1.0
        return retval
    
    thresholdScoreUdf = F.UserDefinedFunction(thresholdScore, T.FloatType())
    
    validation_df_round = validation_df.withColumn('rscore', thresholdScoreUdf(validation_df.score)) 
    predTffm = validation_df_round.select(['label','rscore'])

    predictionAndLabelsTffm = predTffm.rdd.map(lambda lp: (lp.rscore, lp.label))
    metricsTffm = BinaryClassificationMetrics(predictionAndLabelsTffm)

    metrics_names.append(("Area_under_PR",metricsTffm.areaUnderPR))
    metrics_names.append(("Area_under_ROC",metricsTffm.areaUnderROC))

    mmetricsTffm = MulticlassMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Precision",mmetricsTffm.precision()))
    metrics_names.append(("Recall",mmetricsTffm.recall()))
    metrics_names.append(("F1",mmetricsTffm.fMeasure()))
    metrics_names.append(("Weighted_recall",mmetricsTffm.weightedRecall))
    metrics_names.append(("Weighted_precision",mmetricsTffm.weightedPrecision))
    metrics_names.append(("Weighted_F1",mmetricsTffm.weightedFMeasure()))
    metrics_names.append(("Weighted_F05",mmetricsTffm.weightedFMeasure(beta=0.5)))
    metrics_names.append(("Weighted_FP_rate",mmetricsTffm.weightedFalsePositiveRate))

    mRdd = sc.parallelize(metrics_names).coalesce(1)
    dfMetrics = spark.createDataFrame(mRdd, metricsSchema)
    dfMetrics.write.csv("{0}/{1}".format(s3_metrics_path, model.endpointName), mode="overwrite")

    endpointSchema = StructType() \
        .add("time", StringType()) \
        .add("endpoint", StringType())
    endpoint_name = []
    endpoint_name.append((str(time.time()),str(model.endpointName)))
    eRdd = sc.parallelize(endpoint_name).coalesce(1)
    dfEndpoint = spark.createDataFrame(eRdd, endpointSchema)
    dfEndpoint.write.csv("{0}/endpoint.txt".format(s3_endpoint_path), mode="overwrite")
Code Example #29
File: Json.py  Project: honeycombcmu/SparkService
def generateJson(AlgorithmName, taskid, traindata, predictionAndLabels):
	jsonContent = dict()
	jsonContent['AlgorithmName'] = AlgorithmName
	jsonContent['TaskId'] = taskid

	labels = traindata.map(lambda lp: lp.label).distinct().collect()
	jsonContent['LabelNum'] = len(labels)

	metrics = MulticlassMetrics(predictionAndLabels)
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	confusion_matrix = metrics.confusionMatrix().toArray()

	jsonContent['Precision'] = precision
	jsonContent['Recall'] = recall
	jsonContent['F1Score'] = f1Score
	jsonContent['ConfusionMatrix'] = confusion_matrix.tolist()

	jsonContent['Labels'] = list()
	for label in sorted(labels):
		tempList = dict()
		tempList['Precision'] = metrics.precision(label)
		tempList['Recall'] = metrics.recall(label)
		tempList['F1Measure'] = metrics.fMeasure(label, beta=1.0)

		jsonContent['Labels'].append(tempList)
	
	jsonContent['WeightedStats'] = dict()
	jsonContent['WeightedStats']['Precision'] = metrics.weightedPrecision
	jsonContent['WeightedStats']['Recall'] = metrics.weightedRecall
	jsonContent['WeightedStats']['F1Score'] = metrics.weightedFMeasure()
	jsonContent['WeightedStats']['FalsePositiveRate'] = metrics.weightedFalsePositiveRate

	with open(taskid + '.json', 'w') as jsonFile:
		json.dump(jsonContent, jsonFile, indent=4, separators=(',', ': '))
		jsonFile.flush()
Code Example #30
def performance(predictions):
    predictionRDD = predictions.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
        
    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)
    
    results = {'predictions':predictions,
               'areaUnderROC':binmetrics.areaUnderROC,
               'areaUnderPR':binmetrics.areaUnderPR,
               'confusionMatrix':metrics.confusionMatrix().toArray(),
               'accuracy':metrics.accuracy,
               'precision':metrics.precision(),
               'recall':metrics.recall(),
               'f1measure':metrics.fMeasure()}
    
    return results
Code Example #31
def overall_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Instantiate metrics object
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    # return sc.parallelize([(Vectors.dense(metrics.accuracy),
    #                         Vectors.dense(metrics.precision()),
    #                         Vectors.dense(metrics.recall()),
    #                         Vectors.dense(metrics.fMeasure()))]).toDF(['Accuracy', 'Precision', 'Recall', 'F - Score'])
    print('Accuracy\tPrecision\tRecall\tF-Score')
    print('{}\t{}\t{}\t{}'.format(metrics.accuracy, metrics.precision(),
                                  metrics.recall(), metrics.fMeasure()))
Code Example #32
 def performancerdd(self):
     self.calculator = 'RDDs'
     print('Calculating performance metrics using RDDs...')
     predictionRDD = self.predictions.select(['label','prediction']).rdd.map(lambda line: (line[1],line[0]))
     
     binmetrics = BinaryClassificationMetrics(predictionRDD)
     metrics = MulticlassMetrics(predictionRDD)
     
     self.areaUnderROC = binmetrics.areaUnderROC
     self.areaUnderPR = binmetrics.areaUnderPR
     self.confusionMatrix = metrics.confusionMatrix().toArray()
     self.accuracy = metrics.accuracy
     self.precision = metrics.precision()
     self.recall = metrics.recall()
     self.f1measure = metrics.fMeasure()
     self.falsePositive = metrics.falsePositiveRate(1.0)
     # for binary labels, the false positive rate of class 0.0 equals the false negative rate of class 1.0
     self.falseNegative = metrics.falsePositiveRate(0.0)
Code Example #33
def printFinalResultMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print '\n'
    print 'Precision of Setosa ', metrics.precision(1)
    print 'Precision of Versicolor', metrics.precision(2)
    print 'Precision of Virginica', metrics.precision(3)
    print '\n'
    print 'Recall of Setosa    ', metrics.recall(1)
    print 'Recall of Versicolor   ', metrics.recall(2)
    print 'Recall of Virginica   ', metrics.recall(3)

    print '\n'
    print 'F-1 Score         ', metrics.fMeasure()
    print '\n\n'
    print 'Confusion Matrix\n', metrics.confusionMatrix().toArray()

    print '\n\n'
    return
Code Example #34
File: main.py  Project: GuruTeja/iHear-Server
def modelStatistics(labelsAndPredictions):
    metrics = MulticlassMetrics(labelsAndPredictions)
    print(metrics.confusionMatrix())

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
Code Example #35
File: jordan_hive_spark.py  Project: arifyali/Yelp
model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(trainParsed.count())
print trainErr

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(testParsed.count())
print testErr

metrics = BinaryClassificationMetrics(testLabelsAndPreds)

print metrics.areaUnderROC
print metrics.areaUnderPR

mcMetrics = MulticlassMetrics(testLabelsAndPreds)

#TODO: Do this for classes 1.0,0.0 and not just overall
print mcMetrics.precision()
print mcMetrics.recall()
print mcMetrics.fMeasure()

model.save(sc, "SVMModel")

### Run Model on Validation Set
## TODO: output file of zipcodes and predicted success metrics
## TODO: Use bokeh on file to make visualization of the US
Code Example #36
File: evaluate.py  Project: stevencox/chemotext
    def train_model (conf):
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}".format (conf.output_dir)

        labeled = Evaluate.load_all (sc, conf). \
                  map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                                 features = [ b.paraDist, b.sentDist, b.docDist ] ) )

#        labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#                  map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                                 [ b, b * 2, b * 9 ] ) )
#        print (labeled.collect ())

        train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

        count = train.count ()
        start = time.time ()
        model = LogisticRegressionWithLBFGS.train (train)
        elapsed = time.time () - start
        print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

        start = time.time ()
        model_path = os.path.join (conf.output_dir, "eval", "model")
        file_path = model_path.replace ("file://", "")
        if os.path.isdir (file_path):
            print ("Removing existing model {0}".format (file_path))
            shutil.rmtree (file_path)
        model.save(sc, model_path)
        sameModel = LogisticRegressionModel.load(sc, model_path)
        elapsed = time.time () - start
        print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))


        # Metrics
        labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
        trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count () / float (train.count())
        print("Training Error => {0}".format (trainErr))

        predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
        metrics = MulticlassMetrics (predictionsAndLabels) 
        print (" --------------> {0}".format (predictionsAndLabels.take (1000)))

        #print (labelsAndPreds.collect ())
        print ("\nMETRICS:")
        try:
            print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
            print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("precision          : {0}".format (metrics.precision(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("recall             : {0}".format (metrics.recall(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
        except:
            traceback.print_exc ()

        print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
        print ("precision          : {0}".format (metrics.precision()))
        print ("recall             : {0}".format (metrics.recall()))
        print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
        print ("weighted precision : {0}".format (metrics.weightedPrecision))
        print ("weighted recall    : {0}".format (metrics.weightedRecall))
        print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
        print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
        print ("")

        # Regression metrics
        predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )

        regression_metrics = RegressionMetrics (predictedAndObserved)
        print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
        print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
        print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
        print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
        print ("r2......................: {0}".format (regression_metrics.r2))
        print ("")

        labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
        testErr = labelsAndPreds.filter (lambda (v, p): v != p).count () / float (test.count ())
        print ("Testing Error => {0}".format (testErr))
Code Example #37
    training, test = data.randomSplit([0.6, 0.4], seed=11)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
Code Example #38
def logisticRegression(trainFile, testFile, taskid, sc):
	# Load training data in LIBSVM format
	trainData = MLUtils.loadLibSVMFile(sc, trainFile)
	testData = MLUtils.loadLibSVMFile(sc, testFile)

	# Split data into training (60%) and test (40%)
	# traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
	# traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	labelNum = trainData.map(lambda lp: lp.label).distinct().count()

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(trainData, numClasses=labelNum)

	# Compute raw scores on the test set
	predictionAndLabels = testData.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	Json.generateJson("LogisticRegression", taskid, trainData, predictionAndLabels);
	# Instantiate metrics object
	metrics = MulticlassMetrics(predictionAndLabels)

	# Overall statistics
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	#confusion_matrix = metrics.confusionMatrix().toArray()

	print("Summary Stats")
	print("Precision = %s" % precision)
	print("Recall = %s" % recall)
	print("F1 Score = %s" % f1Score)


	# Statistics by class
	labels = trainData.map(lambda lp: lp.label).distinct().collect()
	for label in sorted(labels):
	    print("Class %s precision = %s" % (label, metrics.precision(label)))
	    print("Class %s recall = %s" % (label, metrics.recall(label)))
	    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

	# Weighted stats
	print("Weighted recall = %s" % metrics.weightedRecall)
	print("Weighted precision = %s" % metrics.weightedPrecision)
	print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
	print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
	print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

	# #return model parameters
	# res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
	# 	   ('3','Yes','Precision', metrics.precision(0.0)),
	# 	   ('4','Yes','Recall', metrics.recall(0.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
	#        ('3','Yes','Precision', metrics.precision(1.0)),
	# 	   ('4','Yes','Recall', metrics.recall(1.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
	#        ('3','Yes','Precision', metrics.precision(2.0)),
	#        ('4','Yes','Recall', metrics.recall(2.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]

	# #save output file path as JSON and dump into dumpFilePath
	# rdd = sc.parallelize(res)
	# SQLContext.createDataFrame(rdd).collect()
	# df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value'])

	#tempDumpFilePath = dumpFilePath + "/part-00000"
	#if os.path.exists(tempDumpFilePath):
	#	os.remove(tempDumpFilePath)

	#df.toJSON().saveAsTextFile(hdfsFilePath)
	#tmpHdfsFilePath = hdfsFilePath + "/part-00000"
	#subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])

	# Save and load model
	#clusters.save(sc, "myModel")
	#sameModel = KMeansModel.load(sc, "myModel")
Code Example #39
File: multi_class_1.py  Project: mon95/Kaggle-Scripts
    training, test = data.randomSplit([0.85, 0.15], seed=11L)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=39)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    #accuracy = metrics.accuracy
    accuracy = 1.0 * predictionAndLabels.filter(lambda (x, v): x == v).count() / test.count()
    # print("Summary Stats")
    # print("Precision = %s" % precision)
    # print("Recall = %s" % recall)
    # print("F1 Score = %s" % f1Score)    
    # print("Accuracy = %s" % accuracy)
    # Statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    # for label in sorted(labels):
    #     print("Class %s precision = %s" % (label, metrics.precision(label)))
    #     print("Class %s recall = %s" % (label, metrics.recall(label)))
    #     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
Code Example #40
File: cf_example.py  Project: Lomascolo/hermes
def test_prfs():
    # TODO: revised so that it will take user's inputs instead of hardcoded values
    
    """
    Test Precision, Recall, Fscore, and Support on multiclass classification data
    Input data: https://github.com/apache/spark/blob/master/data/mllib/sample_multiclass_classification_data.txt.
    """

    # load the schemas (if existed)

    # create a hdfs directory
    #os.system("hdfs dfs -mkdir datasets")

    # load the data file into the hdfs directory
    os.system("hdfs dfs -put sample_multiclass_classification_data.txt datasets/sample_multiclass_classification_data.txt")
    data = MLUtils.loadLibSVMFile(scsingleton.sc, "hdfs://localhost:9000/datasets/sample_multiclass_classification_data.txt")
   
    # print data.take(1)
    # ie. [LabeledPoint(1.0, (4,[0,1,2,3],[-0.222222,0.5,-0.762712,-0.833333]))] 
    # [ ( finalClassification, (numLabels, [label0, label1, label2, ..., labelN], [prob0, prob1, prob2, ..., probN]) ) ]

    # split data into train (60%), test (40%)
    trainingRDD, testRDD = data.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    with Timer() as t:
        numTest = testRDD.count()
    print "testRDD.count(): %s seconds" % t.secs

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3)
    print "LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredAndLabel = testRDD.map(lambda lp: (float(model.predict(lp.features)), lp.label))
    print "testPredAndLabel: %s seconds" % t.secs

    # calculate Precision, Recall, F1-score
    metrics = MulticlassMetrics(testPredAndLabel)
    print( "precision = %s" % metrics.precision() )
    print( "recall = %s" % metrics.recall() )
    print( "f1-score = %s" % metrics.fMeasure() )

    # statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print( "Class %s precision = %s" % (label, metrics.precision(label)) )
        print( "Class %s recall = %s" % (label, metrics.recall(label)) )
        print( "Class %s f1-score = %s" % (label, metrics.fMeasure(label, beta=1.0)) )

    # weighted stats
    print( "Weighted precision = %s" % metrics.weightedPrecision )
    print( "Weighted recall = %s" % metrics.weightedRecall )
    print( "Weighted f1-score = %s" % metrics.weightedFMeasure() )
    print( "Weighted f(0.5)-score = %s" % metrics.weightedFMeasure(beta=0.5) )
    print( "Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate )
    
    return
Code Example #41
File: PySpark.py  Project: boweiz/SparkService
	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

	# Compute raw scores on the test set
	predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	# Instantiate metrics object
	metrics = MulticlassMetrics(predictionAndLabels)

	# Overall statistics
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	#confusion_matrix = metrics.confusionMatrix().toArray()

	print("Summary Stats")
	print("Precision = %s" % precision)
	print("Recall = %s" % recall)
	print("F1 Score = %s" % f1Score)


	# Statistics by class
	labels = traindata.map(lambda lp: lp.label).distinct().collect()
	for label in sorted(labels):
	    print("Class %s precision = %s" % (label, metrics.precision(label)))
	    print("Class %s recall = %s" % (label, metrics.recall(label)))
	    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))