def evaluate(predictions):
    """ Evaluation Metrics """
    # label to indexedLabel mappings
    # out = sorted(set([(i[0], i[1]) for i in predictions.select(predictions.label, predictions.indexedLabel).collect()]), key=lambda x: x[0])
    print("Predictions")
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and evaluate model
    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = predictions.rdd.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

    treeModel = model.stages[2]
    print(treeModel)  # summary only
def evaluate(labelsAndPredictions, data, labels):
    """ Evaluation Metrics """
    # Instantiate metrics object
    metrics = MulticlassMetrics(labelsAndPredictions)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
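# A minimal usage sketch (not from the original source): it assumes a fitted pipeline
# `model` and a test DataFrame `test_df` with a numeric "label" column, and shows one
# way the (prediction, label) RDD and the label list could be prepared before calling
# the evaluate() helper above.
predictions = model.transform(test_df)
# MulticlassMetrics expects an RDD of (prediction, label) pairs of floats
labelsAndPredictions = predictions.select("prediction", "label") \
    .rdd.map(lambda row: (float(row["prediction"]), float(row["label"])))
labels = [row["label"] for row in test_df.select("label").distinct().collect()]
evaluate(labelsAndPredictions, test_df, labels)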
def NaiveBayesEvaluation(TransformedDataset): nb = NaiveBayes() nb.setLabelCol("LabelIndex") nb.setPredictionCol("Label_Prediction") training, test = TransformedDataset.randomSplit([0.8, 0.2], seed=11) nvModel = nb.fit(training) prediction = nvModel.transform(test) # selected = prediction.select("body", "LabelIndex", "label", "Label_Prediction") # for row in selected.collect(): # print(row) from pyspark.mllib.evaluation import MulticlassMetrics predictionAndLabels = prediction.select( "Label_Prediction", "LabelIndex").rdd.map(lambda r: (float(r[0]), float(r[1]))) # predictionAndLabels = test.rdd.map(lambda lp: (float(nvModel.predict(lp.features)), lp.label)) metrics = MulticlassMetrics(predictionAndLabels) precision = metrics.precision() recall = metrics.recall() f1Score = metrics.fMeasure() print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) # Statistics by class labels = prediction.rdd.map(lambda lp: lp.label).distinct().collect() labelIndices = prediction.rdd.map( lambda lp: lp.LabelIndex).distinct().collect() labelIndicesPairs = prediction.rdd.map( lambda lp: (lp.label, lp.LabelIndex)).distinct().collect() print("Labels", labels) print("Label Indices", labelIndices) print("Label Indice Pairs", labelIndicesPairs) for label, labelIndex in sorted(labelIndicesPairs): print("\n Class %s precision = %s" % (label, metrics.precision(labelIndex))) print("Class %s recall = %s" % (label, metrics.recall(labelIndex))) print( "Class %s F1 Measure = %s" % (label, metrics.fMeasure(labelIndex, beta=1.0)), "\n") # Weighted stats print("Weighted recall = %s" % metrics.weightedRecall) print("Weighted precision = %s" % metrics.weightedPrecision) print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
def evaluate_predictions(predictions, show=True): from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics log = {} evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC') log['auroc'] = evaluator.evaluate(predictions) # Show Validation Score (AUPR) evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR') log['aupr'] = evaluator.evaluate(predictions) # Metrics predictionRDD = predictions.select( ['label', 'prediction']).rdd.map(lambda line: (line[1], line[0])) metrics = MulticlassMetrics(predictionRDD) # Overall statistics log['precision'] = metrics.precision() log['recall'] = metrics.recall() log['F1 Measure'] = metrics.fMeasure() # Statistics by class distinctPredictions = collect_tuple( predictions.select('prediction').distinct()) for x in sorted(distinctPredictions): log[x] = {} log[x]['precision'] = metrics.precision(x) log[x]['recall'] = metrics.recall(x) log[x]['F1 Measure'] = metrics.fMeasure(x, beta=1.0) # Confusion Matrix log['cm'] = metrics.confusionMatrix().toArray() log['cmpercent'] = cm_percent(log['cm'], predictions.count(), show) if show: show_predictions(predictions) print('Confusion Matrix') print(' TP', 'FN\n', 'FP', 'TN') print(log['cm']) print(' PC', 'FN\n', 'FP', 'PW') print(log['cmpercent']) print('') print("Area under ROC = {}".format(log['auroc'])) print("Area under AUPR = {}".format(log['aupr'])) print('\nOverall\ntprecision = {}\nrecall = {}\nF1 Measure = {}\n'. format(log['precision'], log['recall'], log['F1 Measure'])) for x in sorted(distinctPredictions): print('Label {}\ntprecision = {}\nrecall = {}\nF1 Measure = {}\n'. format(x, log[x]['precision'], log[x]['recall'], log[x]['F1 Measure'])) return log
def printMetrics(predictions_and_labels, output_file):
    metrics = MulticlassMetrics(predictions_and_labels)
    output_file.write('Precision of True  ' + str(metrics.precision(1)) + '\n')
    output_file.write('Precision of False ' + str(metrics.precision(0)) + '\n')
    output_file.write('Recall of True  ' + str(metrics.recall(1)) + '\n')
    output_file.write('Recall of False ' + str(metrics.recall(0)) + '\n')
    output_file.write('F-1 Score ' + str(metrics.fMeasure()) + '\n')
    output_file.write('Confusion Matrix\n' + str(metrics.confusionMatrix().toArray()) + '\n')
    print('Precision of True  ' + str(metrics.precision(1)))
    print('Precision of False ' + str(metrics.precision(0)))
    print('Recall of True  ' + str(metrics.recall(1)))
    print('Recall of False ' + str(metrics.recall(0)))
    print('F-1 Score ' + str(metrics.fMeasure()))
    print('Confusion Matrix\n' + str(metrics.confusionMatrix().toArray()))
def main(spark, model_file, data_file): '''Main routine for supervised evaluation Parameters ---------- spark : SparkSession object model_file : string, path to store the serialized model file data_file : string, path to the parquet file to load ''' ### # TODO: YOUR CODE GOES HERE #load best lr model model = PipelineModel.load(model_file) # Load the test dataframe test = spark.read.parquet(data_file) predictions = model.transform(test) predictionAndLabels = predictions.rdd.map(lambda lp: (lp.prediction, lp.label)) metrics = MulticlassMetrics(predictionAndLabels) # Overall statistics precision = metrics.precision() recall = metrics.recall() f1Score = metrics.fMeasure() print("Overall Stats:") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) # Weighted stats print("Weighted precision = %s" % metrics.weightedPrecision) print("Weighted recall = %s" % metrics.weightedRecall) print("Weighted F1 Score = %s" % metrics.weightedFMeasure()) # Statistics by class print("Stats by class") for (genre, label) in predictions.select('genre', 'label').distinct().collect(): print("Class %s precision = %s" % (genre, metrics.precision(label))) print("Class %s recall = %s" % (genre, metrics.recall(label))) print("Class %s F1 Score = %s" % (genre, metrics.fMeasure(label, beta=1.0)))
def multi_clf_performance(name, method, train, test): model = method.fit(train) prediction = model.transform(test) print(f"-----------Performance of {name} on testing set-----------") # Compute raw scores on the test set predictionAndLabels = prediction.select('prediction', 'label') # Instantiate metrics object metrics = MulticlassMetrics(predictionAndLabels.rdd) # Overall statistics print("----------Summary Stats----------------------") print(f"Weighted precision: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'weightedPrecision'})}") print(f"Weighted recall: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'weightedRecall'})}") print(f"F1 Score: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'f1'})}") print(f"Accuracy: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'accuracy'})}") # Statistics by class print("--------Stats by class----------------------") labels = [row.asDict()['label'] for row in test.select('label').distinct().collect()] for label in sorted(labels): print("Class %s precision = %s" % (label, metrics.precision(label))) print("Class %s recall = %s" % (label, metrics.recall(label))) print("Class %s F1 Score = %s" % (label, metrics.fMeasure(label, beta=1.0))) # Weighted stats #print("--------Weighted Stats----------------------") #print("Weighted precision = %s" % metrics.weightedPrecision) #print("Weighted recall = %s" % metrics.weightedRecall) #print("Weighted F1 Score = %s" % metrics.weightedFMeasure()) #print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) print("-----------------------------------------------------------")
def calculate_metrics(self, df):
    """
    Define your own metrics to evaluate cross validation.

    :params: df: dataframe containing {prediction} and {label} columns
    :returns: dict of metrics for the positive class (label 1.0)
    """
    # cast the ground truth to float so the (prediction, label) pairs are numeric
    preds_and_labels = df.select('prediction', f.col('label').cast(t.FloatType()))
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

    # unweighted, per-class measures for the positive class
    metrics_dict = dict(
        tpr=metrics.truePositiveRate(label=1.0),
        fpr=metrics.falsePositiveRate(label=1.0),
        precision=metrics.precision(label=1.0),
        recall=metrics.recall(label=1.0),
        fMeasure=metrics.fMeasure(label=1.0))
    metrics_dict = {
        k: round(v, 3) if k != "confusion" else v
        for k, v in metrics_dict.items()
    }
    return metrics_dict
def evaluate(df, labelCols, gettopX=-1, getfirstX=-1):
    labelCols2 = [i + "_pred" for i in labelCols]
    df.cache()
    r_list = {
        i: np.zeros((len(labelCols)))
        for i in ['accuracy', 'precision', 'recall', 'fmeasure']
    }
    for i in range(len(labelCols)):
        predandlabels = df.select(labelCols2[i], labelCols[i]).rdd \
            .map(lambda x: (float(x[labelCols2[i]]), float(x[labelCols[i]])))
        metrics = MulticlassMetrics(predandlabels)
        # print(metrics.confusionMatrix())
        r_list['accuracy'][i] = metrics.accuracy
        r_list['precision'][i] = metrics.precision(1.0)
        r_list['recall'][i] = metrics.recall(1.0)
        r_list['fmeasure'][i] = metrics.fMeasure(label=1.0)
    results = {}
    for m, rs in r_list.items():
        results[m] = np.mean(rs)
    for code, num in [('top', gettopX), ('first', getfirstX)]:
        if num <= 0:
            continue
        if code == 'top':
            idx = np.argsort(np.nan_to_num(r_list['fmeasure']))[-num:]
        elif code == 'first':
            idx = np.arange(num)
        for m, rs in r_list.items():
            results['{0}_{1}'.format(m, code)] = np.mean(rs[idx])
    return results
def print_performance_metrics(predictions): # Evaluate model evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction") auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"}) aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"}) print("auc = {}".format(auc)) print("aupr = {}".format(aupr)) # Get RDD of predictions and labels for eval metrics predictionAndLabels = predictions.select("prediction", "label").rdd # Instantiate metrics objects binary_metrics = BinaryClassificationMetrics(predictionAndLabels) multi_metrics = MulticlassMetrics(predictionAndLabels) # Area under precision-recall curve print("Area under PR = {}".format(binary_metrics.areaUnderPR)) # Area under ROC curve print("Area under ROC = {}".format(binary_metrics.areaUnderROC)) # Accuracy print("Accuracy = {}".format(multi_metrics.accuracy)) # Confusion Matrix print(multi_metrics.confusionMatrix()) # F1 print("F1 = {}".format(multi_metrics.fMeasure(1.0))) # Precision print("Precision = {}".format(multi_metrics.precision(1.0))) # Recall print("Recall = {}".format(multi_metrics.recall(1.0))) # FPR print("FPR = {}".format(multi_metrics.falsePositiveRate(1.0))) # TPR print("TPR = {}".format(multi_metrics.truePositiveRate(1.0)))
def getF1Score(model, test_df):
    pred = model.transform(test_df)
    pl = pred.select("label", "prediction").rdd.cache()
    metrics = MulticlassMetrics(pl)
    f1score = metrics.fMeasure()
    print("the F1-score of the model is : {}".format(f1score))
    return f1score
def printMeasurementMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision Result of setosa: ', metrics.precision(1))
    print('Precision Result of versicolor:', metrics.precision(2))
    print('Precision Result of virginica:', metrics.precision(3))
    print('F-1 Score: ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
def evaluate(model, word_column="words", vectorizer="w2v"): doc2vecs_df = featurize(word_column, vectorizer) if type(model) == LinearSVC: paramGrid = ParamGridBuilder() \ .addGrid(model.regParam, [0.1]) \ .build() elif type(model) == GBTClassifier: paramGrid = ParamGridBuilder() \ .addGrid(model.maxIter, [50]) \ .build() elif type(model) == RandomForestClassifier: paramGrid = ParamGridBuilder() \ .addGrid(model.maxBins, [100]) \ .build() elif type(model) == MultilayerPerceptronClassifier: paramGrid = ParamGridBuilder() \ .addGrid(model.layers, [[122, 50, 2]]) \ .build() # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]]) \ elif type(model) == FMClassifier: paramGrid = ParamGridBuilder() \ .addGrid(model.stepSize, [.01, .001]) \ .build() print('Evaluating...') w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2]) si = StringIndexer(inputCol="LABEL", outputCol="label") model_evaluator = MulticlassClassificationEvaluator( labelCol="label", predictionCol="prediction", metricName="f1") classifier_pipeline = Pipeline(stages=[si, model]) crossval = CrossValidator(estimator=classifier_pipeline, estimatorParamMaps=paramGrid, evaluator=model_evaluator, numFolds=5) fit_model = crossval.fit(doc2vecs_df) predictions = fit_model.transform(w2v_test_df) # predictions.toPandas().to_csv('predictions.csv') # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY') # predictions.describe() summarizer = Summarizer.metrics("mean", "count") predictions.select( summarizer.summary(predictions.filter( predictions.label == 1).pos)).show(truncate=False) preds_and_labels = predictions.select(['prediction', 'label']) metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple)) print('Confusion Matrix') print(metrics.confusionMatrix().toArray()) # Overall statistics precision = metrics.precision(1.0) recall = metrics.recall(1.0) f1Score = metrics.fMeasure(1.0) print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) accuracy = model_evaluator.evaluate(predictions) trainingSummary = fit_model.bestModel.stages[-1].extractParamMap() print(trainingSummary) return accuracy
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True ', metrics.recall(1))
    print('Recall of False ', metrics.recall(0))
    print('F-1 Score ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
def printMetrics(result):
    metrics = MulticlassMetrics(result)
    print("\nPrecision of True\n", metrics.precision(1))
    print("\nPrecision of False\n", metrics.precision(0))
    print("\nRecall of True\n", metrics.recall(1))
    print("\nRecall of False\n", metrics.recall(0))
    print("\nF1 score\n", metrics.fMeasure())
    print("\nConfusion Matrix\n", metrics.confusionMatrix().toArray())
def evaluate(df_prediction):
    evaluator = BinaryClassificationEvaluator()
    roc = evaluator.evaluate(df_prediction, {evaluator.metricName: "areaUnderROC"})
    pr = evaluator.evaluate(df_prediction, {evaluator.metricName: "areaUnderPR"})
    predictionRDD = df_prediction.select(['label', 'prediction']) \
        .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)
    f1 = metrics.fMeasure()
    return [roc, pr, f1]
def displayMetrics(pred):
    ev = MulticlassMetrics(pred.select(["label", "prediction"]).rdd)
    # Overall statistics
    print("Accuracy = %s" % ev.accuracy)
    print("Precision = %s" % ev.precision())
    print("Recall = %s" % ev.recall())
    print("F1 Score = %s" % ev.fMeasure())
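# A hedged alternative to displayMetrics() above for newer PySpark releases, where the
# no-argument precision()/recall()/fMeasure() calls have been deprecated (they all reduce
# to accuracy) in favour of `accuracy` and the weighted aggregates. Sketch only, not from
# the original source; the "prediction"/"label" column names are assumed.
from pyspark.mllib.evaluation import MulticlassMetrics

def displayMetricsV2(pred):
    ev = MulticlassMetrics(pred.select(["prediction", "label"]).rdd.map(tuple))
    print("Accuracy = %s" % ev.accuracy)
    print("Weighted precision = %s" % ev.weightedPrecision)
    print("Weighted recall = %s" % ev.weightedRecall)
    print("Weighted F1 Score = %s" % ev.weightedFMeasure())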
def main(spark, model_file, data_file): '''Main routine for supervised evaluation Parameters ---------- spark : SparkSession object model_file : string, path to store the serialized model file data_file : string, path to the parquet file to load ''' # Load data. dataset = spark.read.parquet(data_file) # Load model. model = PipelineModel.load(model_file) prediction = model.transform(dataset) predictionAndLabels = prediction.select(["prediction", "label"]).rdd # Instantiate metrics object metrics = MulticlassMetrics(predictionAndLabels) # Overall statistics precision = metrics.precision() recall = metrics.recall() f1Score = metrics.fMeasure() print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) print("\n") # Weighted stats print("Weighted recall = %s" % metrics.weightedRecall) print("Weighted precision = %s" % metrics.weightedPrecision) print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) print("\n") labels = predictionAndLabels.map(lambda lp: lp.label).distinct().collect() for label in sorted(labels): print("Class %s precision = %s" % (label, metrics.precision(label))) print("Class %s recall = %s" % (label, metrics.recall(label))) print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
def evaluate(predictionsAndLabels):
    # input: RDD of (prediction, label) pairs
    testErr = predictionsAndLabels.filter(
        lambda lp: lp[0] != lp[1]).count() / float(predictionsAndLabels.count())
    metrics = MulticlassMetrics(predictionsAndLabels)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    return testErr, precision, recall, f1Score
def metrics_basic(data):
    metrics = MulticlassMetrics(data)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
def printMetrics(self, preds, prediction="prediction", indexedLabel="indexedLabel"):
    # MulticlassMetrics expects (prediction, label) pairs, so select in that order
    metrics = MulticlassMetrics(preds.select(prediction, indexedLabel).rdd)
    labels = [0, 1]
    for label in sorted(labels):
        try:
            print("Class %s precision = %s" % (label, metrics.precision(label)))
            print("Class %s recall = %s" % (label, metrics.recall(label)))
            print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
        except Exception:
            print("No malicious predictions")
def classification_report(actual_data, prediction_data):
    # Build an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate class-level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: x.test_label[0]).collect())
    print('Class\tPrecision\tRecall\tF-Score')
    for c in sorted(classes):
        print('{}\t{}\t{}\t{}'.format(c,
                                      round(metrics.precision(c), 3),
                                      round(metrics.recall(c), 3),
                                      round(metrics.fMeasure(c), 3)))
def evaluate(predictionAndLabels):
    log = {}

    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['AUROC'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['AUPR'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predictionAndLabels.select(['label', 'prediction']) \
        .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" %
          (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, beta=1.0)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s"
              % (label, log[label]['precision'], log[label]['recall'],
                 log[label]['F1 Measure']))

    return log
def evaluateClassification(self, predictionAndLabels):
    metrics = MulticlassMetrics(predictionAndLabels)
    cm = metrics.confusionMatrix()

    result = {}
    result['Matrix'] = cm.toArray().tolist()
    result['Precision'] = metrics.precision()
    result['Recall'] = metrics.recall()
    result['F1 Score'] = metrics.fMeasure()
    return result
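# A hedged usage sketch (not from the original source): `clf_evaluator` is a hypothetical
# instance of the class that defines evaluateClassification(), and `predictionAndLabels`
# is assumed to be an RDD of (prediction, label) float pairs. The returned dict is plain
# Python, so it serializes directly to JSON.
import json

result = clf_evaluator.evaluateClassification(predictionAndLabels)
print(json.dumps(result, indent=2))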
def overall_report(actual_data, prediction_data):
    # Build an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall-level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    return sc.parallelize([
        (Vectors.dense(metrics.precision()),
         Vectors.dense(metrics.recall()),
         Vectors.dense(metrics.fMeasure()))
    ]).toDF(['Precision', 'Recall', 'F-Score'])
def main(): sc = SparkContext(appName="BayesClassifer") htf = HashingTF(50000) data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels.csv') data_cleaned = data.map(lambda line : line.split(",")) # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors data_hashed = data_cleaned.map(lambda (label, text): LabeledPoint(label, htf.transform(text))) data_hashed.persist() # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine) #print data # Split data aproximately into training (60%) and test (40%) training, test = data_hashed.randomSplit([0.70, 0.30], seed=0) sameModel = NaiveBayesModel.load(sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel") print "----------" print sameModel.predict(htf.transform("posts jump in net profit")) predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label)) predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label)) prediction = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() prediction1 = 1.0 * predictionAndLabel1.filter(lambda (x, v): x == v).count() / training.count() buy_buy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == 1 and v ==1).count() # Instantiate metrics object # Instantiate metrics object metrics = MulticlassMetrics(predictionAndLabel) # Overall statistics precision = metrics.precision() precision = normalize(precision) recall = metrics.recall() recall = normalize(recall) f1Score = metrics.fMeasure() f1Score = normalize(f1Score) print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) ''' # Statistics by class labels = data_hashed.map(lambda lp: lp.label).distinct().collect() for label in sorted(labels): print("Class %s precision = %s" % (label, metrics.precision(label))) print("Class %s recall = %s" % (label, metrics.recall(label))) print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) ''' '''
def classification_report(actual_data, prediction_data):
    # Build an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate class-level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: x.test_labels[0]).collect())
    results = [(Vectors.dense(float(c)),
                Vectors.dense(round(metrics.precision(c), 3)),
                Vectors.dense(round(metrics.recall(c), 3)),
                Vectors.dense(round(metrics.fMeasure(c), 3)))
               for c in sorted(classes)]
    return sc.parallelize(results).toDF(
        ['Class', 'Precision', 'Recall', 'F-Score'])
def validate_tffm(spark, sc, model, test_df, s3_metrics_path, s3_endpoint_path): # get predictions validation_df = model.transform(test_df) metricsSchema = StructType() \ .add("metric", StringType()) \ .add("value", DoubleType()) metrics_names = [] # apply threshold def thresholdScore(x): retval = 0.0 if x > 0.5: retval = 1.0 return retval thresholdScoreUdf = F.UserDefinedFunction(thresholdScore, T.FloatType()) validation_df_round = validation_df.withColumn('rscore', thresholdScoreUdf(validation_df.score)) predTffm = validation_df_round.select(['label','rscore']) predictionAndLabelsTffm = predTffm.rdd.map(lambda lp: (lp.rscore, lp.label)) metricsTffm = BinaryClassificationMetrics(predictionAndLabelsTffm) metrics_names.append(("Area_under_PR",metricsTffm.areaUnderPR)) metrics_names.append(("Area_under_ROC",metricsTffm.areaUnderROC)) mmetricsTffm = MulticlassMetrics(predictionAndLabelsTffm) metrics_names.append(("Precision",mmetricsTffm.precision())) metrics_names.append(("Recall",mmetricsTffm.recall())) metrics_names.append(("F1",mmetricsTffm.fMeasure())) metrics_names.append(("Weighted_recall",mmetricsTffm.weightedRecall)) metrics_names.append(("Weighted_precision",mmetricsTffm.weightedPrecision)) metrics_names.append(("Weighted_F1",mmetricsTffm.weightedFMeasure())) metrics_names.append(("Weighted_F05",mmetricsTffm.weightedFMeasure(beta=0.5))) metrics_names.append(("Weighted_FP_rate",mmetricsTffm.weightedFalsePositiveRate)) mRdd = sc.parallelize(metrics_names).coalesce(1) dfMetrics = spark.createDataFrame(mRdd, metricsSchema) dfMetrics.write.csv("{0}/{1}".format(s3_metrics_path, model.endpointName), mode="overwrite") endpointSchema = StructType() \ .add("time", StringType()) \ .add("endpoint", StringType()) endpoint_name = [] endpoint_name.append((str(time.time()),str(model.endpointName))) eRdd = sc.parallelize(endpoint_name).coalesce(1) dfEndpoint = spark.createDataFrame(eRdd, endpointSchema) dfEndpoint.write.csv("{0}/endpoint.txt".format(s3_endpoint_path), mode="overwrite")
def generateJson(AlgorithmName, taskid, traindata, predictionAndLabels):
    jsonContent = dict()
    jsonContent['AlgorithmName'] = AlgorithmName
    jsonContent['TaskId'] = taskid

    labels = traindata.map(lambda lp: lp.label).distinct().collect()
    jsonContent['LabelNum'] = len(labels)

    metrics = MulticlassMetrics(predictionAndLabels)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    confusion_matrix = metrics.confusionMatrix().toArray()

    jsonContent['Precision'] = precision
    jsonContent['Recall'] = recall
    jsonContent['F1Score'] = f1Score
    jsonContent['ConfusionMatrix'] = confusion_matrix.tolist()

    jsonContent['Labels'] = list()
    for label in sorted(labels):
        tempList = dict()
        tempList['Precision'] = metrics.precision(label)
        tempList['Recall'] = metrics.recall(label)
        tempList['F1Measure'] = metrics.fMeasure(label, beta=1.0)
        jsonContent['Labels'].append(tempList)

    jsonContent['WeightedStats'] = dict()
    jsonContent['WeightedStats']['Precision'] = metrics.weightedPrecision
    jsonContent['WeightedStats']['Recall'] = metrics.weightedRecall
    jsonContent['WeightedStats']['F1Score'] = metrics.weightedFMeasure()
    jsonContent['WeightedStats']['FalsePositiveRate'] = metrics.weightedFalsePositiveRate

    with open(taskid + '.json', 'w') as jsonFile:
        json.dump(jsonContent, jsonFile, indent=4, separators=(',', ': '))
        jsonFile.flush()
def performance(predictions):
    predictionRDD = predictions.select(['label', 'prediction']) \
        .rdd.map(lambda line: (line[1], line[0]))
    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)

    results = {'predictions': predictions,
               'areaUnderROC': binmetrics.areaUnderROC,
               'areaUnderPR': binmetrics.areaUnderPR,
               'confusionMatrix': metrics.confusionMatrix().toArray(),
               'accuracy': metrics.accuracy,
               'precision': metrics.precision(),
               'recall': metrics.recall(),
               'f1measure': metrics.fMeasure()}
    return results
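# A hedged variant (not from the original source): BinaryClassificationMetrics gives a more
# informative ROC/PR area when fed the predicted probability of the positive class rather
# than the hard 0/1 prediction. This sketch assumes the predictions DataFrame carries the
# usual Spark ML "probability" vector column alongside "label".
from pyspark.mllib.evaluation import BinaryClassificationMetrics

scoreAndLabels = predictions.select(['probability', 'label']) \
    .rdd.map(lambda row: (float(row['probability'][1]), float(row['label'])))
binmetrics = BinaryClassificationMetrics(scoreAndLabels)
print("Area under ROC (probability scores) = {}".format(binmetrics.areaUnderROC))
print("Area under PR  (probability scores) = {}".format(binmetrics.areaUnderPR))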
def overall_report(actual_data, prediction_data):
    # Build an RDD of (prediction, label) pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall-level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    # return sc.parallelize([(Vectors.dense(metrics.accuracy),
    #                         Vectors.dense(metrics.precision()),
    #                         Vectors.dense(metrics.recall()),
    #                         Vectors.dense(metrics.fMeasure()))]).toDF(['Accuracy', 'Precision', 'Recall', 'F-Score'])
    print('Accuracy\tPrecision\tRecall\tF-Score')
    print('{}\t{}\t{}\t{}'.format(metrics.accuracy,
                                  metrics.precision(),
                                  metrics.recall(),
                                  metrics.fMeasure()))
def performancerdd(self):
    self.calculator = 'RDDs'
    print('Calculating performance metrics using RDDs...')
    predictionRDD = self.predictions.select(['label', 'prediction']) \
        .rdd.map(lambda line: (line[1], line[0]))
    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)

    self.areaUnderROC = binmetrics.areaUnderROC
    self.areaUnderPR = binmetrics.areaUnderPR
    self.confusionMatrix = metrics.confusionMatrix().toArray()
    self.accuracy = metrics.accuracy
    self.precision = metrics.precision()
    self.recall = metrics.recall()
    self.f1measure = metrics.fMeasure()
    self.falsePositive = metrics.falsePositiveRate(1.0)
    # false positive rate of class 0.0, which equals the false-negative rate of class 1.0
    self.falseNegative = metrics.falsePositiveRate(0.0)
def printFinalResultMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('\n')
    print('Precision of Setosa    ', metrics.precision(1))
    print('Precision of Versicolor', metrics.precision(2))
    print('Precision of Virginica ', metrics.precision(3))
    print('\n')
    print('Recall of Setosa    ', metrics.recall(1))
    print('Recall of Versicolor', metrics.recall(2))
    print('Recall of Virginica ', metrics.recall(3))
    print('\n')
    print('F-1 Score ', metrics.fMeasure())
    print('\n\n')
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
    print('\n\n')
    return
def modelStatistics(labelsAndPredictions):
    metrics = MulticlassMetrics(labelsAndPredictions)
    print(metrics.confusionMatrix())

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainParsed.count())
print(trainErr)

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testParsed.count())
print(testErr)

metrics = BinaryClassificationMetrics(testLabelsAndPreds)
print(metrics.areaUnderROC)
print(metrics.areaUnderPR)

mcMetrics = MulticlassMetrics(testLabelsAndPreds)
# TODO: Do this for classes 1.0, 0.0 and not just overall
print(mcMetrics.precision())
print(mcMetrics.recall())
print(mcMetrics.fMeasure())

model.save(sc, "SVMModel")

### Run Model on Validation Set
## TODO: output file of zipcodes and predicted success metrics
## TODO: Use bokeh on file to make visualization of the US
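# A hedged sketch (not from the original source) addressing the TODO above: per-class
# precision, recall and F1 for labels 0.0 and 1.0. MulticlassMetrics expects (prediction,
# label) pairs, so the (label, prediction) tuples from the snippet above are flipped first.
from pyspark.mllib.evaluation import MulticlassMetrics

predsAndLabels = testLabelsAndPreds.map(lambda lp: (lp[1], lp[0]))
perClassMetrics = MulticlassMetrics(predsAndLabels)
for cls in [0.0, 1.0]:
    print("Class %s precision = %s" % (cls, perClassMetrics.precision(cls)))
    print("Class %s recall    = %s" % (cls, perClassMetrics.recall(cls)))
    print("Class %s F1        = %s" % (cls, perClassMetrics.fMeasure(cls, beta=1.0)))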
def train_model (conf): sc = SparkUtil.get_spark_context (conf.spark_conf) conf.output_dir = conf.output_dir.replace ("file:", "") conf.output_dir = "file://{0}".format (conf.output_dir) labeled = Evaluate.load_all (sc, conf). \ map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0, features = [ b.paraDist, b.sentDist, b.docDist ] ) ) # labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \ # map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0, # [ b, b * 2, b * 9 ] ) ) # print (labeled.collect ()) train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345) count = train.count () start = time.time () model = LogisticRegressionWithLBFGS.train (train) elapsed = time.time () - start print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed)) start = time.time () model_path = os.path.join (conf.output_dir, "eval", "model") file_path = model_path.replace ("file://", "") if os.path.isdir (file_path): print ("Removing existing model {0}".format (file_path)) shutil.rmtree (file_path) model.save(sc, model_path) sameModel = LogisticRegressionModel.load(sc, model_path) elapsed = time.time () - start print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed)) # Metrics labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features))) trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count () / float (train.count()) print("Training Error => {0}".format (trainErr)) predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) )) metrics = MulticlassMetrics (predictionsAndLabels) print (" --------------> {0}".format (predictionsAndLabels.take (1000))) #print (labelsAndPreds.collect ()) print ("\nMETRICS:") try: print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0))) print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0))) except: traceback.print_exc () try: print ("precision : {0}".format (metrics.precision(1.0))) except: traceback.print_exc () try: print ("recall : {0}".format (metrics.recall(1.0))) except: traceback.print_exc () try: print ("fMeasure : {0}".format (metrics.fMeasure(0.0, 2.0))) except: traceback.print_exc () print ("confusion matrix : {0}".format (metrics.confusionMatrix().toArray ())) print ("precision : {0}".format (metrics.precision())) print ("recall : {0}".format (metrics.recall())) print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate)) print ("weighted precision : {0}".format (metrics.weightedPrecision)) print ("weighted recall : {0}".format (metrics.weightedRecall)) print ("weight f measure : {0}".format (metrics.weightedFMeasure())) print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0))) print ("") # Regression metrics predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) ) regression_metrics = RegressionMetrics (predictedAndObserved) print ("explained variance......: {0}".format (regression_metrics.explainedVariance)) print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError)) print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError)) print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError)) print ("r2......................: {0}".format (regression_metrics.r2)) print ("") labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features))) testErr = labelsAndPreds.filter (lambda (v, p): v != p).count () / 
float (test.count ()) print ("Testing Error => {0}".format (testErr))
training, test = data.randomSplit([0.6, 0.4], seed=11)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Statistics by class
labels = data.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

# Weighted stats
print("Weighted recall = %s" % metrics.weightedRecall)
print("Weighted precision = %s" % metrics.weightedPrecision)
def logisticRegression(trainFile, testFile, taskid, sc): # Load training data in LIBSVM format trainData = MLUtils.loadLibSVMFile(sc, trainFile) testData = MLUtils.loadLibSVMFile(sc, testFile) # Split data into training (60%) and test (40%) # traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L) # traindata.cache() # Load testing data in LIBSVM format #testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath) labelNum = trainData.map(lambda lp: lp.label).distinct().count() # Run training algorithm to build the model model = LogisticRegressionWithLBFGS.train(trainData, numClasses=labelNum) # Compute raw scores on the test set predictionAndLabels = testData.map(lambda lp: (float(model.predict(lp.features)), lp.label)) Json.generateJson("LogisticRegression", taskid, trainData, predictionAndLabels); # Instantiate metrics object metrics = MulticlassMetrics(predictionAndLabels) # Overall statistics precision = metrics.precision() recall = metrics.recall() f1Score = metrics.fMeasure() #confusion_matrix = metrics.confusionMatrix().toArray() print("Summary Stats") print("Precision = %s" % precision) print("Recall = %s" % recall) print("F1 Score = %s" % f1Score) # Statistics by class labels = trainData.map(lambda lp: lp.label).distinct().collect() for label in sorted(labels): print("Class %s precision = %s" % (label, metrics.precision(label))) print("Class %s recall = %s" % (label, metrics.recall(label))) print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0))) # Weighted stats print("Weighted recall = %s" % metrics.weightedRecall) print("Weighted precision = %s" % metrics.weightedPrecision) print("Weighted F(1) Score = %s" % metrics.weightedFMeasure()) print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5)) print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate) # #return model parameters # res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)), # ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)), # ('3','Yes','Precision', metrics.precision(0.0)), # ('4','Yes','Recall', metrics.recall(0.0)), # ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)), # ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)), # ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)), # ('3','Yes','Precision', metrics.precision(1.0)), # ('4','Yes','Recall', metrics.recall(1.0)), # ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)), # ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)), # ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)), # ('3','Yes','Precision', metrics.precision(2.0)), # ('4','Yes','Recall', metrics.recall(2.0)), # ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))] # #save output file path as JSON and dump into dumpFilePath # rdd = sc.parallelize(res) # SQLContext.createDataFrame(rdd).collect() # df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value']) #tempDumpFilePath = dumpFilePath + "/part-00000" #if os.path.exists(tempDumpFilePath): # os.remove(tempDumpFilePath) #df.toJSON().saveAsTextFile(hdfsFilePath) #tmpHdfsFilePath = hdfsFilePath + "/part-00000" #subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath]) # Save and load model #clusters.save(sc, "myModel") #sameModel = KMeansModel.load(sc, "myModel")
training, test = data.randomSplit([0.85, 0.15], seed=11)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=39)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
# accuracy = metrics.accuracy
accuracy = 1.0 * predictionAndLabels.filter(lambda xv: xv[0] == xv[1]).count() / test.count()
# print("Summary Stats")
# print("Precision = %s" % precision)
# print("Recall = %s" % recall)
# print("F1 Score = %s" % f1Score)
# print("Accuracy = %s" % accuracy)

# Statistics by class
labels = data.map(lambda lp: lp.label).distinct().collect()
# for label in sorted(labels):
#     print("Class %s precision = %s" % (label, metrics.precision(label)))
#     print("Class %s recall = %s" % (label, metrics.recall(label)))
#     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

# Weighted stats
def test_prfs(): # TODO: revised so that it will take user's inputs instead of hardcoded values """ Test Precision, Recall, Fscore, and Support on multiclass classification data Input data: https://github.com/apache/spark/blob/master/data/mllib/sample_multiclass_classification_data.txt. """ # load the schemas (if existed) # create a hdfs directory #os.system("hdfs dfs -mkdir datasets") # load the data file into the hdfs directory os.system("hdfs dfs -put sample_multiclass_classification_data.txt datasets/sample_multiclass_classification_data.txt") data = MLUtils.loadLibSVMFile(scsingleton.sc, "hdfs://localhost:9000/datasets/sample_multiclass_classification_data.txt") # print data.take(1) # ie. [LabeledPoint(1.0, (4,[0,1,2,3],[-0.222222,0.5,-0.762712,-0.833333]))] # [ ( finalClassification, (numLabels, [label0, label1, label2, ..., labelN], [prob0, prob1, prob2, ..., probN]) ) ] # split data into train (60%), test (40%) trainingRDD, testRDD = data.randomSplit([0.6, 0.4]) trainingRDD.cache() testRDD.cache() with Timer() as t: numTest = testRDD.count() print "testRDD.count(): %s seconds" % t.secs # run training algorithm to build the model # without validation with Timer() as t: model = LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3) print "LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3): %s seconds" % t.secs # make a prediction with Timer() as t: testPredAndLabel = testRDD.map(lambda lp: (float(model.predict(lp.features)), lp.label)) print "testPredAndLabel: %s seconds" % t.secs # calculate Precision, Recall, F1-score metrics = MulticlassMetrics(testPredAndLabel) print( "precision = %s" % metrics.precision() ) print( "recall = %s" % metrics.recall() ) print( "f1-score = %s" % metrics.fMeasure() ) # statistics by class labels = data.map(lambda lp: lp.label).distinct().collect() for label in sorted(labels): print( "Class %s precision = %s" % (label, metrics.precision(label)) ) print( "Class %s recall = %s" % (label, metrics.recall(label)) ) print( "Class %s f1-score = %s" % (label, metrics.fMeasure(label, beta=1.0)) ) # weighted stats print( "Weighted precision = %s" % metrics.weightedPrecision ) print( "Weighted recall = %s" % metrics.weightedRecall ) print( "Weighted f1-score = %s" % metrics.weightedFMeasure() ) print( "Weighted f(0.5)-score = %s" % metrics.weightedFMeasure(beta=0.5) ) print( "Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate ) return
# Load testing data in LIBSVM format
# testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

# Compute raw scores on the test set
predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
# confusion_matrix = metrics.confusionMatrix().toArray()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Statistics by class
labels = traindata.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
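# A hedged closing sketch (not from the original source): deriving overall accuracy and
# per-class support directly from the confusion matrix as a cross-check. It reuses the
# `metrics` object from the snippet above; MulticlassMetrics orders rows/columns by
# ascending class label, with actual classes in rows and predicted classes in columns.
import numpy as np

cm = metrics.confusionMatrix().toArray()
accuracy = np.trace(cm) / cm.sum()   # diagonal holds the correctly classified counts
support = cm.sum(axis=1)             # actual instances per class (row sums)
print("Accuracy from confusion matrix = %s" % accuracy)
print("Support per class = %s" % support)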