def evaluate(predictions):
    """
    Evaluation Metrics
    """
    # label to indexedLabel mappings
    # out = sorted(set([(i[0], i[1]) for i in predictions.select(predictions.label, predictions.indexedLabel).collect()]), key=lambda x: x[0])

    print("Predictions")
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and evaluate model
    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # Statistics by class
    labels = predictions.rdd.map(lambda lp: lp.indexedLabel).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    # `model` (the fitted Pipeline from the enclosing script) is assumed to be in scope here
    treeModel = model.stages[2]
    print(treeModel)  # summary only
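
# For context, a minimal sketch (not part of the original example) of the kind of pipeline
# assumed above: a StringIndexer producing "indexedLabel", a VectorAssembler, and a
# DecisionTreeClassifier, so that model.stages[2] is the fitted tree. The feature column
# names, train_df, and test_df are placeholders.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")

pipeline = Pipeline(stages=[indexer, assembler, dt])
model = pipeline.fit(train_df)          # train_df is assumed to exist
predictions = model.transform(test_df)  # test_df is assumed to exist
evaluate(predictions)
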
def evaluate(labelsAndPredictions, data, labels):
    """
    Evaluation Metrics
    """
    # Instantiate metrics object
    metrics = MulticlassMetrics(labelsAndPredictions)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # Statistics by class
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
Example #3
def NaiveBayesEvaluation(TransformedDataset):

    nb = NaiveBayes()
    nb.setLabelCol("LabelIndex")
    nb.setPredictionCol("Label_Prediction")
    training, test = TransformedDataset.randomSplit([0.8, 0.2], seed=11)
    nvModel = nb.fit(training)
    prediction = nvModel.transform(test)

    # selected = prediction.select("body", "LabelIndex", "label", "Label_Prediction")
    # for row in selected.collect():
    #     print(row)

    from pyspark.mllib.evaluation import MulticlassMetrics

    predictionAndLabels = prediction.select(
        "Label_Prediction",
        "LabelIndex").rdd.map(lambda r: (float(r[0]), float(r[1])))

    # predictionAndLabels = test.rdd.map(lambda lp: (float(nvModel.predict(lp.features)), lp.label))
    metrics = MulticlassMetrics(predictionAndLabels)

    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = prediction.rdd.map(lambda lp: lp.label).distinct().collect()
    labelIndices = prediction.rdd.map(
        lambda lp: lp.LabelIndex).distinct().collect()
    labelIndicesPairs = prediction.rdd.map(
        lambda lp: (lp.label, lp.LabelIndex)).distinct().collect()

    print("Labels", labels)
    print("Label Indices", labelIndices)
    print("Label Index Pairs", labelIndicesPairs)

    for label, labelIndex in sorted(labelIndicesPairs):
        print("\n Class %s precision = %s" %
              (label, metrics.precision(labelIndex)))
        print("Class %s recall = %s" % (label, metrics.recall(labelIndex)))
        print(
            "Class %s F1 Measure = %s" %
            (label, metrics.fMeasure(labelIndex, beta=1.0)), "\n")

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" %
          metrics.weightedFalsePositiveRate)
def evaluate_predictions(predictions, show=True):
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
    log = {}

    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['auroc'] = evaluator.evaluate(predictions)

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['aupr'] = evaluator.evaluate(predictions)

    # Metrics
    predictionRDD = predictions.select(
        ['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Overall statistics
    log['precision'] = metrics.precision()
    log['recall'] = metrics.recall()
    log['F1 Measure'] = metrics.fMeasure()

    # Statistics by class
    distinctPredictions = collect_tuple(
        predictions.select('prediction').distinct())
    for x in sorted(distinctPredictions):
        log[x] = {}
        log[x]['precision'] = metrics.precision(x)
        log[x]['recall'] = metrics.recall(x)
        log[x]['F1 Measure'] = metrics.fMeasure(x, beta=1.0)

    # Confusion Matrix
    log['cm'] = metrics.confusionMatrix().toArray()
    log['cmpercent'] = cm_percent(log['cm'], predictions.count(), show)

    if show:
        show_predictions(predictions)

        print('Confusion Matrix')
        print(' TP', 'FN\n', 'FP', 'TN')
        print(log['cm'])
        print(' PC', 'FN\n', 'FP', 'PW')
        print(log['cmpercent'])
        print('')
        print("Area under ROC = {}".format(log['auroc']))
        print("Area under AUPR = {}".format(log['aupr']))
        print('\nOverall\nprecision = {}\nrecall = {}\nF1 Measure = {}\n'.
              format(log['precision'], log['recall'], log['F1 Measure']))

        for x in sorted(distinctPredictions):
            print('Label {}\nprecision = {}\nrecall = {}\nF1 Measure = {}\n'.
                  format(x, log[x]['precision'], log[x]['recall'],
                         log[x]['F1 Measure']))

    return log
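
# The helpers used above (collect_tuple, cm_percent, show_predictions) are not shown in this
# example. The following are plausible sketches of what they might look like -- assumptions,
# not the original implementations.
import numpy as np

def collect_tuple(df):
    # flatten a single-column DataFrame into a plain Python list of values
    return [row[0] for row in df.collect()]

def cm_percent(cm, n, show=True):
    # express each confusion-matrix cell as a percentage of the total prediction count
    return np.round(100.0 * cm / n, 2)

def show_predictions(predictions, n=5):
    # the selected columns are an assumption; adjust to whatever the model produces
    predictions.select('label', 'prediction').show(n)
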
Example #5
def printMetrics(predictions_and_labels, output_file):
    metrics = MulticlassMetrics(predictions_and_labels)
    output_file.write('Precision of True  ' + str(metrics.precision(1)) + '\n')
    output_file.write('Precision of False ' + str(metrics.precision(0)) + '\n')
    output_file.write('Recall of True     ' + str(metrics.recall(1)) + '\n')
    output_file.write('Recall of False    ' + str(metrics.recall(0)) + '\n')
    output_file.write('F-1 Score          ' + str(metrics.fMeasure()) + '\n')
    output_file.write('Confusion Matrix\n' + str(metrics.confusionMatrix().toArray()) + '\n')

    print('Precision of True  ' + str(metrics.precision(1)))
    print('Precision of False ' + str(metrics.precision(0)))
    print('Recall of True     ' + str(metrics.recall(1)))
    print('Recall of False    ' + str(metrics.recall(0)))
    print('F-1 Score          ' + str(metrics.fMeasure()))
    print('Confusion Matrix\n' + str(metrics.confusionMatrix().toArray()))
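
# Example usage of printMetrics above -- a sketch only; rf_model, test_data, and the output
# path are assumptions. predictions_and_labels must be an RDD of (prediction, label) pairs.
predictions_and_labels = test_data.map(
    lambda lp: (float(rf_model.predict(lp.features)), lp.label))
with open('metrics.txt', 'w') as output_file:
    printMetrics(predictions_and_labels, output_file)
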
Example #6
def main(spark, model_file, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to the serialized model file to load

    data_file : string, path to the parquet file to load
    '''

    # Load the best logistic regression model
    model = PipelineModel.load(model_file)
    # Load the test dataframe
    test = spark.read.parquet(data_file)

    predictions = model.transform(test)

    predictionAndLabels = predictions.rdd.map(lambda lp:
                                              (lp.prediction, lp.label))
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Overall Stats:")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted F1 Score = %s" % metrics.weightedFMeasure())

    # Statistics by class
    print("Stats by class")

    for (genre, label) in predictions.select('genre',
                                             'label').distinct().collect():
        print("Class %s precision = %s" % (genre, metrics.precision(label)))
        print("Class %s recall = %s" % (genre, metrics.recall(label)))
        print("Class %s F1 Score = %s" %
              (genre, metrics.fMeasure(label, beta=1.0)))
Example #7
def multi_clf_performance(name, method, train, test):
    model = method.fit(train)
    prediction = model.transform(test)
    print(f"-----------Performance of {name} on testing set-----------")
    # Compute raw scores on the test set
    predictionAndLabels = prediction.select('prediction', 'label')
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels.rdd)
    # Overall statistics (multi_evaluator is assumed to be a MulticlassClassificationEvaluator
    # defined at module scope)
    print("----------Summary Stats----------------------")
    print(f"Weighted precision: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'weightedPrecision'})}")
    print(f"Weighted recall: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'weightedRecall'})}")
    print(f"F1 Score: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'f1'})}")
    print(f"Accuracy: {multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: 'accuracy'})}")

    # Statistics by class
    print("--------Stats by class----------------------")
    labels = [row.asDict()['label'] for row in test.select('label').distinct().collect()]
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Score = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    #print("--------Weighted Stats----------------------")
    #print("Weighted precision = %s" % metrics.weightedPrecision)
    #print("Weighted recall = %s" % metrics.weightedRecall) 
    #print("Weighted F1 Score = %s" % metrics.weightedFMeasure())
    #print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    print("-----------------------------------------------------------")
Example #8
    def calculate_metrics(self, df):
        """

        Define custom metrics to evaluate cross validation.

        :param df: dataframe containing `prediction` and `label` columns

        :returns: dict of unweighted classification metrics for the positive
            class (true/false positive rate, precision, recall, F-measure)
        """

        # cast the ground-truth label to float so MulticlassMetrics accepts (prediction, label) pairs
        preds_and_labels = df.select('prediction',
                                     f.col('label').cast(t.FloatType()))
        metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

        # per-class measures for the positive class (label=1.0)
        metrics_dict = dict(
            # unweighted measures
            tpr=metrics.truePositiveRate(label=1.0),
            fpr=metrics.falsePositiveRate(label=1.0),
            precision=metrics.precision(label=1.0),
            recall=metrics.recall(label=1.0),
            fMeasure=metrics.fMeasure(label=1.0))

        metrics_dict = {
            k: round(v, 3) if k != "confusion" else v
            for k, v in metrics_dict.items()
        }

        return metrics_dict
Example #9
def evaluate(df, labelCols, gettopX=-1, getfirstX=-1):
    labelCols2 = [i + "_pred" for i in labelCols]
    df.cache()

    r_list = {
        i: np.zeros((len(labelCols)))
        for i in ['accuracy', 'precision', 'recall', 'fmeasure']
    }
    for i in range(len(labelCols)):
        predandlabels = df.select(labelCols2[i], labelCols[i]).rdd \
                        .map(lambda x: (float(x[labelCols2[i]]), float(x[labelCols[i]])))
        metrics = MulticlassMetrics(predandlabels)

        # print metrics.confusionMatrix()
        r_list['accuracy'][i] = metrics.accuracy
        r_list['precision'][i] = metrics.precision(1.0)
        r_list['recall'][i] = metrics.recall(1.0)
        r_list['fmeasure'][i] = metrics.fMeasure(label=1.0)

    results = {}
    for m, rs in r_list.items():
        results[m] = np.mean(rs)

    for code, num in [('top', gettopX), ('first', getfirstX)]:
        if num <= 0: continue

        if code == 'top':
            idx = np.argsort(np.nan_to_num(r_list['fmeasure']))[-num:]
        elif code == 'first':
            idx = list(range(num))

        for m, rs in r_list.items():
            results['{0}_{1}'.format(m, code)] = np.mean(rs[idx])

    return results
def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions,
                             {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions,
                              {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # Get RDD of predictions and labels for eval metrics
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())
    # F1
    print("F1 = {}".format(multi_metrics.fMeasure(1.0)))
    # Precision
    print("Precision = {}".format(multi_metrics.precision(1.0)))
    # Recall
    print("Recall = {}".format(multi_metrics.recall(1.0)))
    # FPR
    print("FPR = {}".format(multi_metrics.falsePositiveRate(1.0)))
    # TPR
    print("TPR = {}".format(multi_metrics.truePositiveRate(1.0)))
Example #11
def getF1Score(model, test_df):
    pred = model.transform(test_df)
    pl = pred.select("label", "prediction").rdd.cache()
    metrics = MulticlassMetrics(pl)
    f1score = metrics.fMeasure()
    print("the F1-score of the model is : {}".format(f1score))
    return f1score
def printMeasurementMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision Result of setosa: ', metrics.precision(1))
    print('Precision Result of versicolor:', metrics.precision(2))
    print('Precision Result of virginica:', metrics.precision(3))
    print('F-1 Score:         ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
Example #13
def evaluate(model, word_column="words", vectorizer="w2v"):
    doc2vecs_df = featurize(word_column, vectorizer)
    if type(model) == LinearSVC:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.regParam, [0.1]) \
            .build()
    elif type(model) == GBTClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [50]) \
            .build()
    elif type(model) == RandomForestClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxBins, [100]) \
            .build()
    elif type(model) == MultilayerPerceptronClassifier:
        paramGrid = ParamGridBuilder() \
             .addGrid(model.layers, [[122, 50, 2]]) \
             .build()
        # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]]) \
    elif type(model) == FMClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.stepSize, [.01, .001]) \
            .build()
    print('Evaluating...')
    w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])
    si = StringIndexer(inputCol="LABEL", outputCol="label")
    model_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    classifier_pipeline = Pipeline(stages=[si, model])
    crossval = CrossValidator(estimator=classifier_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=model_evaluator,
                              numFolds=5)
    # fit on the training split only so the held-out test split stays unseen
    fit_model = crossval.fit(w2v_train_df)
    predictions = fit_model.transform(w2v_test_df)
    # predictions.toPandas().to_csv('predictions.csv')
    # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY')
    # predictions.describe()
    summarizer = Summarizer.metrics("mean", "count")
    predictions.select(
        summarizer.summary(predictions.filter(
            predictions.label == 1).pos)).show(truncate=False)
    preds_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print('Confusion Matrix')
    print(metrics.confusionMatrix().toArray())
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    accuracy = model_evaluator.evaluate(predictions)
    trainingSummary = fit_model.bestModel.stages[-1].extractParamMap()
    print(trainingSummary)

    return accuracy
Example #14
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True    ', metrics.recall(1))
    print('Recall of False   ', metrics.recall(0))
    print('F-1 Score         ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
Example #15
def printMetrics(result):
    metrics = MulticlassMetrics(result)
    print("\nPrecision of True\n", metrics.precision(1))
    print("\nPrecision of False\n", metrics.precision(0))
    print("\nRecall of True\n", metrics.recall(1))
    print("\nRecall of False\n", metrics.recall(0))
    print("\nF1 score\n", metrics.fMeasure())
    print("\nConfusion Matrix\n", metrics.confusionMatrix().toArray())
Example #16
def evaluate(df_prediction):
    evaluator = BinaryClassificationEvaluator()
    roc = evaluator.evaluate(df_prediction, {evaluator.metricName: "areaUnderROC"})
    pr = evaluator.evaluate(df_prediction, {evaluator.metricName: "areaUnderPR"})
    predictionRDD = df_prediction.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)
    f1 = metrics.fMeasure()
    return [roc,pr,f1]
Example #17
def displayMetrics(pred):
    ev = MulticlassMetrics(pred.select(["label", "prediction"]).rdd)

    # Overall statistics
    print("Accuracy = %s" % ev.accuracy)
    print("Precision = %s" % ev.precision())
    print("Recall = %s" % ev.recall())
    print("F1 Score = %s" % ev.fMeasure())
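
# Note: the no-argument precision()/recall()/fMeasure() calls used above are deprecated and,
# in recent Spark releases, require an explicit label. A DataFrame-based alternative -- a
# sketch assuming `pred` has "label" and "prediction" columns -- is MulticlassClassificationEvaluator:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
for metric in ["accuracy", "weightedPrecision", "weightedRecall", "f1"]:
    value = evaluator.evaluate(pred, {evaluator.metricName: metric})
    print("%s = %s" % (metric, value))
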
def main(spark, model_file, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to the serialized model file to load

    data_file : string, path to the parquet file to load
    '''

    # Load data.
    dataset = spark.read.parquet(data_file)

    # Load  model.
    model = PipelineModel.load(model_file)

    prediction = model.transform(dataset)

    predictionAndLabels = prediction.select(["prediction", "label"]).rdd

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("\n")
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("\n")
    labels = predictionAndLabels.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" %
              (label, metrics.fMeasure(label, beta=1.0)))
Example #19
def evaluate(predictionsAndLabels):  # input: RDD of (prediction, label) pairs
    testErr = predictionsAndLabels.filter(
        lambda lp: lp[0] != lp[1]).count() / float(predictionsAndLabels.count())
    metrics = MulticlassMetrics(predictionsAndLabels)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    return testErr, precision, recall, f1Score
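
# Example usage of evaluate above -- a sketch; `model` (an mllib classifier) and `testData`
# (an RDD of LabeledPoint) are assumptions.
predictionsAndLabels = testData.map(
    lambda lp: (float(model.predict(lp.features)), lp.label))
testErr, precision, recall, f1Score = evaluate(predictionsAndLabels)
print("Test Error = %s, F1 = %s" % (testErr, f1Score))
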
def metrics_basic(data):
    metrics = MulticlassMetrics(data)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
Example #21
    def printMetrics(self, preds, prediction="prediction", indexedLabel="indexedLabel"):
        metrics = MulticlassMetrics(preds.select(prediction, indexedLabel).rdd)

        labels = [0, 1]
        for label in sorted(labels):
            try:
                print("Class %s precision = %s" % (label, metrics.precision(label)))
                print("Class %s recall = %s" % (label, metrics.recall(label)))
                print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
            except Exception:
                print("No malicious predictions")
def classification_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate class-level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: x.test_label[0]).collect())
    print('Class\tPrecision\tRecall\tF-Score')
    for c in sorted(classes):
        print('{}\t{}\t{}\t{}'.format(c, round(metrics.precision(c), 3),
                                      round(metrics.recall(c), 3),
                                      round(metrics.fMeasure(c), 3)))
def evaluate(predictionAndLabels):
    log = {}

    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['AUROC'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['AUPR'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predictionAndLabels.select(['label', 'prediction']) \
                            .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" % \
            (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, beta=1.0)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s" \
                  % (label, log[label]['precision'],
                    log[label]['recall'], log[label]['F1 Measure']))

    return log
    def evaluateClassification(self, predictionAndLabels):

        metrics = MulticlassMetrics(predictionAndLabels)
        cm = metrics.confusionMatrix()

        result = {}

        result['Matrix'] = cm.toArray().tolist()
        result['Precision'] = metrics.precision()
        result['Recall'] = metrics.recall()
        result['F1 Score'] = metrics.fMeasure()

        return result
Example #25
def overall_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Instantiate metrics object
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    return sc.parallelize([
        (Vectors.dense(metrics.precision()), Vectors.dense(metrics.recall()),
         Vectors.dense(metrics.fMeasure()))
    ]).toDF(['Precision', 'Recall', 'F-Score'])
def main():
    sc = SparkContext(appName="BayesClassifier")
    htf = HashingTF(50000)
    data = sc.textFile('/home/varshav/work/PycharmProjects/Sentiment/cleaned_bayes_labels.csv')
    data_cleaned = data.map(lambda line : line.split(","))
    # Create an RDD of LabeledPoints using category labels as labels and tokenized, hashed text as feature vectors
    data_hashed = data_cleaned.map(lambda lt: LabeledPoint(lt[0], htf.transform(lt[1])))
    data_hashed.persist()
    # data = sc.textFile('/home/admin/work/spark-1.4.1-bin-hadoop2.4/data/mllib/sample_naive_bayes_data.txt').map(parseLine)
    #print data
    # Split data aproximately into training (60%) and test (40%)
    training, test = data_hashed.randomSplit([0.70, 0.30], seed=0)

    sameModel = NaiveBayesModel.load(sc, "/home/varshav/work/PycharmProjects/StockAnalysis/myModel")

    print("----------")
    print(sameModel.predict(htf.transform("posts jump in net profit")))

    predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
    predictionAndLabel1 = training.map(lambda p: (sameModel.predict(p.features), p.label))
    prediction = 1.0 * predictionAndLabel.filter(lambda xv: xv[0] == xv[1]).count() / test.count()
    prediction1 = 1.0 * predictionAndLabel1.filter(lambda xv: xv[0] == xv[1]).count() / training.count()
    buy_buy = 1.0 * predictionAndLabel.filter(lambda xv: xv[0] == 1 and xv[1] == 1).count()


    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabel)

    # Overall statistics (normalize() is assumed to be defined elsewhere in this script)
    precision = normalize(metrics.precision())
    recall = normalize(metrics.recall())
    f1Score = normalize(metrics.fMeasure())
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    '''
    # Statistics by class
    labels = data_hashed.map(lambda lp: lp.label).distinct().collect()

    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    '''
Example #27
def classification_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate class-level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: x.test_labels[0]).collect())
    results = [(Vectors.dense(float(c)),
                Vectors.dense(round(metrics.precision(c), 3)),
                Vectors.dense(round(metrics.recall(c), 3)),
                Vectors.dense(round(metrics.fMeasure(c), 3)))
               for c in sorted(classes)]
    return sc.parallelize(results).toDF(
        ['Class', 'Precision', 'Recall', 'F-Score'])
def validate_tffm(spark, sc, model, test_df, s3_metrics_path, s3_endpoint_path):
    # get predictions
    validation_df = model.transform(test_df)
    
    metricsSchema = StructType() \
        .add("metric", StringType()) \
        .add("value", DoubleType())
    metrics_names = []

    # apply threshold
    def thresholdScore(x):
        retval = 0.0
        if x > 0.5:
            retval = 1.0
        return retval
    
    thresholdScoreUdf = F.UserDefinedFunction(thresholdScore, T.FloatType())
    
    validation_df_round = validation_df.withColumn('rscore', thresholdScoreUdf(validation_df.score)) 
    predTffm = validation_df_round.select(['label','rscore'])

    predictionAndLabelsTffm = predTffm.rdd.map(lambda lp: (lp.rscore, lp.label))
    metricsTffm = BinaryClassificationMetrics(predictionAndLabelsTffm)

    metrics_names.append(("Area_under_PR",metricsTffm.areaUnderPR))
    metrics_names.append(("Area_under_ROC",metricsTffm.areaUnderROC))

    mmetricsTffm = MulticlassMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Precision",mmetricsTffm.precision()))
    metrics_names.append(("Recall",mmetricsTffm.recall()))
    metrics_names.append(("F1",mmetricsTffm.fMeasure()))
    metrics_names.append(("Weighted_recall",mmetricsTffm.weightedRecall))
    metrics_names.append(("Weighted_precision",mmetricsTffm.weightedPrecision))
    metrics_names.append(("Weighted_F1",mmetricsTffm.weightedFMeasure()))
    metrics_names.append(("Weighted_F05",mmetricsTffm.weightedFMeasure(beta=0.5)))
    metrics_names.append(("Weighted_FP_rate",mmetricsTffm.weightedFalsePositiveRate))

    mRdd = sc.parallelize(metrics_names).coalesce(1)
    dfMetrics = spark.createDataFrame(mRdd, metricsSchema)
    dfMetrics.write.csv("{0}/{1}".format(s3_metrics_path, model.endpointName), mode="overwrite")

    endpointSchema = StructType() \
        .add("time", StringType()) \
        .add("endpoint", StringType())
    endpoint_name = []
    endpoint_name.append((str(time.time()),str(model.endpointName)))
    eRdd = sc.parallelize(endpoint_name).coalesce(1)
    dfEndpoint = spark.createDataFrame(eRdd, endpointSchema)
    dfEndpoint.write.csv("{0}/endpoint.txt".format(s3_endpoint_path), mode="overwrite")
Example #29
def generateJson(AlgorithmName, taskid, traindata, predictionAndLabels):
	jsonContent = dict()
	jsonContent['AlgorithmName'] = AlgorithmName
	jsonContent['TaskId'] = taskid

	labels = traindata.map(lambda lp: lp.label).distinct().collect()
	jsonContent['LabelNum'] = len(labels)

	metrics = MulticlassMetrics(predictionAndLabels)
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	confusion_matrix = metrics.confusionMatrix().toArray()

	jsonContent['Precision'] = precision
	jsonContent['Recall'] = recall
	jsonContent['F1Score'] = f1Score
	jsonContent['ConfusionMatrix'] = confusion_matrix.tolist()

	jsonContent['Labels'] = list()
	for label in sorted(labels):
		tempList = dict()
		tempList['Precision'] = metrics.precision(label)
		tempList['Recall'] = metrics.recall(label)
		tempList['F1Measure'] = metrics.fMeasure(label, beta=1.0)

		jsonContent['Labels'].append(tempList)
	
	jsonContent['WeightedStats'] = dict()
	jsonContent['WeightedStats']['Precision'] = metrics.weightedPrecision
	jsonContent['WeightedStats']['Recall'] = metrics.weightedRecall
	jsonContent['WeightedStats']['F1Score'] = metrics.weightedFMeasure()
	jsonContent['WeightedStats']['FalsePositiveRate'] = metrics.weightedFalsePositiveRate

	with open(taskid + '.json', 'w') as jsonFile:
		json.dump(jsonContent, jsonFile, indent=4, separators=(',', ': '))
		jsonFile.flush()
Example #30
def performance(predictions):
    predictionRDD = predictions.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
        
    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)
    
    results = {'predictions':predictions,
               'areaUnderROC':binmetrics.areaUnderROC,
               'areaUnderPR':binmetrics.areaUnderPR,
               'confusionMatrix':metrics.confusionMatrix().toArray(),
               'accuracy':metrics.accuracy,
               'precision':metrics.precision(),
               'recall':metrics.recall(),
               'f1measure':metrics.fMeasure()}
    
    return results
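
# Example usage of performance above -- a sketch; lr_model and test_df are assumptions.
# `predictions` is the DataFrame returned by a fitted pyspark.ml classifier.
predictions = lr_model.transform(test_df)
results = performance(predictions)
print("AUC = {}, accuracy = {}".format(results['areaUnderROC'], results['accuracy']))
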
def overall_report(actual_data, prediction_data):
    # Get actual / predicted labels as an RDD of pairs
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Instantiate metrics object
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    # return sc.parallelize([(Vectors.dense(metrics.accuracy),
    #                         Vectors.dense(metrics.precision()),
    #                         Vectors.dense(metrics.recall()),
    #                         Vectors.dense(metrics.fMeasure()))]).toDF(['Accuracy', 'Precision', 'Recall', 'F - Score'])
    print('Accuracy\tPrecision\tRecall\tF-Score')
    print('{}\t{}\t{}\t{}'.format(metrics.accuracy, metrics.precision(),
                                  metrics.recall(), metrics.fMeasure()))
Example #32
    def performancerdd(self):
        self.calculator = 'RDDs'
        print('Calculating performance metrics using RDDs...')
        predictionRDD = self.predictions.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))

        binmetrics = BinaryClassificationMetrics(predictionRDD)
        metrics = MulticlassMetrics(predictionRDD)

        self.areaUnderROC = binmetrics.areaUnderROC
        self.areaUnderPR = binmetrics.areaUnderPR
        self.confusionMatrix = metrics.confusionMatrix().toArray()
        self.accuracy = metrics.accuracy
        self.precision = metrics.precision()
        self.recall = metrics.recall()
        self.f1measure = metrics.fMeasure()
        self.falsePositive = metrics.falsePositiveRate(1.0)
        self.falseNegative = metrics.falsePositiveRate(0.0)
Example #33
def printFinalResultMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('\n')
    print('Precision of Setosa ', metrics.precision(1))
    print('Precision of Versicolor', metrics.precision(2))
    print('Precision of Virginica', metrics.precision(3))
    print('\n')
    print('Recall of Setosa    ', metrics.recall(1))
    print('Recall of Versicolor   ', metrics.recall(2))
    print('Recall of Virginica   ', metrics.recall(3))

    print('\n')
    print('F-1 Score         ', metrics.fMeasure())
    print('\n\n')
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())

    print('\n\n')
    return
Example #34
def modelStatistics(labelsAndPredictions):
    metrics = MulticlassMetrics(labelsAndPredictions)
    print(metrics.confusionMatrix())

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
Example #35
model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainParsed.count())
print(trainErr)

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testParsed.count())
print(testErr)

# The metrics constructors expect (prediction/score, label) ordering, so swap the pairs
testPredsAndLabels = testLabelsAndPreds.map(lambda vp: (vp[1], vp[0]))

metrics = BinaryClassificationMetrics(testPredsAndLabels)

print(metrics.areaUnderROC)
print(metrics.areaUnderPR)

mcMetrics = MulticlassMetrics(testPredsAndLabels)

#TODO: Do this for classes 1.0,0.0 and not just overall
print(mcMetrics.precision())
print(mcMetrics.recall())
print(mcMetrics.fMeasure())

model.save(sc, "SVMModel")
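
# The saved model can be reloaded later -- a sketch; SVMModel comes from
# pyspark.mllib.classification and sc is the active SparkContext.
from pyspark.mllib.classification import SVMModel
sameModel = SVMModel.load(sc, "SVMModel")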

### Run Model on Validation Set
## TODO: output file of zipcodes and predicted success metrics
## TODO: Use bokeh on file to make visualization of the US
Example #36
    def train_model (conf):
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}".format (conf.output_dir)

        labeled = Evaluate.load_all (sc, conf). \
                  map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                                 features = [ b.paraDist, b.sentDist, b.docDist ] ) )

#        labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#                  map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                                 [ b, b * 2, b * 9 ] ) )
#        print (labeled.collect ())

        train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

        count = train.count ()
        start = time.time ()
        model = LogisticRegressionWithLBFGS.train (train)
        elapsed = time.time () - start
        print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

        start = time.time ()
        model_path = os.path.join (conf.output_dir, "eval", "model")
        file_path = model_path.replace ("file://", "")
        if os.path.isdir (file_path):
            print ("Removing existing model {0}".format (file_path))
            shutil.rmtree (file_path)
        model.save(sc, model_path)
        sameModel = LogisticRegressionModel.load(sc, model_path)
        elapsed = time.time () - start
        print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))


        # Metrics
        labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
        testSetErr = labelsAndPreds.filter (lambda vp: vp[0] != vp[1]).count () / float (test.count ())
        print ("Test-set Error => {0}".format (testSetErr))

        predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
        metrics = MulticlassMetrics (predictionsAndLabels) 
        print (" --------------> {0}".format (predictionsAndLabels.take (1000)))

        #print (labelsAndPreds.collect ())
        print ("\nMETRICS:")
        try:
            print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
            print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
        except Exception:
            traceback.print_exc ()
        try:
            print ("precision          : {0}".format (metrics.precision(1.0)))
        except Exception:
            traceback.print_exc ()
        try:
            print ("recall             : {0}".format (metrics.recall(1.0)))
        except Exception:
            traceback.print_exc ()
        try:
            print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
        except Exception:
            traceback.print_exc ()

        print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
        print ("precision          : {0}".format (metrics.precision()))
        print ("recall             : {0}".format (metrics.recall()))
        print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
        print ("weighted precision : {0}".format (metrics.weightedPrecision))
        print ("weighted recall    : {0}".format (metrics.weightedRecall))
        print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
        print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
        print ("")

        # Regression metrics
        predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )

        regression_metrics = RegressionMetrics (predictedAndObserved)
        print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
        print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
        print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
        print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
        print ("r2......................: {0}".format (regression_metrics.r2))
        print ("")

        labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
        testErr = labelsAndPreds.filter (lambda vp: vp[0] != vp[1]).count () / float (test.count ())
        print ("Testing Error => {0}".format (testErr))
    training, test = data.randomSplit([0.6, 0.4], seed=11)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
def logisticRegression(trainFile, testFile, taskid, sc):
	# Load training data in LIBSVM format
	trainData = MLUtils.loadLibSVMFile(sc, trainFile)
	testData = MLUtils.loadLibSVMFile(sc, testFile)

	# Split data into training (60%) and test (40%)
	# traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
	# traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	labelNum = trainData.map(lambda lp: lp.label).distinct().count()

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(trainData, numClasses=labelNum)

	# Compute raw scores on the test set
	predictionAndLabels = testData.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	Json.generateJson("LogisticRegression", taskid, trainData, predictionAndLabels);
	# Instantiate metrics object
	metrics = MulticlassMetrics(predictionAndLabels)

	# Overall statistics
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	#confusion_matrix = metrics.confusionMatrix().toArray()

	print("Summary Stats")
	print("Precision = %s" % precision)
	print("Recall = %s" % recall)
	print("F1 Score = %s" % f1Score)


	# Statistics by class
	labels = trainData.map(lambda lp: lp.label).distinct().collect()
	for label in sorted(labels):
	    print("Class %s precision = %s" % (label, metrics.precision(label)))
	    print("Class %s recall = %s" % (label, metrics.recall(label)))
	    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

	# Weighted stats
	print("Weighted recall = %s" % metrics.weightedRecall)
	print("Weighted precision = %s" % metrics.weightedPrecision)
	print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
	print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
	print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

	# #return model parameters
	# res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
	# 	   ('3','Yes','Precision', metrics.precision(0.0)),
	# 	   ('4','Yes','Recall', metrics.recall(0.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
	#        ('3','Yes','Precision', metrics.precision(1.0)),
	# 	   ('4','Yes','Recall', metrics.recall(1.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
	#        ('3','Yes','Precision', metrics.precision(2.0)),
	#        ('4','Yes','Recall', metrics.recall(2.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]

	# #save output file path as JSON and dump into dumpFilePath
	# rdd = sc.parallelize(res)
	# SQLContext.createDataFrame(rdd).collect()
	# df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value'])

	#tempDumpFilePath = dumpFilePath + "/part-00000"
	#if os.path.exists(tempDumpFilePath):
	#	os.remove(tempDumpFilePath)

	#df.toJSON().saveAsTextFile(hdfsFilePath)
	#tmpHdfsFilePath = hdfsFilePath + "/part-00000"
	#subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])

	# Save and load model
	#clusters.save(sc, "myModel")
	#sameModel = KMeansModel.load(sc, "myModel")
Example #39
    training, test = data.randomSplit([0.85, 0.15], seed=11)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=39)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    #accuracy = metrics.accuracy
    accuracy = 1.0 * predictionAndLabels.filter(lambda xv: xv[0] == xv[1]).count() / test.count()
    # print("Summary Stats")
    # print("Precision = %s" % precision)
    # print("Recall = %s" % recall)
    # print("F1 Score = %s" % f1Score)    
    # print("Accuracy = %s" % accuracy)
    # Statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    # for label in sorted(labels):
    #     print("Class %s precision = %s" % (label, metrics.precision(label)))
    #     print("Class %s recall = %s" % (label, metrics.recall(label)))
    #     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
Example #40
def test_prfs():
    # TODO: revised so that it will take user's inputs instead of hardcoded values
    
    """
    Test Precision, Recall, Fscore, and Support on multiclass classification data
    Input data: https://github.com/apache/spark/blob/master/data/mllib/sample_multiclass_classification_data.txt.
    """

    # load the schemas (if existed)

    # create a hdfs directory
    #os.system("hdfs dfs -mkdir datasets")

    # load the data file into the hdfs directory
    os.system("hdfs dfs -put sample_multiclass_classification_data.txt datasets/sample_multiclass_classification_data.txt")
    data = MLUtils.loadLibSVMFile(scsingleton.sc, "hdfs://localhost:9000/datasets/sample_multiclass_classification_data.txt")
   
    # print data.take(1)
    # ie. [LabeledPoint(1.0, (4,[0,1,2,3],[-0.222222,0.5,-0.762712,-0.833333]))] 
    # [ ( finalClassification, (numLabels, [label0, label1, label2, ..., labelN], [prob0, prob1, prob2, ..., probN]) ) ]

    # split data into train (60%), test (40%)
    trainingRDD, testRDD = data.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    with Timer() as t:
        numTest = testRDD.count()
    print("testRDD.count(): %s seconds" % t.secs)

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3)
    print("LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3): %s seconds" % t.secs)

    # make a prediction
    with Timer() as t:
        testPredAndLabel = testRDD.map(lambda lp: (float(model.predict(lp.features)), lp.label))
    print("testPredAndLabel: %s seconds" % t.secs)

    # calculate Precision, Recall, F1-score
    metrics = MulticlassMetrics(testPredAndLabel)
    print( "precision = %s" % metrics.precision() )
    print( "recall = %s" % metrics.recall() )
    print( "f1-score = %s" % metrics.fMeasure() )

    # statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print( "Class %s precision = %s" % (label, metrics.precision(label)) )
        print( "Class %s recall = %s" % (label, metrics.recall(label)) )
        print( "Class %s f1-score = %s" % (label, metrics.fMeasure(label, beta=1.0)) )

    # weighted stats
    print( "Weighted precision = %s" % metrics.weightedPrecision )
    print( "Weighted recall = %s" % metrics.weightedRecall )
    print( "Weighted f1-score = %s" % metrics.weightedFMeasure() )
    print( "Weighted f(0.5)-score = %s" % metrics.weightedFMeasure(beta=0.5) )
    print( "Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate )
    
    return
Example #41
	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

	# Compute raw scores on the test set
	predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	# Instantiate metrics object
	metrics = MulticlassMetrics(predictionAndLabels)

	# Overall statistics
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	#confusion_matrix = metrics.confusionMatrix().toArray()

	print("Summary Stats")
	print("Precision = %s" % precision)
	print("Recall = %s" % recall)
	print("F1 Score = %s" % f1Score)


	# Statistics by class
	labels = traindata.map(lambda lp: lp.label).distinct().collect()
	for label in sorted(labels):
	    print("Class %s precision = %s" % (label, metrics.precision(label)))
	    print("Class %s recall = %s" % (label, metrics.recall(label)))
	    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))