Code Example #1
 def printRddMulticlassClassificationMetrics(self, predictions_and_labels):
     metrics = MulticlassMetrics(predictions_and_labels)
     print "KAPPA=" + str(
         self.computeKappa(np.array(metrics.confusionMatrix().toArray())))
     print "BA=" + str(
         self.computeBA(np.array(metrics.confusionMatrix().toArray())))
     CMarray = metrics.confusionMatrix().toArray()
     #CMstring = ','.join(['%.5f' % num for num in CMarray])
     print "CM=" + str(CMarray)
Code Example #2
def printMetrics(predictions_and_labels, output_file):
   metrics = MulticlassMetrics(predictions_and_labels)
   output_file.write('Precision of True '+str(metrics.precision(1))+'\n')
   output_file.write('Precision of False '+str(metrics.precision(0))+'\n')
   output_file.write('Recall of True  '+str(metrics.recall(1))+'\n')
   output_file.write('Recall of False   '+str(metrics.recall(0))+'\n')
   output_file.write('F-1 Score         '+str(metrics.fMeasure())+'\n')
   output_file.write('Confusion Matrix\n'+str(metrics.confusionMatrix().toArray())+'\n')

   print('Precision of True '+str(metrics.precision(1)))
   print('Precision of False '+str(metrics.precision(0)))
   print('Recall of True  '+str(metrics.recall(1)))
   print('Recall of False   '+str(metrics.recall(0)))
   print('F-1 Score         '+str(metrics.fMeasure()))
   print('Confusion Matrix\n'+str(metrics.confusionMatrix().toArray()))
Code Example #3
    def set_metrics(self, evaluator, data, objective_column):

        start = time.time()
        if evaluator is not None and data is not None:
                self['AUC'] = evaluator.evaluate(data,  {evaluator.metricName: "areaUnderROC"})
                self['AUPR'] = evaluator.evaluate(data, {evaluator.metricName: "areaUnderPR"})
                self['nobs'] = data.count()
                self['model_category'] = 'Binomial'
                self['max_criteria_and_metric_scores'] = None
                self['RMSE'] = 10e+308

                #Generating ConfusionMatrix
                tp = data.select("prediction", data[objective_column].cast('float'))\
                    .toDF("prediction", objective_column).rdd.map(tuple)
                metrics = MulticlassMetrics(tp)
                pdf = DataFrame(data=np.array(metrics.confusionMatrix().values).reshape((2, 2)),
                                columns=['0', '1'])
                pdf['total'] = pdf.sum(axis=1)
                index = pdf.index.tolist()
                index.append('total')
                pdf = pdf.append(pdf.sum(axis=0), ignore_index=True)
                pdf.index = index
                self['cm'] = json.loads(pdf.to_json(orient='split'), object_pairs_hook=OrderedDict)

                self['scoring_time'] = int(time.time() - start)
Code Example #4
File: e.py Project: UmeLan/Allstate-Prediction
def printMetrics(pred_and_label):
    metrics = MulticlassMetrics(pred_and_label)
    print('Precision of 0', metrics.precision(0))
    print('Precision of 1', metrics.precision(1))
    print('Recall of 0', metrics.recall(0))
    print('Recall of 1', metrics.recall(1))
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
Code Example #5
def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions,
                             {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions,
                              {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # Get RDD of predictions and labels for eval metrics
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())
    # F1
    print("F1 = {}".format(multi_metrics.fMeasure(1.0)))
    # Precision
    print("Precision = {}".format(multi_metrics.precision(1.0)))
    # Recall
    print("Recall = {}".format(multi_metrics.recall(1.0)))
    # FPR
    print("FPR = {}".format(multi_metrics.falsePositiveRate(1.0)))
    # TPR
    print("TPR = {}".format(multi_metrics.truePositiveRate(1.0)))
Code Example #6
def printMeasurementMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision Result of setosa: ', metrics.precision(1))
    print('Precision Result of versicolor:', metrics.precision(2))
    print('Precision Result of virginica:', metrics.precision(3))
    print('F-1 Score:         ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
Code Example #7
def custom_evaluation(pred, model_name):
    '''
    Perform custom evaluation of predictions.

    1. Compute PR AUC with the PySpark ML evaluator that the pipeline will use
    2. Use the RDD-based PySpark MLlib API to get metrics from the predictions
    3. Display the confusion matrix

    Inputs:
        pred (pyspark.sql.DataFrame): predictions from the model
        model_name (str): model name used in the printed output
    '''
    pr = BinaryClassificationEvaluator(metricName='areaUnderPR')
    pr_auc = pr.evaluate(pred)
    print(f"{model_name} -> PR AUC: {pr_auc}")
    predictionRDD = pred.select(['label', 'prediction'
                                 ]).rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    print(f"{model_name}\n | precision = {metrics.precision()}")
    print(
        f" | recall = {metrics.recall()}\n | F1-Score = {metrics.fMeasure()}")

    conf_matrix = metrics.confusionMatrix().toArray()
    sns.set(font_scale=1.4)  #for label size
    ax = sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 16})
    ax.set(xlabel='Predicted Label',
           ylabel='True Label',
           title='Confusion Mtx')
    plt.show()
Code Example #8
def evaluate(model, word_column="words", vectorizer="w2v"):
    doc2vecs_df = featurize(word_column, vectorizer)
    if type(model) == LinearSVC:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.regParam, [0.1]) \
            .build()
    elif type(model) == GBTClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxIter, [50]) \
            .build()
    elif type(model) == RandomForestClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxBins, [100]) \
            .build()
    elif type(model) == MultilayerPerceptronClassifier:
        paramGrid = ParamGridBuilder() \
             .addGrid(model.layers, [[122, 50, 2]]) \
             .build()
        # .addGrid(model.layers, [[120, 2], [120, 50, 2], [120, 75, 50, 2]]) \
    elif type(model) == FMClassifier:
        paramGrid = ParamGridBuilder() \
            .addGrid(model.stepSize, [.01, .001]) \
            .build()
    print('Evaluating...')
    w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2])
    si = StringIndexer(inputCol="LABEL", outputCol="label")
    model_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="f1")
    classifier_pipeline = Pipeline(stages=[si, model])
    crossval = CrossValidator(estimator=classifier_pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=model_evaluator,
                              numFolds=5)
    fit_model = crossval.fit(doc2vecs_df)
    predictions = fit_model.transform(w2v_test_df)
    # predictions.toPandas().to_csv('predictions.csv')
    # predictions.groupBy('prediction', 'label', 'PRODUCT_CATEGORY')
    # predictions.describe()
    summarizer = Summarizer.metrics("mean", "count")
    predictions.select(
        summarizer.summary(predictions.filter(
            predictions.label == 1).pos)).show(truncate=False)
    preds_and_labels = predictions.select(['prediction', 'label'])
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))
    print('Confusion Matrix')
    print(metrics.confusionMatrix().toArray())
    # Overall statistics
    precision = metrics.precision(1.0)
    recall = metrics.recall(1.0)
    f1Score = metrics.fMeasure(1.0)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    accuracy = model_evaluator.evaluate(predictions)
    trainingSummary = fit_model.bestModel.stages[-1].extractParamMap()
    print(trainingSummary)

    return accuracy
Code Example #9
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('Precision of True ', metrics.precision(1))
    print('Precision of False', metrics.precision(0))
    print('Recall of True    ', metrics.recall(1))
    print('Recall of False   ', metrics.recall(0))
    print('F-1 Score         ', metrics.fMeasure())
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())
Code Example #10
File: test_mllib.py Project: Great1414/pyspark_learn
def printMetrics(result):
    metrics = MulticlassMetrics(result)
    print("\nPrecision of True\n", metrics.precision(1))
    print("\nPrecision of False\n", metrics.precision(0))
    print("\nRecall of True\n", metrics.recall(1))
    print("\nRecall of False\n", metrics.recall(0))
    print("\nF1 score\n", metrics.fMeasure())
    print("\nConfusion Matrix\n", metrics.confusionMatrix().toArray())
Code Example #11
def evaluator(df):
    biclass = BinaryClassificationEvaluator()
    bieval = biclass.evaluate(df)
    predictionAndLabels = df.rdd.map(
        lambda row: (float(row['prediction']), float(row['label'])))
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()
    return bieval, confusion_matrix
Code Example #12
def evaluate_predictions(predictions, show=True):
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    from pyspark.mllib.evaluation import BinaryClassificationMetrics, MulticlassMetrics
    log = {}

    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['auroc'] = evaluator.evaluate(predictions)

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['aupr'] = evaluator.evaluate(predictions)

    # Metrics
    predictionRDD = predictions.select(
        ['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Overall statistics
    log['precision'] = metrics.precision()
    log['recall'] = metrics.recall()
    log['F1 Measure'] = metrics.fMeasure()

    # Statistics by class
    distinctPredictions = collect_tuple(
        predictions.select('prediction').distinct())
    for x in sorted(distinctPredictions):
        log[x] = {}
        log[x]['precision'] = metrics.precision(x)
        log[x]['recall'] = metrics.recall(x)
        log[x]['F1 Measure'] = metrics.fMeasure(x, beta=1.0)

    # Confusion Matrix
    log['cm'] = metrics.confusionMatrix().toArray()
    log['cmpercent'] = cm_percent(log['cm'], predictions.count(), show)

    if show:
        show_predictions(predictions)

        print('Confusion Matrix')
        print(' TP', 'FN\n', 'FP', 'TN')
        print(log['cm'])
        print(' PC', 'FN\n', 'FP', 'PW')
        print(log['cmpercent'])
        print('')
        print("Area under ROC = {}".format(log['auroc']))
        print("Area under AUPR = {}".format(log['aupr']))
        print('\nOverall\ntprecision = {}\nrecall = {}\nF1 Measure = {}\n'.
              format(log['precision'], log['recall'], log['F1 Measure']))

        for x in sorted(distinctPredictions):
            print('Label {}\nprecision = {}\nrecall = {}\nF1 Measure = {}\n'.
                  format(x, log[x]['precision'], log[x]['recall'],
                         log[x]['F1 Measure']))

    return log
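The helpers collect_tuple and cm_percent used above are not part of this snippet. A plausible sketch, assuming collect_tuple flattens a single-column DataFrame into plain Python values and cm_percent expresses the confusion matrix as percentages of all predictions:

import numpy as np

def collect_tuple(df):
    # Hypothetical helper (not from the original project):
    # flatten a single-column DataFrame into a tuple of Python values.
    return tuple(row[0] for row in df.collect())

def cm_percent(cm, total, show=True):
    # Hypothetical helper: confusion matrix as percentages of the total
    # number of predictions, rounded to two decimals.
    return np.around(cm / float(total) * 100.0, 2)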
Code Example #13
def amazon_classification(sc, filename):
    '''
    Args:
        sc: The Spark Context
        filename: Filename of the Amazon reviews file to use, where each line represents a review
    '''
    # Load in reviews
    reviews = sc.textFile(filename).sample(False, 0.001)

    # Parse to csv
    csv_loads = reviews.map(loadcsv)

    # Keep only the lines that parsed successfully, then tokenize the review text
    labeled_data = (csv_loads.filter(lambda x: x is not None).mapValues(lambda x: x.split()))

    labels = labeled_data.keys()

    tf = HashingTF().transform(labeled_data.map(lambda x:x[1]))
    idf = IDF(minDocFreq=7).fit(tf)
    tfidf = idf.transform(tf)
    labeled_points = (labels.zip(tfidf)
                         .map(lambda x: LabeledPoint(float(x[0]), x[1])))

    training, test = labeled_points.randomSplit([0.6, 0.4])

    model = NaiveBayes.train(training)

    # Use our model to predict
    train_preds = (training.map(lambda x: x.label)
                           .zip(model.predict(training.map(lambda x: x.features))))
    test_preds = (test.map(lambda x: x.label)
                      .zip(model.predict(test.map(lambda x: x.features))))

    # Ask PySpark for some metrics on how our model predictions performed
    trained_metrics = MulticlassMetrics(train_preds.map(lambda x: (x[0], float(x[1]))))
    test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))
    ojbk = open('./xxx.txt', 'w+')
    ojbk.write(str(trained_metrics.confusionMatrix().toArray()) + '\n')
    ojbk.write(str(trained_metrics.precision()) + '\n')
    ojbk.write(str(test_metrics.confusionMatrix().toArray()) + '\n')
    ojbk.write(str(test_metrics.precision()) + '\n')
    ojbk.close()
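The loadcsv helper is not shown in this snippet. One possible sketch, assuming each review line is a CSV record whose first field is the label and whose second field is the review text (malformed lines map to None and are filtered out above):

import csv
from io import StringIO

def loadcsv(line):
    # Hypothetical parser (not from the original project):
    # return a (label, text) pair, or None if the line cannot be parsed.
    try:
        fields = next(csv.reader(StringIO(line)))
        return (fields[0], fields[1])
    except (StopIteration, IndexError):
        return None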
Code Example #14
def main():

    csvData = spark.sql("select answer,label from training_table")
    dataset = csvData.dropna()

    dataset = dataset.toPandas()
    dataset['answer'] = dataset.answer.apply(lemmaStemma)
    dataset = spark.createDataFrame(dataset)

    train_data, test_data = dataset.randomSplit([0.7, 0.3])

    model = classifier().fit(train_data)
    prData = model.transform(test_data)
    clasDataFrame = prData.toPandas()

    evaluatorRecall = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName="weightedRecall")
    recall = evaluatorRecall.evaluate(prData)
    print("Recall %s" % recall)

    ####Cross Validation####
    paramGrid = ParamGridBuilder().build()
    cv = CrossValidator()\
        .setEstimator(classifier())\
        .setEvaluator(evaluatorRecall)\
        .setEstimatorParamMaps(paramGrid)

    cvModel = cv.fit(train_data)
    cvPredictions = cvModel.transform(test_data)

    predictionsAndLabels = cvPredictions.select("prediction", "label").rdd

    metrics = MulticlassMetrics(predictionsAndLabels)
    metricsAUC = BinaryClassificationMetrics(predictionsAndLabels)
    print("Cross Validated Confusion Matrix\n = %s" %
          metrics.confusionMatrix().toArray())
    print("Cross Validated recall = %s" % metrics.weightedRecall)
    print("Cross Validated Precision = %s" % metrics.weightedPrecision)
    print("Cross Validated fMeasure = %s" % metrics.weightedFMeasure)
    print("Cross Validated Accuracy = %s" % metrics.accuracy)
    print("Cross Validated AUC = %s" % metricsAUC.areaUnderROC)

    cvPred = cvPredictions.select("answer", "label", "probability",
                                  "prediction")

    output = cvPred.rdd.map(extract).toDF(["answer", "label", "prediction"])

    ####Model Save####
    bestModel = cvModel.bestModel
    bestModel.write().overwrite().save(
        "hdfs://nameservice1//source/NPSClassification")
Code Example #15
    def evaluateClassification(self, predictionAndLabels):

        metrics = MulticlassMetrics(predictionAndLabels)
        cm = metrics.confusionMatrix()

        result = {}

        result['Matrix'] = cm.toArray().tolist()
        result['Precision'] = metrics.precision()
        result['Recall'] = metrics.recall()
        result['F1 Score'] = metrics.fMeasure()

        return result
Code Example #16
def get_metrics(df, lower_bound, upper_bound=1.0):
    rdd = df.select("prediction", "Profit").rdd
    metrics = MulticlassMetrics(rdd)
    metrics_dict = {}
    cm = metrics.confusionMatrix().toArray()

    TP = cm[0][0]
    TN = cm[1][1]
    FP = cm[0][1]
    FN = cm[1][0]
    accuracy = (TP + TN) / cm.sum()
    if accuracy < lower_bound or accuracy > upper_bound:
        return None
    sensitivity = (TP) / (TP + FN)
    specificity = (TN) / (TN + FP)
    precision = (TP) / (TP + FP)
    npv = (TN) / (TN + FN)

    # Overall statistics
    metrics_dict['accuracy'] = accuracy

    metrics_dict['sensitivity'] = sensitivity

    metrics_dict['specificity'] = specificity

    metrics_dict['precision'] = precision

    metrics_dict['npv'] = npv

    # print("Summary Stats")
    # print(metrics.confusionMatrix())
    metrics_dict['confusionMatrix'] = metrics.confusionMatrix()

    print(
        "{},{},{},{},{}".format(round(accuracy, 3), round(sensitivity, 3), round(specificity, 3), round(precision, 3),
                                round(npv, 3)))

    return metrics_dict
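A note added for reference (not part of the original snippet): Spark's confusionMatrix() places actual classes in rows and predicted classes in columns, both ordered by ascending label, so the TP/TN naming above implicitly treats label 0.0 (the first row and column) as the positive class. Spelled out for a binary {0.0, 1.0} problem:

cm = metrics.confusionMatrix().toArray()
# rows = actual label, columns = predicted label, labels sorted ascending:
#   cm[0][0] -> actual 0 predicted 0      cm[0][1] -> actual 0 predicted 1
#   cm[1][0] -> actual 1 predicted 0      cm[1][1] -> actual 1 predicted 1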
Code Example #17
def model_dev_lr(df_train, df_test, max_iter, max_depth, fit_intercept, reg_param, elasticnet_param, lr_standardize):
    
    lr_start_time = time()
    
    # Create an Initial Model Instance
    mod_lr = LogisticRegression(labelCol='label',
                                featuresCol='features',
                                aggregationDepth=max_depth,
                                elasticNetParam=elasticnet_param,
                                fitIntercept=fit_intercept,
                                maxIter=max_iter,
                                regParam=reg_param,
                                standardization=lr_standardize)
    
    # Training The Model
    lr_final_model = mod_lr.fit(df_train)
    
    # Scoring The Model On Test Sample
    lr_transformed = lr_final_model.transform(df_test)
    lr_test_results = lr_transformed.select(['prediction', 'label'])
    lr_predictionAndLabels= lr_test_results.rdd
    lr_test_metrics = MulticlassMetrics(lr_predictionAndLabels)
    
    # Collecting The Model Statistics
    lr_cm=lr_test_metrics.confusionMatrix().toArray()
    lr_accuracy=round(float((lr_cm[0][0]+lr_cm[1][1])/lr_cm.sum())*100,2)
    lr_precision=round(float((lr_cm[0][0])/(lr_cm[0][0]+lr_cm[1][0]))*100,2)
    lr_recall=round(float((lr_cm[0][0])/(lr_cm[0][0]+lr_cm[0][1]))*100,2)
    lr_auc = round(float(BinaryClassificationMetrics(lr_predictionAndLabels).areaUnderROC)*100,2)
    
    # Printing The Model Statistics
    print("\n++++++ Printing Logistic Regression Model Accuracy ++++++\n")
    print("Accuracy: "+str(lr_accuracy)+"%")
    print("AUC: "+str(lr_auc)+"%")
    print("Precision: "+str(lr_precision)+"%")
    print("Recall: "+str(lr_recall)+"%")
    
    lr_end_time = time()
    lr_elapsed_time = (lr_end_time - lr_start_time)/60
    lr_model_stat = pd.DataFrame({"Model Name" : ["Logistic Regression"],
                                  "Accuracy" : lr_accuracy,
                                  "AUC": lr_auc, 
                                  "Precision": lr_precision,
                                  "Recall": lr_recall, 
                                  "Time (Min.)": round(lr_elapsed_time,3)})
    lr_output = (lr_final_model,lr_model_stat,lr_cm)
    print("Time To Build Logistic Regression Model: %.3f Minutes" % lr_elapsed_time)
    
    return (lr_output)
Code Example #18
def model_dev_svm(df_train, df_test, max_depth, fit_intercept, max_iter, reg_param, svm_standardize):
    
    svm_start_time = time()
    
    # Create an Initial Model Instance
    mod_svm = LinearSVC(labelCol='label',
                        featuresCol='features',
                        aggregationDepth=max_depth,
                        fitIntercept=fit_intercept,
                        maxIter=max_iter,
                        regParam=reg_param,
                        standardization=svm_standardize)
    
    # Training The Model
    svm_final_model = mod_svm.fit(df_train)
    
    # Scoring The Model On Test Sample
    svm_transformed = svm_final_model.transform(df_test)
    svm_test_results = svm_transformed.select(['prediction', 'label'])
    svm_predictionAndLabels= svm_test_results.rdd
    svm_test_metrics = MulticlassMetrics(svm_predictionAndLabels)
    
    # Collecting The Model Statistics
    svm_cm=svm_test_metrics.confusionMatrix().toArray()
    svm_accuracy=round(float((svm_cm[0][0]+svm_cm[1][1])/svm_cm.sum())*100,2)
    svm_precision=round(float((svm_cm[0][0])/(svm_cm[0][0]+svm_cm[1][0]))*100,2)
    svm_recall=round(float((svm_cm[0][0])/(svm_cm[0][0]+svm_cm[0][1]))*100,2)
    svm_auc = round(float(BinaryClassificationMetrics(svm_predictionAndLabels).areaUnderROC)*100,2)

    # Printing The Model Statistics
    print("\n++++++ Printing SVM Model Accuracy ++++++\n")
    print("Accuracy: "+str(svm_accuracy)+"%")
    print("AUC: "+str(svm_auc)+"%")
    print("Precision: "+str(svm_precision)+"%")
    print("Recall: "+str(svm_recall)+"%")

    svm_end_time = time()
    svm_elapsed_time = (svm_end_time - svm_start_time)/60
    svm_model_stat = pd.DataFrame({"Model Name" : ["Support Vector Machine"],
                                  "Accuracy" : svm_accuracy,
                                  "AUC": svm_auc, 
                                  "Precision": svm_precision,
                                  "Recall": svm_recall, 
                                  "Time (Min.)": round(svm_elapsed_time,3)})
    svm_output = (svm_final_model,svm_model_stat,svm_cm)
    print("Time To Build SVM Model: %.3f Minutes" % svm_elapsed_time)
    
    return(svm_output)
Code Example #19
def printStatistics(labelsAndPredictions, data):
    metrics = MulticlassMetrics(labelsAndPredictions)
    labels = data.map(lambda lp: lp.label).distinct().collect()
    print("confusion metrics:")
    cm = metrics.confusionMatrix()
    print(cm)
    print('')
    print('accuracy: ' + str(metrics.accuracy))
    for label in labels:
        print('label: ' + str(label))
        print('fp: ' + str(metrics.falsePositiveRate(label)))
        print('tp: ' + str(metrics.truePositiveRate(label)))
    recall = metrics.recall()
    precision = metrics.precision()
    print("Recall = %s" % recall)
    print("Precision = %s" % precision)
Code Example #20
def performance(predictions):
    predictionRDD = predictions.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
        
    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)
    
    results = {'predictions':predictions,
               'areaUnderROC':binmetrics.areaUnderROC,
               'areaUnderPR':binmetrics.areaUnderPR,
               'confusionMatrix':metrics.confusionMatrix().toArray(),
               'accuracy':metrics.accuracy,
               'precision':metrics.precision(),
               'recall':metrics.recall(),
               'f1measure':metrics.fMeasure()}
    
    return results
Code Example #21
 def performancerdd(self):
     self.calculator = 'RDDs'
     print('Calculating performance metrics using RDDs...')
     predictionRDD = self.predictions.select(['label','prediction']).rdd.map(lambda line: (line[1],line[0]))
     
     binmetrics = BinaryClassificationMetrics(predictionRDD)
     metrics = MulticlassMetrics(predictionRDD)
     
     self.areaUnderROC = binmetrics.areaUnderROC
     self.areaUnderPR = binmetrics.areaUnderPR
     self.confusionMatrix = metrics.confusionMatrix().toArray()
     self.accuracy = metrics.accuracy
     self.precision = metrics.precision()
     self.recall = metrics.recall()
     self.f1measure = metrics.fMeasure()
     self.falsePositive = metrics.falsePositiveRate(1.0)
     # For binary labels, the false-positive rate of class 0.0 equals the
     # false-negative rate of class 1.0, which is what is stored here.
     self.falseNegative = metrics.falsePositiveRate(0.0)
Code Example #22
def model_dev_gbm(df_train, df_test, max_depth, max_bins, max_iter):
    
    gbm_start_time = time()
    
    # Create an Initial Model Instance
    mod_gbm= GBTClassifier(labelCol='label',
                           featuresCol='features',
                           maxDepth=max_depth,
                           maxBins=max_bins,
                           maxIter=max_iter)
    
    # Training The Model
    gbm_final_model = mod_gbm.fit(df_train)
    
    # Scoring The Model On Test Sample
    gbm_transformed = gbm_final_model.transform(df_test)
    gbm_test_results = gbm_transformed.select(['prediction', 'label'])
    gbm_predictionAndLabels= gbm_test_results.rdd
    gbm_test_metrics = MulticlassMetrics(gbm_predictionAndLabels)
    
    # Collecting The Model Statistics
    gbm_cm=gbm_test_metrics.confusionMatrix().toArray()
    gbm_accuracy=round(float((gbm_cm[0][0]+gbm_cm[1][1])/gbm_cm.sum())*100,2)
    gbm_precision=round(float((gbm_cm[0][0])/(gbm_cm[0][0]+gbm_cm[1][0]))*100,2)
    gbm_recall=round(float((gbm_cm[0][0])/(gbm_cm[0][0]+gbm_cm[0][1]))*100,2)
    gbm_auc = round(float(BinaryClassificationMetrics(gbm_predictionAndLabels).areaUnderROC)*100,2)
    
    # Printing The Model Statistics
    print("\n++++++ Printing GBM Model Accuracy ++++++\n")
    print("Accuracy: "+str(gbm_accuracy)+"%")
    print("AUC: "+str(gbm_auc)+"%")
    print("Precision: "+str(gbm_precision)+"%")
    print("Recall: "+str(gbm_recall)+"%")
    gbm_end_time = time()
    
    gbm_elapsed_time = (gbm_end_time - gbm_start_time)/60
    gbm_model_stat = pd.DataFrame({"Model Name" : ["Gradient Boosting Machine"],
                                  "Accuracy" : gbm_accuracy,
                                  "AUC": gbm_auc, 
                                  "Precision": gbm_precision,
                                  "Recall": gbm_recall, 
                                  "Time (Min.)": round(gbm_elapsed_time,3)})
    gbm_output = (gbm_final_model,gbm_model_stat,gbm_cm)
    print("Time To Build GBM Model: %.3f Minutes" % gbm_elapsed_time)
    
    return(gbm_output)
Code Example #23
def model_dev_rf(df_train, df_test, max_depth, max_bins, n_trees):
    
    rf_start_time = time()
    
    # Create an Initial Model Instance
    mod_rf = RandomForestClassifier(labelCol='label',
                                    featuresCol='features',
                                    maxDepth=max_depth,
                                    maxBins=max_bins,
                                    numTrees=n_trees)
    
    # Training The Model
    rf_final_model = mod_rf.fit(df_train)
    
    # Scoring The Model On Test Sample
    rf_transformed = rf_final_model.transform(df_test)
    rf_test_results = rf_transformed.select(['prediction', 'label'])
    rf_predictionAndLabels = rf_test_results.rdd
    rf_test_metrics = MulticlassMetrics(rf_predictionAndLabels)
    
    # Collecting The Model Statistics
    rf_cm=rf_test_metrics.confusionMatrix().toArray()
    rf_accuracy=round(float((rf_cm[0][0]+rf_cm[1][1])/rf_cm.sum())*100,2)
    rf_precision=round(float((rf_cm[0][0])/(rf_cm[0][0]+rf_cm[1][0]))*100,2)
    rf_recall=round(float((rf_cm[0][0])/(rf_cm[0][0]+rf_cm[0][1]))*100,2)
    rf_auc = round(float(BinaryClassificationMetrics(rf_predictionAndLabels).areaUnderROC)*100,2)
    
    # Printing The Model Statistics
    print("\n++++++ Printing Random Forest Model Accuracy ++++++\n")
    print("Accuracy: "+str(rf_accuracy)+"%")
    print("AUC: "+str(rf_auc)+"%")
    print("Precision: "+str(rf_precision)+"%")
    print("Recall: "+str(rf_recall)+"%")
    
    rf_end_time = time()
    rf_elapsed_time = (rf_end_time - rf_start_time)/60
    rf_model_stat = pd.DataFrame({"Model Name" : ["Random Forest"],
                              "Accuracy" : rf_accuracy,
                              "AUC": rf_auc, 
                              "Precision": rf_precision,
                              "Recall": rf_recall, 
                              "Time (Min.)": round(rf_elapsed_time,3)})
    rf_output = (rf_final_model,rf_model_stat,rf_cm)
    print("Time To Build Random Forest Model: %.3f Minutes" % rf_elapsed_time)
    
    return (rf_output)
Code Example #24
File: consumer.py Project: suribe06/Big_Data
def predictions(train, test):
    # Apply the GBT technique
    GPT = GBTClassifier(featuresCol="Atributos", labelCol="Income", maxBins=41)
    GPT = GPT.fit(train)
    predictions = GPT.transform(test)
    results = predictions.select("Income", "prediction")
    predictionAndLabels = results.rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    cm = metrics.confusionMatrix().toArray()
    # Compute the metrics
    accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
    precision = cm[0][0] / (cm[0][0] + cm[1][0])
    recall = cm[0][0] / (cm[0][0] + cm[0][1])
    f1 = 2 * ((precision * recall) / (precision + recall))
    print("Metricas del modelo GBT Classifier")
    print("accuracy = {0}, precision = {1}, recall = {2}, f1 = {3}".format(
        accuracy, precision, recall, f1))
    return
Code Example #25
File: machine_learning.py Project: damnqse05820/t
 def predict(self):
     #print self.predictingData.show()
     predictions = self.model.transform(self.predictingData)
     #print predictions.show()
     #df= predictions.select('prediction').collect()
     #return df[0].asDict()["prediction"]
     predictions.select("URL", "prediction", "indexedLabel",
                        "label").show(200)
     predictionAndLabels = predictions.select("prediction",
                                              "indexedLabel").rdd
     metrics = MulticlassMetrics(predictionAndLabels)
     print("TPR: {:.3%} \tFPR: {:.3%}".format(
         metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
     print("TNR: {:.3%} \tFNR: {:.3%}".format(
         metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))
     print("Confusion Matrix:")
     for line in metrics.confusionMatrix().toArray():
         print(line)
Code Example #26
def printFinalResultMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print('\n')
    print('Precision of Setosa ', metrics.precision(1))
    print('Precision of Versicolor', metrics.precision(2))
    print('Precision of Virginica', metrics.precision(3))
    print('\n')
    print('Recall of Setosa    ', metrics.recall(1))
    print('Recall of Versicolor   ', metrics.recall(2))
    print('Recall of Virginica   ', metrics.recall(3))

    print('\n')
    print('F-1 Score         ', metrics.fMeasure())
    print('\n\n')
    print('Confusion Matrix\n', metrics.confusionMatrix().toArray())

    print('\n\n')
    return
Code Example #27
File: models.py Project: lidiasm/CC2-Spark
def evaluate_model(predictions, file):
    """Evaluates a model by using its predictions in order to get
        the area under the ROC curve, accuracy, Kappa coefficient and the values of
        the confusion matrix. All of this data is stored in a CSV file."""
    # ROC
    evaluator = BinaryClassificationEvaluator()
    roc = round(evaluator.evaluate(predictions) * 100, 3)

    # Confusion matrix
    """Creates (prediction, label) pairs in order to use MulticlassMetrics"""
    predictionAndLabel = predictions.select("prediction", "label").rdd
    # Generate confusion matrix
    metrics = MulticlassMetrics(predictionAndLabel)
    cnf_matrix = metrics.confusionMatrix()
    cnf_matrix_list = cnf_matrix.toArray().tolist()
    tn = int(cnf_matrix_list[0][0])
    fn = int(cnf_matrix_list[1][0])
    fp = int(cnf_matrix_list[0][1])
    tp = int(cnf_matrix_list[1][1])
    total = tn + fn + fp + tp

    # Kappa Coefficient
    prob_observed = float(tp + tn) / total
    prob_expected = float(((tn + fp) * (tn + fn)) +
                          ((fn + tp) * (fp + tp))) / (total * total)
    kappa = (float(prob_observed - prob_expected) / (1 - prob_expected))
    kappa = round(kappa * 100, 3)

    # Accuracy
    accuracy = round(metrics.accuracy * 100, 3)
    """Store the results as a dataframe in a csv file"""
    results = [(str(roc), str(accuracy), str(kappa), str(tn), str(fn), str(fp),
                str(tp))]
    schema = StructType([
        StructField('ROC', StringType(), False),
        StructField('Accuracy', StringType(), False),
        StructField('Kappa', StringType(), False),
        StructField('TN', StringType(), False),
        StructField('FN', StringType(), False),
        StructField('FP', StringType(), False),
        StructField('TP', StringType(), False)
    ])
    results_df = ss.createDataFrame(results, schema)
    results_df.write.csv(file, header=True, mode="overwrite")
Code Example #28
File: evaluation.py Project: FoundryAI/handyspark
def confusionMatrix(self, threshold=0.5):
    """Returns confusion matrix: predicted classes are in columns,
    they are ordered by class label ascending, as in "labels".

    Predicted classes are computed according to the given threshold.

    Parameters
    ----------
    threshold: double, optional
        Threshold probability for the positive class.
        Default is 0.5.

    Returns
    -------
    confusionMatrix: DenseMatrix
    """
    scoreAndLabels = self.call2('scoreAndLabels').map(lambda t: (float(t[0] > threshold), t[1]))
    mcm = MulticlassMetrics(scoreAndLabels)
    return mcm.confusionMatrix()
Code Example #29
File: main.py Project: GuruTeja/iHear-Server
def modelStatistics(labelsAndPredictions):
    metrics = MulticlassMetrics(labelsAndPredictions)
    print(metrics.confusionMatrix())

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
Code Example #30
def evaluate(predictionAndLabels):
    log = {}

    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['AUROC'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['AUPR'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predictionAndLabels.select(['label', 'prediction']) \
                            .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" % \
            (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, beta=1.0)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s" \
                  % (label, log[label]['precision'],
                    log[label]['recall'], log[label]['F1 Measure']))

    return log
Code Example #31
def eval_model(test_preds, model):
    """
    Evaluate the ml model given the predictions and test data

    Args:
        test_preds - a list of transformed prediction data
        model - the ml pipelined model
    Returns:
        None. Prints a confusion matrix, along with the precision, recall and F1 score of the currently trained model.
    """
    metrics = MulticlassMetrics(test_preds.select("prediction", "label").rdd)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Confusion matrix")
    print(metrics.confusionMatrix())
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
Code Example #32
File: Json.py Project: honeycombcmu/SparkService
def generateJson(AlgorithmName, taskid, traindata, predictionAndLabels):
	jsonContent = dict()
	jsonContent['AlgorithmName'] = AlgorithmName
	jsonContent['TaskId'] = taskid

	labels = traindata.map(lambda lp: lp.label).distinct().collect()
	jsonContent['LabelNum'] = len(labels)

	metrics = MulticlassMetrics(predictionAndLabels)
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	confusion_matrix = metrics.confusionMatrix().toArray()

	jsonContent['Precision'] = precision
	jsonContent['Recall'] = recall
	jsonContent['F1Score'] = f1Score
	jsonContent['ConfusionMatrix'] = confusion_matrix.tolist()

	jsonContent['Labels'] = list()
	for label in sorted(labels):
		tempList = dict()
		tempList['Precision'] = metrics.precision(label)
		tempList['Recall'] = metrics.recall(label)
		tempList['F1Measure'] = metrics.fMeasure(label, beta=1.0)

		jsonContent['Labels'].append(tempList)
	
	jsonContent['WeightedStats'] = dict()
	jsonContent['WeightedStats']['Precision'] = metrics.weightedPrecision
	jsonContent['WeightedStats']['F1Score'] = metrics.weightedFMeasure()
	jsonContent['WeightedStats']['FalsePositiveRate'] = metrics.weightedFalsePositiveRate

	with open(taskid + '.json', 'w') as jsonFile:
		json.dump(jsonContent, jsonFile, indent=4, separators=(',', ': '))
		jsonFile.flush()
Code Example #33
pendtvalid = pendtsets[1].cache()

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)

# rootNode is not accessible in Python

dtpredicts = dtmodel.transform(pendtvalid)
dtresrdd = dtpredicts.select("prediction", "label").rdd.map(lambda row: (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205.,    0.,    3.,    0.,    0.,    3.,    1.,    0.,    0.,
#                 0.],
#             [   0.,  213.,    0.,    1.,    2.,    1.,    0.,    2.,    0.,
#                 2.],
#             [   0.,    0.,  208.,    0.,    0.,    2.,    0.,    1.,    1.,
#                 0.],
#             [   0.,    1.,    0.,  172.,    3.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   2.,    2.,    1.,    8.,  197.,    0.,    0.,    2.,    3.,
#                 1.],
#             [   1.,    0.,    1.,    0.,    2.,  183.,    0.,    1.,    0.,
#                 1.],
#             [   1.,    0.,    0.,    0.,    0.,    0.,  192.,    1.,    1.,
#                 0.],
#             [   0.,    0.,    0.,    0.,    0.,    0.,    1.,  187.,    5.,
Code Example #34
File: NaiveBayes.py Project: Inscrutive/spark
# MAGIC %md
# MAGIC We can also generate a confusion matrix to better inspect the prediction results. MulticlassMetrics works only with RDDs, so we will have to convert our DataFrame of (prediction, label) pairs into an RDD.
# MAGIC 
# MAGIC confusionMatrix() returns a DenseMatrix whose columns represent the predicted classes and whose rows represent the actual classes, both ordered by ascending class label. The diagonal from top left to bottom right represents the observations that were predicted correctly. 
# MAGIC 
# MAGIC From the above confusion matrix, we observe that all Setosas (class 0) and Versicolors (class 1) have been classified correctly, but there are 10 Virginicas (class 2) that have been wrongly classified as Versicolors.

# COMMAND ----------

from pyspark.mllib.evaluation import MulticlassMetrics
# Create (prediction, label) pairs
predictionAndLabel = predictions.select("prediction", "label").rdd

# Generate confusion matrix
metrics = MulticlassMetrics(predictionAndLabel)
print(metrics.confusionMatrix())


# COMMAND ----------

# MAGIC %md
# MAGIC ####Experimenting with Various Smoothing Parameters
# MAGIC 
# MAGIC We can experiment with various smoothing parameters to see which returns the best result. This is easily done with the ParamGridBuilder and CrossValidator.
# MAGIC 
# MAGIC As we indicate 6 values for the smoothing parameter, this grid will provide 6 parameter settings for CrossValidator to model, evaluate and choose from.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
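The grid over smoothing values described above is not shown in this excerpt. A hedged sketch of what it might look like; the six smoothing values, the NaiveBayes estimator variable, the evaluator choice, and the trainingData name are assumptions:

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

nb = NaiveBayes(labelCol="label", featuresCol="features")
# Six candidate smoothing values -> six parameter settings for CrossValidator.
paramGrid = (ParamGridBuilder()
             .addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
             .build())
cv = CrossValidator(estimator=nb,
                    estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
                    numFolds=3)
# cvModel = cv.fit(trainingData)   # trainingData is assumed to exist in the notebook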
Code Example #35
File: evaluate.py Project: stevencox/chemotext
    def train_model (conf):
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}".format (conf.output_dir)

        labeled = Evaluate.load_all (sc, conf). \
                  map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                                 features = [ b.paraDist, b.sentDist, b.docDist ] ) )

#        labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#                  map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                                 [ b, b * 2, b * 9 ] ) )
#        print (labeled.collect ())

        train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

        count = train.count ()
        start = time.time ()
        model = LogisticRegressionWithLBFGS.train (train)
        elapsed = time.time () - start
        print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

        start = time.time ()
        model_path = os.path.join (conf.output_dir, "eval", "model")
        file_path = model_path.replace ("file://", "")
        if os.path.isdir (file_path):
            print ("Removing existing model {0}".format (file_path))
            shutil.rmtree (file_path)
        model.save(sc, model_path)
        sameModel = LogisticRegressionModel.load(sc, model_path)
        elapsed = time.time () - start
        print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))


        # Metrics
        labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
        trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(train.count())
        print("Training Error => {0}".format (trainErr))

        predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
        metrics = MulticlassMetrics (predictionsAndLabels) 
        print (" --------------> {0}".format (predictionsAndLabels.take (1000)))

        #print (labelsAndPreds.collect ())
        print ("\nMETRICS:")
        try:
            print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
            print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("precision          : {0}".format (metrics.precision(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("recall             : {0}".format (metrics.recall(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
        except:
            traceback.print_exc ()

        print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
        print ("precision          : {0}".format (metrics.precision()))
        print ("recall             : {0}".format (metrics.recall()))
        print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
        print ("weighted precision : {0}".format (metrics.weightedPrecision))
        print ("weighted recall    : {0}".format (metrics.weightedRecall))
        print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
        print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
        print ("")

        # Regression metrics
        predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )

        regression_metrics = RegressionMetrics (predictedAndObserved)
        print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
        print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
        print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
        print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
        print ("r2......................: {0}".format (regression_metrics.r2))
        print ("")

        labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
        testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
        print ("Testing Error => {0}".format (testErr))