def runMetrics(labeledDataRDD, *args):
    html='<table width=100%><tr><th>Model</th><th>Accuracy</th><th>Precision</th><th>Recall</th></tr>'
    confusionHtml = '<p>Confusion Tables for each Model</p>'
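    # Note: displayConfusionTable is assumed to be a module-level flag (e.g. displayConfusionTable = True)
    # that toggles rendering of the per-model confusion tables below.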
    for model in args:
        label= model.__class__.__name__
        predictionAndLabels = model.predict(labeledDataRDD.map(lambda lp: lp.features))
        metrics = MulticlassMetrics(\
            predictionAndLabels.zip(labeledDataRDD.map(lambda lp: lp.label)).map(lambda t: (float(t[0]),float(t[1])))\
        )
        html+='<tr><td>{0}</td><td>{1:.2f}%</td><td>{2:.2f}%</td><td>{3:.2f}%</td></tr>'\
            .format(label,metrics.weightedFMeasure(beta=1.0)*100, metrics.weightedPrecision*100,metrics.weightedRecall*100 )

        if ( displayConfusionTable ):
            confusionMatrix = metrics.call("confusionMatrix")
            confusionMatrixArray = confusionMatrix.toArray()
            #labels = metrics.call("labels")
            confusionHtml += "<p>" + label + "<p>"
            confusionHtml += "<table>"
            for row in confusionMatrixArray:
                confusionHtml += "<tr>"
                for cell in row:
                    confusionHtml+="<td>" + str(cell) + "</td>"
                confusionHtml += "</tr>"
            confusionHtml += "</table>"
        
    html+='</table>'
    
    if ( displayConfusionTable ):
        html+=confusionHtml
    
    display(HTML(html))
def evaluate(labelsAndPredictions, data, labels):
    """
    Evaluation Metrics
    """
    # Instantiate metrics object
    metrics = MulticlassMetrics(labelsAndPredictions)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # Statistics by class
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
def evaluate(predictions):
    """
    Evaluation Metrics
    """
    # label to indexedLabel mappings
    # out = sorted(set([(i[0], i[1]) for i in predictions.select(predictions.label, predictions.indexedLabel).collect()]), key=lambda x: x[0])

    print "Predictions"
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and evaluate model
    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    # Statistics by class
    labels = predictions.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    treeModel = model.stages[2]
    print treeModel # summary only
def testClassification(train, test):
    # Train a RandomForest model.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    # Note: Use larger numTrees in practice.

    rf = RandomForestClassifier(labelCol="indexedLabel", numTrees=3, maxDepth=4)

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))

    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())
 def doRender(self, handlerId):
     html='<div class="pd_save"><table width=100%><tr><th>Model</th><th>Accuracy</th><th>Precision</th><th>Recall</th></tr>'
     confusionHtml = '<p>Confusion Tables for each Model</p>'
     for modelName,model in Configuration.getModels():
         label= model.__class__.__name__
         labeledDataRDD, sqlTableName = Configuration.getLabeledData(self.entity)
         predictionAndLabels = model.predict(labeledDataRDD.map(lambda lp: lp.features))
         metrics = MulticlassMetrics(\
             predictionAndLabels.zip(labeledDataRDD.map(lambda lp: lp.label)).map(lambda t: (float(t[0]),float(t[1])))\
         )
         html+='<tr><td>{0}</td><td>{1:.2f}%</td><td>{2:.2f}%</td><td>{3:.2f}%</td></tr>'\
             .format(label,metrics.weightedFMeasure(beta=1.0)*100, metrics.weightedPrecision*100,metrics.weightedRecall*100 )
         displayConfusionTable = True
         if ( displayConfusionTable ):
             #get labels from RDD
             handler=training.getTrainingHandler()
             classLabels = labeledDataRDD.map(lambda t: t.label).distinct().map(lambda l: handler.getClassLabel(l)).collect()
             confusionMatrix = metrics.call("confusionMatrix")
             confusionMatrixArray = confusionMatrix.toArray()
             #labels = metrics.call("labels")
             confusionHtml += "<p>" + label + "<p>"
             confusionHtml += "<table>"
             confusionHtml+="<tr><td></td>"
             for classLabel in classLabels:
                 confusionHtml+="<td>" + str(classLabel) + "</td>"
             confusionHtml+="</tr>"
             
             for i, row in enumerate(confusionMatrixArray):
                 confusionHtml += "<tr>"
                 confusionHtml += "<td>" + classLabels[i] + "</td>"
                 for j, cell in enumerate(row):
                     confusionHtml+="<td style='text-align:center'>" + ("<b>" if (i==j) else "") +  str(cell) + ("</b>" if (i==j) else "") + "</td>"
                 confusionHtml += "</tr>"
             confusionHtml += "</table>"
         
     html+='</table></div>'
     
     if ( displayConfusionTable ):
         html+=confusionHtml
     
     self._addHTML(html)
# Example 6
def printMetrics(pred_and_label):
    metrics = MulticlassMetrics(pred_and_label)
    print 'Precision of 0', metrics.precision(0)
    print 'Precision of 1', metrics.precision(1)
    print 'Precision of 2', metrics.precision(2)
    print 'Precision of 3', metrics.precision(3)
    print 'Recall of 0', metrics.recall(0)
    print 'Recall of 1', metrics.recall(1)
    print 'Recall of 2', metrics.recall(2)
    print 'Recall of 3', metrics.recall(3)
    print 'Confusion Matrix\n', metrics.confusionMatrix().toArray()
def train(model, model_name):
    pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx,model])
    (trainingData, testData) = df.randomSplit([0.7, 0.3], seed = 100)
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)
    evaluator = MulticlassClassificationEvaluator()
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    predictions.show(20)
    path= model_name+".model"
    pipelineFit.write().overwrite().save(path)
    print(model_name+": ",str(accuracy))
    predictionAndLabels = predictions.select("prediction", "label").rdd.map(lambda r : (r[0], r[1]))
    metrics = MulticlassMetrics(predictionAndLabels)

    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()

    print("Summary:")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
# Example 8
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print 'Precision of True ', metrics.precision(1)
    print 'Precision of False', metrics.precision(0)
    print 'Recall of True    ', metrics.recall(1)
    print 'Recall of False   ', metrics.recall(0)
    print 'F-1 Score         ', metrics.fMeasure()
    print 'Confusion Matrix\n', metrics.confusionMatrix().toArray()
# Example 9
def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions,
                             {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions,
                              {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # get rdd of predictions and labels for mllib eval metrics
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())

    ### Question 5.1 Answer ###

    # F1
    print("F1 = {}".format(multi_metrics.weightedFMeasure()))
    # Precision
    print("Precision = {}".format(multi_metrics.weightedPrecision))
    # Recall
    print("Recall = {}".format(multi_metrics.weightedRecall))
    # FPR
    print("FPR = {}".format(multi_metrics.weightedFalsePositiveRate))
    # TPR
    print("TPR = {}".format(multi_metrics.weightedTruePositiveRate))
# Example 10
def get_metrics(df, lower_bound, upper_bound=1.0):
    rdd = df.select("prediction", "Profit").rdd
    metrics = MulticlassMetrics(rdd)
    metrics_dict = {}
    cm = metrics.confusionMatrix().toArray()
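
    # Note: Spark's confusion matrix has actual labels as rows and predictions as columns,
    # so with class 0 treated as positive, cm[0][1] counts false negatives and cm[1][0]
    # counts false positives; the assignments below may be swapped for that convention.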

    TP = cm[0][0]
    TN = cm[1][1]
    FP = cm[0][1]
    FN = cm[1][0]
    accuracy = (TP + TN) / cm.sum()
    if accuracy < lower_bound or accuracy > upper_bound:
        return None
    sensitivity = (TP) / (TP + FN)
    specificity = (TN) / (TN + FP)
    precision = (TP) / (TP + FP)
    npv = (TN) / (TN + FN)

    # Overall statistics
    metrics_dict['accuracy'] = accuracy

    metrics_dict['sensitivity'] = sensitivity

    metrics_dict['specificity'] = specificity

    metrics_dict['precision'] = precision

    metrics_dict['npv'] = npv

    # print("Summary Stats")
    # print(metrics.confusionMatrix())
    metrics_dict['confusionMatrix'] = metrics.confusionMatrix()

    print(
        "{},{},{},{},{}".format(round(accuracy, 3), round(sensitivity, 3), round(specificity, 3), round(precision, 3),
                                round(npv, 3)))

    return metrics_dict
# Example 11
def get_results(train, test):
    results = []
    for i in range(len(train)):
        training = train[i].filter(lambda x: not np.isnan(x.features[0])
                                   ).filter(lambda x: x.features[0] > 0.0)
        testing = test[i].filter(lambda x: not np.isnan(x.features[0])).filter(
            lambda x: x.features[0] > 0.0)
        model = RandomForest.trainClassifier(training,
                                             numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=20,
                                             featureSubsetStrategy="auto",
                                             impurity='gini',
                                             maxDepth=10,
                                             maxBins=32)
        test_preds = (testing.map(lambda x: x.label).zip(
            model.predict(testing.map(lambda x: x.features))))
        test_metrics = MulticlassMetrics(
            test_preds.map(lambda x: (x[0], float(x[1]))))
        answer = str(test_metrics.precision()) + '\n' + str(
            test_metrics.confusionMatrix().toArray()) + '\n'
        results.append(answer)
    return sc.parallelize(results)
# Example 12
def statistics(test_data, prediction_data):
    # Compute raw scores on the test set
    prediction_and_labels = test_data.join(prediction_data, 'test_id').rdd.map(
        lambda x: (float(x.prediction[0]), float(x.test_labels[0])))

    # Instantiate metrics object
    metrics = MulticlassMetrics(prediction_and_labels)

    # Overall statistics
    print("Summary Statistics\n")
    summary_statistics(metrics)

    # Statistics by class
    print("\nClass Summary Statistics\n")
    label_statistics(metrics, labels)
# Example 13
 def evaluate_model_simple(self, test):
     '''
     Compute summary classification metrics for the fitted model
     --------
     Parameters:
     test: spark.df post vectorization
     --------
     Returns:
     dict - area under the PR and ROC curves plus the true/false positive rates
     '''
     score_model = {}
     predictionAndLabels = test.rdd.map(
         lambda lp: (float(self.model.predict(lp.features)), lp.label))
     # Instantiate metrics object
     metrics = BinaryClassificationMetrics(predictionAndLabels)
     metrics2 = MulticlassMetrics(predictionAndLabels)
     # Area under precision-recall curve
     score_model['precision_recall'] = metrics.areaUnderPR
     # Area under ROC curve
     score_model["ROC_area"] = metrics.areaUnderROC
     score_model['tpr'] = metrics2.truePositiveRate(1.0)  # rates for the positive class (1.0)
     score_model['fpr'] = metrics2.falsePositiveRate(1.0)
     return score_model
# Example 14
def print_f1(df, total_columns, classifier='gbt'):
    label_column = total_columns[-1]
    if classifier == 'gbt':
        predictionAndLabels = df.select(
            ['prediction_with_round', total_columns[-1]])
    else:
        predictionAndLabels = df.select(['indexedLabel', "prediction"])
    labels = df.select([label_column]).distinct()
    header = labels.rdd.first()
    labels = labels.rdd.filter(lambda line: line != header)
    header = predictionAndLabels.rdd.first()
    copy_predictionAndLabels = predictionAndLabels.rdd.filter(
        lambda line: line != header)
    copy_predictionAndLabel = copy_predictionAndLabels.map(
        lambda lp: (float(lp[0]), float(lp[1])))
    metrics = MulticlassMetrics(copy_predictionAndLabel)
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
# Example 15
def evaluate(sc, model, labeled_points):
    labeled_points = labeled_points.union(sc.parallelize([]))
    labels = labeled_points.map(lambda lp: lp.label)
    predictions = model.predict(labeled_points.map(lambda lp: lp.features))
    predictions, labels = predictions.union(sc.parallelize([])), labels.union(
        sc.parallelize([]))
    predictionAndLabels = predictions.zip(labels)

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    cm = metrics.confusionMatrix().toArray()
    recall = cm[1][1] / (cm[1][0] + cm[1][1])
    precision = cm[1][1] / (cm[0][1] + cm[1][1])
    f1Score = 2. * (precision * recall) / (precision + recall)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Confusion Matrix = ")
    print("    0        1")
    print("0   {0}      {1}".format(cm[0][0], cm[1][0]))
    print("1   {0}      {1}".format(cm[0][1], cm[1][1]))
def evaluate(predictionAndLabels):
    log = {}

    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['AUROC'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['AUPR'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predictionAndLabels.select(['label', 'prediction']) \
                            .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" % \
            (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, beta=0.5)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s" \
                  % (label, log[label]['precision'],
                    log[label]['recall'], log[label]['F1 Measure']))

    return log
def validate_tffm(spark, sc, model, test_df, s3_metrics_path, s3_endpoint_path):
    # get predictions
    validation_df = model.transform(test_df)
    
    metricsSchema = StructType() \
        .add("metric", StringType()) \
        .add("value", DoubleType())
    metrics_names = []

    # apply threshold
    def thresholdScore(x):
        retval = 0.0
        if x > 0.5:
            retval = 1.0
        return retval
    
    thresholdScoreUdf = F.UserDefinedFunction(thresholdScore, T.FloatType())
    
    validation_df_round = validation_df.withColumn('rscore', thresholdScoreUdf(validation_df.score)) 
    predTffm = validation_df_round.select(['label','rscore'])

    predictionAndLabelsTffm = predTffm.rdd.map(lambda lp: (lp.rscore, lp.label))
    metricsTffm = BinaryClassificationMetrics(predictionAndLabelsTffm)
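    # Note: rscore has already been thresholded to 0/1, so areaUnderPR/areaUnderROC below
    # reflect a single operating point rather than a curve over raw scores.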

    metrics_names.append(("Area_under_PR",metricsTffm.areaUnderPR))
    metrics_names.append(("Area_under_ROC",metricsTffm.areaUnderROC))

    mmetricsTffm = MulticlassMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Precision",mmetricsTffm.precision()))
    metrics_names.append(("Recall",mmetricsTffm.recall()))
    metrics_names.append(("F1",mmetricsTffm.fMeasure()))
    metrics_names.append(("Weighted_recall",mmetricsTffm.weightedRecall))
    metrics_names.append(("Weighted_precision",mmetricsTffm.weightedPrecision))
    metrics_names.append(("Weighted_F1",mmetricsTffm.weightedFMeasure()))
    metrics_names.append(("Weighted_F05",mmetricsTffm.weightedFMeasure(beta=0.5)))
    metrics_names.append(("Weighted_FP_rate",mmetricsTffm.weightedFalsePositiveRate))

    mRdd = sc.parallelize(metrics_names).coalesce(1)
    dfMetrics = spark.createDataFrame(mRdd, metricsSchema)
    dfMetrics.write.csv("{0}/{1}".format(s3_metrics_path, model.endpointName), mode="overwrite")

    endpointSchema = StructType() \
        .add("time", StringType()) \
        .add("endpoint", StringType())
    endpoint_name = []
    endpoint_name.append((str(time.time()),str(model.endpointName)))
    eRdd = sc.parallelize(endpoint_name).coalesce(1)
    dfEndpoint = spark.createDataFrame(eRdd, endpointSchema)
    dfEndpoint.write.csv("{0}/endpoint.txt".format(s3_endpoint_path), mode="overwrite")
# Example 18
def test_spark_ml_model_classification(spark_context, classification_model,
                                       mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate Spark model by evaluating the underlying model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    # since prediction in a multiclass classification problem is a vector, we need to compute argmax
    # the casting to a double is just necessary for using MulticlassMetrics
    pnl = pnl.select(
        'label',
        argmax('prediction').astype(DoubleType()).alias('prediction'))
    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
def printStatistics(labelsAndPredictions, data):
    metrics = MulticlassMetrics(labelsAndPredictions)
    labels = data.map(lambda lp: lp.label).distinct().collect()
    print("confusion metrics:")
    cm = metrics.confusionMatrix()
    print(cm)
    print('')
    print('accuracy: ' + str(metrics.accuracy))
    for label in labels:
        print('label: ' + str(label))
        print('fp: ' + str(metrics.falsePositiveRate(label)))
        print('tp: ' + str(metrics.truePositiveRate(label)))
    recall = metrics.recall()
    precision = metrics.precision()
    print("Recall = %s" % recall)
    print("Precision = %s" % precision)
# Example 20
    def evaluateClassification(self, predictionAndLabels):

        metrics = MulticlassMetrics(predictionAndLabels)
        cm = metrics.confusionMatrix()

        result = {}

        result['Matrix'] = cm.toArray().tolist()
        result['Precision'] = metrics.precision()
        result['Recall'] = metrics.recall()
        result['F1 Score'] = metrics.fMeasure()

        return result
# Example 21
 def performancerdd(self):
     self.calculator = 'RDDs'
     print('Calculating performance metrics using RDDs...')
     predictionRDD = self.predictions.select(['label','prediction']).rdd.map(lambda line: (line[1],line[0]))
     
     binmetrics = BinaryClassificationMetrics(predictionRDD)
     metrics = MulticlassMetrics(predictionRDD)
     
     self.areaUnderROC = binmetrics.areaUnderROC
     self.areaUnderPR = binmetrics.areaUnderPR
     self.confusionMatrix = metrics.confusionMatrix().toArray()
     self.accuracy = metrics.accuracy
     self.precision = metrics.precision()
     self.recall = metrics.recall()
     self.f1measure = metrics.fMeasure()
     self.falsePositive = metrics.falsePositiveRate(1.0)
     self.falseNegative = metrics.falsePositiveRate(0.0)
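     # Note: in this binary setting, falsePositiveRate(0.0) is the share of true 1.0 examples
     # predicted as 0.0, i.e. the false negative rate of the positive class.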
# Example 22
def generateJson(AlgorithmName, taskid, traindata, predictionAndLabels):
	jsonContent = dict()
	jsonContent['AlgorithmName'] = AlgorithmName
	jsonContent['TaskId'] = taskid

	labels = traindata.map(lambda lp: lp.label).distinct().collect()
	jsonContent['LabelNum'] = len(labels)

	metrics = MulticlassMetrics(predictionAndLabels)
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	confusion_matrix = metrics.confusionMatrix().toArray()

	jsonContent['Precision'] = precision
	jsonContent['Recall'] = recall
	jsonContent['F1Score'] = f1Score
	jsonContent['ConfusionMatrix'] = confusion_matrix.tolist()

	jsonContent['Labels'] = list()
	for label in sorted(labels):
		tempList = dict()
		tempList['Precision'] = metrics.precision(label)
		tempList['Recall'] = metrics.recall(label)
		tempList['F1Measure'] = metrics.fMeasure(label, beta=1.0)

		jsonContent['Labels'].append(tempList)
	
	jsonContent['WeightedStats'] = dict()
	jsonContent['WeightedStats']['Precision'] = metrics.weightedPrecision
	jsonContent['WeightedStats']['Recall'] = metrics.weightedRecall
	jsonContent['WeightedStats']['F1Score'] = metrics.weightedFMeasure()
	jsonContent['WeightedStats']['FalsePositiveRate'] = metrics.weightedFalsePositiveRate

	with open(taskid + '.json', 'w') as jsonFile:
		json.dump(jsonContent, jsonFile, indent=4, separators=(',', ': '))
		jsonFile.flush()
def amazon_classification(sc, filename):
    '''
    Args:
        sc: The Spark Context
        filename: Filename of the Amazon reviews file to use, where each line represents a review
    '''
    # Load in reviews
    reviews = sc.textFile(filename).sample(False, 0.001)

    # Parse to csv
    csv_loads = reviews.map(loadcsv)

    #
    labeled_data = (csv_loads.filter(lambda x: x != None).mapValues(lambda x: x.split()))

    labels = labeled_data.keys()

    tf = HashingTF().transform(labeled_data.map(lambda x:x[1]))
    idf = IDF(minDocFreq=7).fit(tf)
    tfidf = idf.transform(tf)
    labeled_points = (labels.zip(tfidf)
                         .map(lambda x: LabeledPoint(float(x[0]), x[1])))

    training, test = labeled_points.randomSplit([0.6, 0.4])

    model = NaiveBayes.train(training)

    # Use our model to predict
    train_preds = (training.map(lambda x: x.label)
                           .zip(model.predict(training.map(lambda x: x.features))))
    test_preds = (test.map(lambda x: x.label)
                      .zip(model.predict(test.map(lambda x: x.features))))

    # Ask PySpark for some metrics on how our model predictions performed
    trained_metrics = MulticlassMetrics(train_preds.map(lambda x: (x[0], float(x[1]))))
    test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))
    ojbk = open('./xxx.txt','w+')
    ojbk.write(str(trained_metrics.confusionMatrix().toArray()) + '\n')
    ojbk.write(str(trained_metrics.precision()) + '\n')
    ojbk.write(str(test_metrics.confusionMatrix().toArray()) + '\n')
    ojbk.write(str(test_metrics.precision()) + '\n')
    ojbk.close()
# Example 24
 def predict(self):
     #print self.predictingData.show()
     predictions = self.model.transform(self.predictingData)
     #print predictions.show()
     #df= predictions.select('prediction').collect()
     #return df[0].asDict()["prediction"]
     predictions.select("URL", "prediction", "indexedLabel",
                        "label").show(200)
     predictionAndLabels = predictions.select("prediction",
                                              "indexedLabel").rdd
     metrics = MulticlassMetrics(predictionAndLabels)
     print("TPR: {:.3%} \tFPR: {:.3%}".format(
         metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
     print("TNR: {:.3%} \tFNR: {:.3%}".format(
         metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))
     print("Confusion Matrix:")
     for line in metrics.confusionMatrix().toArray():
         print(line)
# Example 25
def modelStatistics(labelsAndPredictions):
    metrics = MulticlassMetrics(labelsAndPredictions)
    print(metrics.confusionMatrix())

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
def pred_precision_kaggle(prediction,NumCluster):
    
    pred_label = prediction.rdd.map(lambda
                                    x: (float(np.argsort(-1*x.probability)[:1]),
                                        float((x.country_destination_indexed))))
    metrics = MulticlassMetrics(pred_label)
    avg_precision = metrics.precision()
    
    for i in range(1,NumCluster):
        pred_label = prediction.rdd.map(lambda
                                        x: (float(np.argsort(-1*x.probability)[i:(i+1)]),
                                            float(x.country_destination_indexed)))
        metrics = MulticlassMetrics(pred_label)
        avg_precision += metrics.precision()
        
    return avg_precision
# Example 27
def performance(predictions):
    predictionRDD = predictions.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
        
    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)
    
    results = {'predictions':predictions,
               'areaUnderROC':binmetrics.areaUnderROC,
               'areaUnderPR':binmetrics.areaUnderPR,
               'confusionMatrix':metrics.confusionMatrix().toArray(),
               'accuracy':metrics.accuracy,
               'precision':metrics.precision(),
               'recall':metrics.recall(),
               'f1measure':metrics.fMeasure()}
    
    return results
# Example 28
def test_functional_model(spark_context, classification_model_functional,
                          mnist_data):
    batch_size = 64
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD()
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model_functional.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl = pnl.select(
        'label',
        argmax('prediction').astype(DoubleType()).alias('prediction'))
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
def eval_model(test_preds, model):
    """
    Evaluate the ml model given the predictions and test data

    Args:
        test_preds - a list of transformed prediction data
        model - the ml pipelined model
    Returns:
    A confusion matrix, along with the precision, recall and F1 score of the currently trained model
    """
    metrics = MulticlassMetrics(test_preds.select("prediction", "label").rdd)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Confusion matrix")
    print(metrics.confusionMatrix())
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    lambda x: x[0] and x[1]).map(lambda x: (float(x[0]), x[1])).mapValues(
        nltk.word_tokenize))
labels = labeled_data.map(lambda x: x[0])

tfidf = produce_tfidf(labeled_data.map(lambda x: x[1]))
zipped_data = (
    labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1])).cache())

# Do a random split so we can test our model on non-trained data
training, test = zipped_data.randomSplit([0.7, 0.3])

# Train our model
model = NaiveBayes.train(training)

# Use our model to predict
train_preds = (training.map(lambda x: x.label).zip(
    model.predict(training.map(lambda x: x.features))))
test_preds = (test.map(lambda x: x.label).zip(
    model.predict(test.map(lambda x: x.features))))

# Ask PySpark for some metrics on how our model predictions performed
trained_metrics = MulticlassMetrics(
    train_preds.map(lambda x: (x[0], float(x[1]))))
test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))

with open('output_discrete.txt', 'w+') as f:
    f.write(str(trained_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(trained_metrics.precision()) + '\n')
    f.write(str(test_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(test_metrics.precision()) + '\n')
    print("Data prepared.\n")

    # create models for one-vs-rest SVM binary classifiers
    print("Preparing models\n")
    models = [model_per_class(i, labelled_training_data) for i in range(1, 7)]
    print("Models prepared.\n")

    # make predictions for testing data
    print("Making predictions.\n")
    predictions = labelled_testing_data.map(
        lambda x: (float(np.argmax([model.predict(x.features) for model in models]) + 1), x.label))
    print("Predictions completed.\n")

    # calculate precision, recall, and f-measure
    print("Calculating evaluation metrics for feature set 1.\n")
    metrics = MulticlassMetrics(predictions)

    print("F-Measure: ", metrics.fMeasure())
    print("Confusion matrix\n\n")
    plot.plot_confusion_matrix(metrics.confusionMatrix().toArray(), "cm1_refactored.png")

    for i in range(1, 7):
        print("Precision for ", i, " is ", metrics.precision(i))
        print("Recall for ", i, " is ", metrics.recall(i))
        print("f-measure for ", i, " is ", metrics.fMeasure(float(i)), "\n")
        precision.append(metrics.precision(i))
        recall.append(metrics.recall(i))
        fmeasure.append(metrics.fMeasure(float(i)))
    plot.plot_per_activity_metric(precision, recall, fmeasure, "fs1_refactored.png")
    precision = []
    recall = []
#
# evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
#
# # Create 7-fold CrossValidator
# cv = CrossValidator(estimator=lr, \
#                     estimatorParamMaps=paramGrid, \
#                     evaluator=evaluator, \
#                     numFolds=5)

lrModel = lr.fit(train)
# cvModel = cv.fit(train)
# rfModel = rf.fit(train)

predictions = lrModel.transform(test)
# predictions = cvModel.transform(test)
# predictions = rfModel.transform(test)

results = predictions.select(['prediction', 'label'])
predictionAndLabels = results.rdd

metrics = MulticlassMetrics(predictionAndLabels)

cm = metrics.confusionMatrix().toArray()
accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
precision = (cm[0][0]) / (cm[0][0] + cm[1][0])
recall = (cm[0][0]) / (cm[0][0] + cm[0][1])
f1score = 2 * ((precision * recall) / (precision + recall))

# print(evaluator.evaluate(predictions))
print("Classifier: accuracy, precision, recall, f1score", accuracy, precision,
      recall, f1score)
# Example 33
trainer = ADAG(keras_model=model, worker_optimizer='adam', loss='categorical_crossentropy',
               num_workers=1, batch_size=100, communication_window=5, num_epoch=50,
               features_col="matrix", label_col="label_encoded"
               )
trained_model = trainer.train(training_set)
from distkeras.predictors import *
from distkeras.transformers import *
from distkeras.evaluators import *
from distkeras.utils import *

print("Training time: " + str(trainer.get_training_time()))
print("Accuracy: " + str(evaluate_accuracy(trained_model, test_set)))
print("Number of parameter server updates: " + str(trainer.parameter_server.num_updates))

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[string_indexer, scaler, trainer_model])

from pyspark.mllib.evaluation import MulticlassMetrics

fitted_pipeline = pipeline.fit(dataset_train) # Fit model to data

prediction = fitted_pipeline.transform(dataset_train) # Evaluate on train data.
# prediction = fitted_pipeline.transform(test_df) # <-- The same code evaluates test data.
pnl = prediction.select("index_category", "prediction")
pnl.show(100)
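
# Note: DataFrame.map as used just below only exists on Spark 1.x; on Spark 2+ this
# would be pnl.rdd.map(...) instead.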

prediction_and_label = pnl.map(lambda row: (row.index_category, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.precision())
# Example 34
    data = sc.textFile('/usr/sfd_train_one_hot.csv').map(parseLine)
    
    # data = sc.textFile('/usr/sfd_0.csv').map(parseLine)

    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.85, 0.15], seed=11L)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=39)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    #accuracy = metrics.accuracy
    accuracy = 1.0 * predictionAndLabels.filter(lambda (x, v): x == v).count() / test.count()
    # print("Summary Stats")
    # print("Precision = %s" % precision)
    # print("Recall = %s" % recall)
    # print("F1 Score = %s" % f1Score)    
    # print("Accuracy = %s" % accuracy)
    # Statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    # for label in sorted(labels):
indexed2.show()


# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features")
data = assembler.transform(indexed2)

# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.6, 0.4])

# Create Decision tree model and fit the model with training dataset
dt = DecisionTreeClassifier()
model = dt.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evuluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion Matrix:", metrics.confusionMatrix())
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
print("F-measure:", metrics.fMeasure())
# Example 36
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

# Initialize Spark ML Estimator
adadelta = elephas_optimizers.Adadelta()
estimator = ElephasEstimator(sc,
                             model,
                             nb_epoch=nb_epoch,
                             batch_size=batch_size,
                             optimizer=adadelta,
                             frequency='batch',
                             mode='asynchronous',
                             num_workers=2,
                             verbose=0,
                             validation_split=0.1,
                             categorical=True,
                             nb_classes=nb_classes)

# Fitting a model returns a Transformer
fitted_model = estimator.fit(df)

# Evaluate Spark model by evaluating the underlying model
prediction = fitted_model.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.label, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.precision())
print(metrics.recall())
pendtsets = pendtlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)

# rootNode is not accessible in Python

dtpredicts = dtmodel.transform(pendtvalid)
dtresrdd = dtpredicts.select("prediction", "label").map(lambda row:  (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205.,    0.,    3.,    0.,    0.,    3.,    1.,    0.,    0.,
#                 0.],
#             [   0.,  213.,    0.,    1.,    2.,    1.,    0.,    2.,    0.,
#                 2.],
#             [   0.,    0.,  208.,    0.,    0.,    2.,    0.,    1.,    1.,
#                 0.],
#             [   0.,    1.,    0.,  172.,    3.,    0.,    0.,    0.,    0.,
#                 0.],
#             [   2.,    2.,    1.,    8.,  197.,    0.,    0.,    2.,    3.,
#                 1.],
#             [   1.,    0.,    1.,    0.,    2.,  183.,    0.,    1.,    0.,
#                 1.],
# Example 38
	
	# Split data into training (60%) and test (40%)
	traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
	traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

	# Compute raw scores on the test set
	predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	# Instantiate metrics object
	metrics = MulticlassMetrics(predictionAndLabels)

	# Overall statistics
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	#confusion_matrix = metrics.confusionMatrix().toArray()

	print("Summary Stats")
	print("Precision = %s" % precision)
	print("Recall = %s" % recall)
	print("F1 Score = %s" % f1Score)


	# Statistics by class
	labels = traindata.map(lambda lp: lp.label).distinct().collect()
# Example 39
# For classification after training
res = test.map(lambda data: model.predict(data.features))

# Train the model
#model = LogisticRegressionWithLBFGS.train(train,10)  # with logistic regression (L-BFGS)
#model = SVMWithSGD.train(train,5000)  # with SVM
model = NaiveBayes.train(train, 10.0)


#print(len(np.array(model.weights)))

# Get the predictions for the test data
Re_l = res.collect()

# Build a list of (predicted label, true label) tuples for evaluation
hako = []
for i,v in enumerate(test.collect()):
    hako.append((float(Re_l[i]), v.label))

# Convert the list to an RDD for Spark and pass it to the metrics class
predictionAndLabels = sc.parallelize(hako)
metrics = MulticlassMetrics(predictionAndLabels)

# Compute and print recall and precision with the metrics methods
print("-------------------------------------")
print("recall = {}".format(metrics.recall(0.)))#スパムメールの検出率
print("precision = {}".format(metrics.precision(0.)))#スパムでないメールの認識率
print("-------------------------------------")

sc.stop()
# Example 40
    def train_model (conf):
        sc = SparkUtil.get_spark_context (conf.spark_conf)
        conf.output_dir = conf.output_dir.replace ("file:", "")
        conf.output_dir = "file://{0}".format (conf.output_dir)

        labeled = Evaluate.load_all (sc, conf). \
                  map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                                 features = [ b.paraDist, b.sentDist, b.docDist ] ) )

#        labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#                  map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                                 [ b, b * 2, b * 9 ] ) )
#        print (labeled.collect ())

        train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

        count = train.count ()
        start = time.time ()
        model = LogisticRegressionWithLBFGS.train (train)
        elapsed = time.time () - start
        print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

        start = time.time ()
        model_path = os.path.join (conf.output_dir, "eval", "model")
        file_path = model_path.replace ("file://", "")
        if os.path.isdir (file_path):
            print ("Removing existing model {0}".format (file_path))
            shutil.rmtree (file_path)
        model.save(sc, model_path)
        sameModel = LogisticRegressionModel.load(sc, model_path)
        elapsed = time.time () - start
        print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))


        # Metrics
        labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
        trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count () / float (train.count())
        print("Training Error => {0}".format (trainErr))

        predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
        metrics = MulticlassMetrics (predictionsAndLabels) 
        print (" --------------> {0}".format (predictionsAndLabels.take (1000)))

        #print (labelsAndPreds.collect ())
        print ("\nMETRICS:")
        try:
            print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
            print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("precision          : {0}".format (metrics.precision(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("recall             : {0}".format (metrics.recall(1.0)))
        except:
            traceback.print_exc ()
        try:
            print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
        except:
            traceback.print_exc ()

        print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
        print ("precision          : {0}".format (metrics.precision()))
        print ("recall             : {0}".format (metrics.recall()))
        print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
        print ("weighted precision : {0}".format (metrics.weightedPrecision))
        print ("weighted recall    : {0}".format (metrics.weightedRecall))
        print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
        print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
        print ("")

        # Regression metrics
        predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0 , p.label / 1.0 ) )

        regression_metrics = RegressionMetrics (predictedAndObserved)
        print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
        print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
        print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
        print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
        print ("r2......................: {0}".format (regression_metrics.r2))
        print ("")

        labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
        testErr = labelsAndPreds.filter (lambda (v, p): v != p).count () / float (test.count ())
        print ("Testing Error => {0}".format (testErr))
        print("Usage: logistic_regression", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonLogisticRegressionExample")
    sqlContext = SQLContext(sc)

    # Load the data stored in LIBSVM format as a DataFrame.
    df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    [training, test] = td.randomSplit([0.7, 0.3])

    lr = LogisticRegression(maxIter=100, regParam=0.3).setLabelCol("indexedLabel")
    lr.setElasticNetParam(0.8)

    # Fit the model
    lrModel = lr.fit(training)

    predictionAndLabels = lrModel.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))

    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())

    sc.stop()
#model = NaiveBayes.train(train,1.)  # with Naive Bayes

#print(len(np.array(model.weights)))

# Get the predictions for the test data
Re_l = res.collect()

# Build a list of (predicted label, true label) tuples for evaluation
#hako = []
#for i,v in enumerate(test.collect()):
#    hako.append((float(Re_l[i]), v.label))
#
# Convert the list to an RDD for Spark and pass it to the metrics class
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))
## Changed 7/12
metrics = MulticlassMetrics(predictionAndLabels)

metrics2 = BinaryClassificationMetrics(predictionAndLabels)


# Compute and print recall and precision with the metrics methods
print("-------------------------------------")
print("under PR = {}".format(metrics2.areaUnderPR))
#print("under ROC = {}".format(metrics2.areaUnderROC))

print("precision of ham = {}".format(metrics.precision(0.)))
print("precision of spam = {}".format(metrics.precision(1.)))
print("recall of ham = {}".format(metrics.recall(0.)))#スパムでないメールの認識率
print("recall of spam = {}".format(metrics.recall(1.)))#スパムメールの検出率

sc.stop()
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# model on training data regPara: lasso regularisation parameter (L1)
lrModel = LogisticRegression(regParam=0.2).fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)

pred.select('label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label',
                                           metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label',
                                               metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))
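# Note: these pairs are (label, prediction) while MulticlassMetrics expects (prediction, label),
# which is presumably why the confusion matrix is transposed below.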

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' %
      evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()

# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[14]:

from pyspark.ml.classification import DecisionTreeClassifier

# model on training data maxDepth is the hyperparameter
dtModel = DecisionTreeClassifier(maxDepth=3).fit(trainData)
    # $example on$
    # Load training data in LIBSVM format
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")

    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=11)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
# Example 45
trainLabelsAndPreds = trainParsed.map(
    lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda (v, p): v != p).count() / float(
    trainParsed.count())
print trainErr

# Test Error
testLabelsAndPreds = testParsed.map(
    lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda (v, p): v != p).count() / float(
    testParsed.count())
print testErr

metrics = BinaryClassificationMetrics(testLabelsAndPreds)

print metrics.areaUnderROC
print metrics.areaUnderPR

mcMetrics = MulticlassMetrics(testLabelsAndPreds)

#TODO: Do this for classes 1.0,0.0 and not just overall
print mcMetrics.precision()
print mcMetrics.recall()
print mcMetrics.fMeasure()

model.save(sc, "SVMModel")

### Run Model on Validation Set
## TODO: output file of zipcodes and predicted success metrics
## TODO: Use bokeh on file to make visualization of the US
# Example 46
# MAGIC %md
# MAGIC We can also generate a Confusion Matrix to see the results of the predictions better. ConfusionMatrix() works only with RDDs, so we will have to convert our DataFrame of (prediction, label) into a RDD.
# MAGIC 
# MAGIC confusionMatrix() returns a DenseMatrix with the columns representing the predicted class ordered by ascending class label, and each row represents the actual class ordered by ascending class label. The diagonal from top left to bottom right represents the observations that were predicted correctly. 
# MAGIC 
# MAGIC From the above confusion matrix, we observe that all Setosas (class 0) and Versicolors (class 1) have been classified correctly, but there are 10 Virginicas (class 2) that have been wrongly classified as Versicolors.

# COMMAND ----------

from pyspark.mllib.evaluation import MulticlassMetrics
# Create (prediction, label) pairs
predictionAndLabel = predictions.select("prediction", "label").rdd

# Generate confusion matrix
metrics = MulticlassMetrics(predictionAndLabel)
print metrics.confusionMatrix()
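
# A small illustration (assuming the `metrics` object above): since rows of the confusion
# matrix are the actual classes, per-class recall can be read off as diagonal / row sums.
cm = metrics.confusionMatrix().toArray()
print(cm.diagonal() / cm.sum(axis=1))  # per-class recall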


# COMMAND ----------

# MAGIC %md
# MAGIC ####Experimenting with Various Smoothing Parameters
# MAGIC 
# MAGIC We can experiment with various smoothing parameters to see which returns the best result. This is easily done with the ParamGridBuilder and CrossValidator.
# MAGIC 
# MAGIC As we indicate 6 values for the smoothing parameter, this grid will provide 6 parameter settings for CrossValidator to model, evaluate and choose from.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
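
# A minimal sketch (assuming a NaiveBayes stage named `nb`, the fitted `pipeline` above, and a
# `trainingData` DataFrame, none of which are shown here) of cross-validating the six smoothing values:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

paramGrid = ParamGridBuilder() \
    .addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) \
    .build()
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
                    numFolds=3)
# cvModel = cv.fit(trainingData)  # keeps the smoothing value with the best cross-validated accuracy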
# Example 47
# coding=utf-8

from pyspark import SparkContext, SparkConf
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Multilabel Classification Evaluation').setMaster('local[2]')
sc = SparkContext(conf=conf)

scoreAndLabels = sc.parallelize([  # note: this data is incorrect (list-valued rows fit MultilabelMetrics, not MulticlassMetrics)
    ([0.0, 1.0], [0.0, 2.0]),
    ([0.0, 2.0], [0.0, 1.0]),
    ([], [0.0]),
    ([2.0], [2.0]),
    ([2.0, 0.0], [2.0, 0.0]),
    ([0.0, 1.0, 2.0], [0.0, 1.0]),
    ([1.0], [1.0, 2.0])])

# instantiate metrics object
metrics = MulticlassMetrics(scoreAndLabels)

# summary stats
print('recall:', metrics.recall())
print('precision:', metrics.precision())
print('F1 measure:', metrics.fMeasure())
print('accuracy:', metrics.accuracy)  # accuracy is a property, not a method

# individual label stats

sc.stop()
# Example 48
model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(trainParsed.count())
print trainErr

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(testParsed.count())
print testErr

metrics = BinaryClassificationMetrics(testLabelsAndPreds)

print metrics.areaUnderROC
print metrics.areaUnderPR

mcMetrics = MulticlassMetrics(testLabelsAndPreds)

#TODO: Do this for classes 1.0,0.0 and not just overall
print mcMetrics.precision()
print mcMetrics.recall()
print mcMetrics.fMeasure()

model.save(sc, "SVMModel")

### Run Model on Validation Set
## TODO: output file of zipcodes and predicted success metrics
## TODO: Use bokeh on file to make visualization of the US
# Example 49
testLabels = testnewsgroups.map(lambda x:newsgroupsMap[x])


testTf = testRDD.map(lambda (file,text): hashingTF.transform(tokenize(text)))

testTfIdf= idf.transform(testTf)

zippedTest = testLabels.zip(testTfIdf)

test = zippedTest.map(lambda (topic,vector): LabeledPoint(topic,vector))

predictionAndLabel = test.map(lambda x: (model.predict(x.features),x.label))

accuracy = 1.0*predictionAndLabel.filter(lambda x: x[0]==x[1]).count()/test.count()

metrics = MulticlassMetrics(predictionAndLabel)

print (accuracy)

print (metrics.weightedFMeasure())


#raw features

rawTokens = rdd.map(lambda file_text: file_text[1].split(" "))

rawTF = rawTokens.map(lambda doc: hashingTF.transform(doc))


rawTrain = newsgroups.zip(rawTF).map(lambda topic_vector: LabeledPoint(newsgroupsMap[topic_vector[0]], topic_vector[1]))
def logisticRegression(trainFile, testFile, taskid, sc):
	# Load training data in LIBSVM format
	trainData = MLUtils.loadLibSVMFile(sc, trainFile)
	testData = MLUtils.loadLibSVMFile(sc, testFile)

	# Split data into training (60%) and test (40%)
	# traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
	# traindata.cache()

	# Load testing data in LIBSVM format
	#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

	labelNum = trainData.map(lambda lp: lp.label).distinct().count()

	# Run training algorithm to build the model
	model = LogisticRegressionWithLBFGS.train(trainData, numClasses=labelNum)

	# Compute raw scores on the test set
	predictionAndLabels = testData.map(lambda lp: (float(model.predict(lp.features)), lp.label))

	Json.generateJson("LogisticRegression", taskid, trainData, predictionAndLabels)
	# Instantiate metrics object
	metrics = MulticlassMetrics(predictionAndLabels)

	# Overall statistics
	precision = metrics.precision()
	recall = metrics.recall()
	f1Score = metrics.fMeasure()
	#confusion_matrix = metrics.confusionMatrix().toArray()

	print("Summary Stats")
	print("Precision = %s" % precision)
	print("Recall = %s" % recall)
	print("F1 Score = %s" % f1Score)


	# Statistics by class
	labels = trainData.map(lambda lp: lp.label).distinct().collect()
	for label in sorted(labels):
	    print("Class %s precision = %s" % (label, metrics.precision(label)))
	    print("Class %s recall = %s" % (label, metrics.recall(label)))
	    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

	# Weighted stats
	print("Weighted recall = %s" % metrics.weightedRecall)
	print("Weighted precision = %s" % metrics.weightedPrecision)
	print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
	print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
	print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

	# #return model parameters
	# res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
	# 	   ('3','Yes','Precision', metrics.precision(0.0)),
	# 	   ('4','Yes','Recall', metrics.recall(0.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
	#        ('3','Yes','Precision', metrics.precision(1.0)),
	# 	   ('4','Yes','Recall', metrics.recall(1.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
	#        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
	# 	   ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
	#        ('3','Yes','Precision', metrics.precision(2.0)),
	#        ('4','Yes','Recall', metrics.recall(2.0)),
	#        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]

	# #save output file path as JSON and dump into dumpFilePath
	# rdd = sc.parallelize(res)
	# SQLContext.createDataFrame(rdd).collect()
	# df = SQLContext.createDataFrame(rdd,['Order','CLass','Name', 'Value'])

	#tempDumpFilePath = dumpFilePath + "/part-00000"
	#if os.path.exists(tempDumpFilePath):
	#	os.remove(tempDumpFilePath)

	#df.toJSON().saveAsTextFile(hdfsFilePath)
	#tmpHdfsFilePath = hdfsFilePath + "/part-00000"
	#subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])

	# Save and load model
	#clusters.save(sc, "myModel")
	#sameModel = KMeansModel.load(sc, "myModel")
    def evaluate(self, model=None, trainingData=None, testingData=None):
        """ Ham kiem thu model, in ra man hinh do do chinh xac va thoi gian tinh toan
        """
        time_train = 0
        time_test = 0

        if (not trainingData):
            trainingData = self.trainingData
        if (not testingData):
            testingData = self.testingData

        if (not model):
            # Train model
            print("Training...")
            start_train = datetime.now()
            model = self.trainModel(trainingData)
            time_train = datetime.now() - start_train

        #print("Num nodes: ", model.stages[2].totalNumNodes, "\n", model.stages[2].toDebugString, file=open("modelDebug.txt","w"))
        # Make predictions
        print("Testing...")
        start_test = datetime.now()
        predictions = model.transform(testingData)
        time_test = datetime.now() - start_test

        # Evaluation for flow
        print("{:*^100}".format(""))
        print("Training time: ", time_train)
        print("Testing time: ", time_test)

        featureImportances = {}
        fi = model.stages[2].featureImportances
        features = loadcols(self.dataset)
        for index, value in enumerate(fi):
            featureImportances[features[index]] = value
        fiSorted = sorted(featureImportances.items(),
                          key=lambda x: x[1],
                          reverse=True)
        print("{:*^100}".format(" Feature Importances "))
        f = open("features_importance.txt", "w")
        for feature in fiSorted:
            if feature[1] > 0.000:
                print("{!s} : {:.4%}".format(feature[0].strip(), feature[1]))
            f.write("{!s}\n".format(feature[0].strip()))
        f.close()

        print("{:*^100}".format(" Evaluate for Flow "))

        print("Total predictions:", predictions.count())
        predictions.select("prediction", "indexedLabel",
                           "label").groupBy("label").count().show()

        predictionAndLabels = predictions.select("prediction",
                                                 "indexedLabel").rdd
        metrics = MulticlassMetrics(predictionAndLabels)

        print("Confusion Matrix:")
        for line in metrics.confusionMatrix().toArray():
            print(line)

        print("TPR: {:.3%} \tFPR: {:.3%}".format(
            metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
        print("TNR: {:.3%} \tFNR: {:.3%}".format(
            metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))

        print("Precision: {:.3%} \tRecall: {:.3%} \tAccuracy: {:.3%}".format(
            metrics.precision(1.0), metrics.recall(1.0), metrics.accuracy))

        print(metrics.accuracy)

        print("{:*^100}".format(""))
Example #52
def test_prfs():
    """
    Test Precision, Recall, F-score, and Support on multiclass classification data.
    Input data: https://github.com/apache/spark/blob/master/data/mllib/sample_multiclass_classification_data.txt
    """
    # TODO: revise so that it takes user input instead of hardcoded values

    # load the schemas (if they exist)

    # create a hdfs directory
    #os.system("hdfs dfs -mkdir datasets")

    # load the data file into the hdfs directory
    os.system("hdfs dfs -put sample_multiclass_classification_data.txt datasets/sample_multiclass_classification_data.txt")
    data = MLUtils.loadLibSVMFile(scsingleton.sc, "hdfs://localhost:9000/datasets/sample_multiclass_classification_data.txt")
   
    # print data.take(1)
    # ie. [LabeledPoint(1.0, (4,[0,1,2,3],[-0.222222,0.5,-0.762712,-0.833333]))] 
    # [ ( finalClassification, (numLabels, [label0, label1, label2, ..., labelN], [prob0, prob1, prob2, ..., probN]) ) ]

    # split data into train (60%), test (40%)
    trainingRDD, testRDD = data.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    with Timer() as t:
        numTest = testRDD.count()
    print "testRDD.count(): %s seconds" % t.secs

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3)
    print "LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredAndLabel = testRDD.map(lambda lp: (float(model.predict(lp.features)), lp.label))
    print "testPredAndLabel: %s seconds" % t.secs

    # calculate Precision, Recall, F1-score
    metrics = MulticlassMetrics(testPredAndLabel)
    print( "precision = %s" % metrics.precision() )
    print( "recall = %s" % metrics.recall() )
    print( "f1-score = %s" % metrics.fMeasure() )

    # statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print( "Class %s precision = %s" % (label, metrics.precision(label)) )
        print( "Class %s recall = %s" % (label, metrics.recall(label)) )
        print( "Class %s f1-score = %s" % (label, metrics.fMeasure(label, beta=1.0)) )

    # weighted stats
    print( "Weighted precision = %s" % metrics.weightedPrecision )
    print( "Weighted recall = %s" % metrics.weightedRecall )
    print( "Weighted f1-score = %s" % metrics.weightedFMeasure() )
    print( "Weighted f(0.5)-score = %s" % metrics.weightedFMeasure(beta=0.5) )
    print( "Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate )
    
    return