def runMetrics(labeledDataRDD, *args):
    # displayConfusionTable was referenced but never defined in the original snippet;
    # it is set explicitly here (mirroring the doRender() variant later in this file)
    displayConfusionTable = True
    html = '<table width=100%><tr><th>Model</th><th>Accuracy</th><th>Precision</th><th>Recall</th></tr>'
    confusionHtml = '<p>Confusion Tables for each Model</p>'
    for model in args:
        label = model.__class__.__name__
        predictionAndLabels = model.predict(labeledDataRDD.map(lambda lp: lp.features))
        metrics = MulticlassMetrics(
            predictionAndLabels.zip(labeledDataRDD.map(lambda lp: lp.label))
            .map(lambda t: (float(t[0]), float(t[1])))
        )
        html += '<tr><td>{0}</td><td>{1:.2f}%</td><td>{2:.2f}%</td><td>{3:.2f}%</td></tr>'\
            .format(label, metrics.weightedFMeasure(beta=1.0) * 100,
                    metrics.weightedPrecision * 100, metrics.weightedRecall * 100)
        if displayConfusionTable:
            confusionMatrix = metrics.call("confusionMatrix")
            confusionMatrixArray = confusionMatrix.toArray()
            #labels = metrics.call("labels")
            confusionHtml += "<p>" + label + "</p>"
            confusionHtml += "<table>"
            for row in confusionMatrixArray:
                confusionHtml += "<tr>"
                for cell in row:
                    confusionHtml += "<td>" + str(cell) + "</td>"
                confusionHtml += "</tr>"
            confusionHtml += "</table>"
    html += '</table>'
    if displayConfusionTable:
        html += confusionHtml
    display(HTML(html))
def evaluate(labelsAndPredictions, data, labels):
    """ Evaluation Metrics """
    # Instantiate metrics object
    metrics = MulticlassMetrics(labelsAndPredictions)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
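# A minimal, self-contained sketch (added, not from the original source) of the input the
# evaluate() helper above expects: an RDD of (prediction, label) float pairs. It assumes a
# Spark 1.x/2.x runtime where the no-argument precision()/recall()/fMeasure() calls still exist.
from pyspark import SparkContext
from pyspark.mllib.evaluation import MulticlassMetrics

sc = SparkContext.getOrCreate()
labelsAndPredictions = sc.parallelize(
    [(0.0, 0.0), (1.0, 1.0), (2.0, 1.0), (2.0, 2.0), (0.0, 2.0)])  # toy pairs
evaluate(labelsAndPredictions, data=None, labels=[0.0, 1.0, 2.0])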
def evaluate(predictions):
    """ Evaluation Metrics """
    # label to indexedLabel mappings
    # out = sorted(set([(i[0], i[1]) for i in predictions.select(predictions.label, predictions.indexedLabel).collect()]), key=lambda x: x[0])
    print "Predictions"
    predictions.select("prediction", "indexedLabel", "features").show(5)

    # Select (prediction, true label) and evaluate model
    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = predictions.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

treeModel = model.stages[2]
print treeModel  # summary only
def testClassification(train, test):
    # Train a RandomForest model.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    # Note: Use larger numTrees in practice.
    rf = RandomForestClassifier(labelCol="indexedLabel", numTrees=3, maxDepth=4)
    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))
    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())
def doRender(self, handlerId):
    html = '<div class="pd_save"><table width=100%><tr><th>Model</th><th>Accuracy</th><th>Precision</th><th>Recall</th></tr>'
    confusionHtml = '<p>Confusion Tables for each Model</p>'
    for modelName, model in Configuration.getModels():
        label = model.__class__.__name__
        labeledDataRDD, sqlTableName = Configuration.getLabeledData(self.entity)
        predictionAndLabels = model.predict(labeledDataRDD.map(lambda lp: lp.features))
        metrics = MulticlassMetrics(
            predictionAndLabels.zip(labeledDataRDD.map(lambda lp: lp.label))
            .map(lambda t: (float(t[0]), float(t[1])))
        )
        html += '<tr><td>{0}</td><td>{1:.2f}%</td><td>{2:.2f}%</td><td>{3:.2f}%</td></tr>'\
            .format(label, metrics.weightedFMeasure(beta=1.0) * 100,
                    metrics.weightedPrecision * 100, metrics.weightedRecall * 100)
        displayConfusionTable = True
        if displayConfusionTable:
            # get labels from RDD
            handler = training.getTrainingHandler()
            classLabels = labeledDataRDD.map(lambda t: t.label).distinct() \
                .map(lambda l: handler.getClassLabel(l)).collect()
            confusionMatrix = metrics.call("confusionMatrix")
            confusionMatrixArray = confusionMatrix.toArray()
            #labels = metrics.call("labels")
            confusionHtml += "<p>" + label + "</p>"
            confusionHtml += "<table>"
            confusionHtml += "<tr><td></td>"
            for classLabel in classLabels:
                confusionHtml += "<td>" + str(classLabel) + "</td>"
            confusionHtml += "</tr>"
            for i, row in enumerate(confusionMatrixArray):
                confusionHtml += "<tr>"
                confusionHtml += "<td>" + classLabels[i] + "</td>"
                for j, cell in enumerate(row):
                    confusionHtml += "<td style='text-align:center'>" + ("<b>" if (i == j) else "") \
                        + str(cell) + ("</b>" if (i == j) else "") + "</td>"
                confusionHtml += "</tr>"
            confusionHtml += "</table>"
    html += '</table></div>'
    if displayConfusionTable:
        html += confusionHtml
    self._addHTML(html)
def printMetrics(pred_and_label):
    metrics = MulticlassMetrics(pred_and_label)
    print 'Precision of 0', metrics.precision(0)
    print 'Precision of 1', metrics.precision(1)
    print 'Precision of 2', metrics.precision(2)
    print 'Precision of 3', metrics.precision(3)
    print 'Recall of 0', metrics.recall(0)
    print 'Recall of 1', metrics.recall(1)
    print 'Recall of 2', metrics.recall(2)
    print 'Recall of 3', metrics.recall(3)
    print 'Confusion Matrix\n', metrics.confusionMatrix().toArray()
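# Added sketch: building a toy (prediction, label) RDD with the four class values that
# printMetrics() above expects; the class ids 0-3 are illustrative only.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
pred_and_label = sc.parallelize(
    [(0.0, 0.0), (1.0, 1.0), (2.0, 3.0), (3.0, 3.0), (1.0, 2.0), (0.0, 1.0)])
printMetrics(pred_and_label)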
def train(model, model_name):
    pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx, model])
    (trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)
    evaluator = MulticlassClassificationEvaluator()
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    predictions.show(20)
    path = model_name + ".model"
    pipelineFit.write().overwrite().save(path)
    print(model_name + ": ", str(accuracy))

    predictionAndLabels = predictions.select("prediction", "label").rdd.map(lambda r: (r[0], r[1]))
    metrics = MulticlassMetrics(predictionAndLabels)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary:")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
def printMetrics(predictions_and_labels):
    metrics = MulticlassMetrics(predictions_and_labels)
    print 'Precision of True ', metrics.precision(1)
    print 'Precision of False', metrics.precision(0)
    print 'Recall of True    ', metrics.recall(1)
    print 'Recall of False   ', metrics.recall(0)
    print 'F-1 Score         ', metrics.fMeasure()
    print 'Confusion Matrix\n', metrics.confusionMatrix().toArray()
def print_performance_metrics(predictions):
    # Evaluate model
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
    auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
    aupr = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderPR"})
    print("auc = {}".format(auc))
    print("aupr = {}".format(aupr))

    # get rdd of predictions and labels for mllib eval metrics
    predictionAndLabels = predictions.select("prediction", "label").rdd

    # Instantiate metrics objects
    binary_metrics = BinaryClassificationMetrics(predictionAndLabels)
    multi_metrics = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = {}".format(binary_metrics.areaUnderPR))
    # Area under ROC curve
    print("Area under ROC = {}".format(binary_metrics.areaUnderROC))
    # Accuracy
    print("Accuracy = {}".format(multi_metrics.accuracy))
    # Confusion Matrix
    print(multi_metrics.confusionMatrix())

    ### Question 5.1 Answer ###
    # F1
    print("F1 = {}".format(multi_metrics.weightedFMeasure()))
    # Precision
    print("Precision = {}".format(multi_metrics.weightedPrecision))
    # Recall
    print("Recall = {}".format(multi_metrics.weightedRecall))
    # FPR
    print("FPR = {}".format(multi_metrics.weightedFalsePositiveRate))
    # TPR
    print("TPR = {}".format(multi_metrics.weightedTruePositiveRate))
def get_metrics(df, lower_bound, upper_bound=1.0):
    rdd = df.select("prediction", "Profit").rdd
    metrics = MulticlassMetrics(rdd)
    metrics_dict = {}

    cm = metrics.confusionMatrix().toArray()
    TP = cm[0][0]
    TN = cm[1][1]
    FP = cm[0][1]
    FN = cm[1][0]

    accuracy = (TP + TN) / cm.sum()
    if accuracy < lower_bound or accuracy > upper_bound:
        return None
    sensitivity = (TP) / (TP + FN)
    specificity = (TN) / (TN + FP)
    precision = (TP) / (TP + FP)
    npv = (TN) / (TN + FN)

    # Overall statistics
    metrics_dict['accuracy'] = accuracy
    metrics_dict['sensitivity'] = sensitivity
    metrics_dict['specificity'] = specificity
    metrics_dict['precision'] = precision
    metrics_dict['npv'] = npv
    # print("Summary Stats")
    # print(metrics.confusionMatrix())
    metrics_dict['confusionMatrix'] = metrics.confusionMatrix()

    print("{},{},{},{},{}".format(round(accuracy, 3), round(sensitivity, 3),
                                  round(specificity, 3), round(precision, 3),
                                  round(npv, 3)))
    return metrics_dict
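# Added note: MulticlassMetrics.confusionMatrix() returns actual classes as rows and
# predicted classes as columns, both ordered by ascending label (see the confusion-matrix
# discussion further down in this file). A small sketch of that convention with 0.0 taken
# as the positive class; the cell assignments in get_metrics() above use the opposite
# FP/FN orientation, so verify which one the "Profit" encoding actually needs.
from pyspark import SparkContext
from pyspark.mllib.evaluation import MulticlassMetrics

sc = SparkContext.getOrCreate()
toy = sc.parallelize([(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0), (1.0, 1.0)])  # (prediction, label)
cm = MulticlassMetrics(toy).confusionMatrix().toArray()
TP, FN = cm[0][0], cm[0][1]  # actual-0 row
FP, TN = cm[1][0], cm[1][1]  # actual-1 row
print(TP, FN, FP, TN)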
def get_results(train, test):
    results = []
    for i in range(len(train)):
        training = train[i].filter(lambda x: not np.isnan(x.features[0])).filter(
            lambda x: x.features[0] > 0.0)
        testing = test[i].filter(lambda x: not np.isnan(x.features[0])).filter(
            lambda x: x.features[0] > 0.0)
        model = RandomForest.trainClassifier(training, numClasses=2,
                                             categoricalFeaturesInfo={},
                                             numTrees=20,
                                             featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=10,
                                             maxBins=32)
        test_preds = (testing.map(lambda x: x.label).zip(
            model.predict(testing.map(lambda x: x.features))))
        test_metrics = MulticlassMetrics(
            test_preds.map(lambda x: (x[0], float(x[1]))))
        answer = str(test_metrics.precision()) + '\n' + str(
            test_metrics.confusionMatrix().toArray()) + '\n'
        results.append(answer)
    return sc.parallelize(results)
def statistics(test_data, prediction_data):
    # Compute raw scores on the test set
    prediction_and_labels = test_data.join(prediction_data, 'test_id').rdd.map(
        lambda x: (float(x.prediction[0]), float(x.test_labels[0])))

    # Instantiate metrics object
    metrics = MulticlassMetrics(prediction_and_labels)

    # Overall statistics
    print("Summary Statistics\n")
    summary_statistics(metrics)

    # Statistics by class
    print("\nClass Summary Statistics\n")
    label_statistics(metrics, labels)
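# The summary_statistics() and label_statistics() helpers are referenced above but not
# shown in this snippet; the versions below are hypothetical stand-ins (not the original
# author's code) built only from the documented MulticlassMetrics API.
def summary_statistics(metrics):
    # overall weighted statistics
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted F1 = %s" % metrics.weightedFMeasure())

def label_statistics(metrics, labels):
    # per-class statistics for each label value
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))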
def evaluate_model_simple(self, test):
    '''
    generate tpr, fpr, fnr, and tnr for each threshold
    --------
    Parameters: test: spark.df post vectorization
                number_of_iterations: number of threshold values between .001 and 1.00
                                      utilized in roc curve
    --------
    Returns: list-of-dict - containing rate of pthres, tp, fp, fn, tn
    '''
    score_model = {}
    predictionAndLabels = test.rdd.map(
        lambda lp: (float(self.model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = BinaryClassificationMetrics(predictionAndLabels)
    metrics2 = MulticlassMetrics(predictionAndLabels)

    # Area under precision-recall curve
    score_model['precision_recall'] = metrics.areaUnderPR
    # Area under ROC curve
    score_model["ROC_area"] = metrics.areaUnderROC
    # the original passed the string 'label' here; truePositiveRate/falsePositiveRate
    # expect a numeric class label, so the positive class 1.0 is used
    score_model['tpr'] = metrics2.truePositiveRate(1.0)
    score_model['fpr'] = metrics2.falsePositiveRate(1.0)
    return score_model
def print_f1(df, total_columns, classifier='gbt'):
    label_column = total_columns[-1]
    if classifier == 'gbt':
        predictionAndLabels = df.select(
            ['prediction_with_round', total_columns[-1]])
    else:
        predictionAndLabels = df.select(['indexedLabel', "prediction"])

    labels = df.select([label_column]).distinct()
    header = labels.rdd.first()
    labels = labels.rdd.filter(lambda line: line != header)

    header = predictionAndLabels.rdd.first()
    copy_predictionAndLabels = predictionAndLabels.rdd.filter(
        lambda line: line != header)
    copy_predictionAndLabel = copy_predictionAndLabels.map(
        lambda lp: (float(lp[0]), float(lp[1])))
    metrics = MulticlassMetrics(copy_predictionAndLabel)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
def evaluate(sc, model, labeled_points):
    labeled_points = labeled_points.union(sc.parallelize([]))
    labels = labeled_points.map(lambda lp: lp.label)
    predictions = model.predict(labeled_points.map(lambda lp: lp.features))
    predictions, labels = predictions.union(sc.parallelize([])), labels.union(
        sc.parallelize([]))
    predictionAndLabels = predictions.zip(labels)

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    cm = metrics.confusionMatrix().toArray()
    recall = cm[1][1] / (cm[1][0] + cm[1][1])
    precision = cm[1][1] / (cm[0][1] + cm[1][1])
    f1Score = 2. * (precision * recall) / (precision + recall)
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    print("Confusion Matrix = ")
    print("   0   1")
    print("0  {0}  {1}".format(cm[0][0], cm[1][0]))
    print("1  {0}  {1}".format(cm[0][1], cm[1][1]))
def evaluate(predictionAndLabels):
    log = {}

    # Show Validation Score (AUROC)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
    log['AUROC'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under ROC = {}".format(log['AUROC']))

    # Show Validation Score (AUPR)
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
    log['AUPR'] = "%f" % evaluator.evaluate(predictionAndLabels)
    print("Area under PR = {}".format(log['AUPR']))

    # Metrics
    predictionRDD = predictionAndLabels.select(['label', 'prediction']) \
        .rdd.map(lambda line: (line[1], line[0]))
    metrics = MulticlassMetrics(predictionRDD)

    # Confusion Matrix
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    log['precision'] = "%s" % metrics.precision()
    log['recall'] = "%s" % metrics.recall()
    log['F1 Measure'] = "%s" % metrics.fMeasure()
    print("[Overall]\tprecision = %s | recall = %s | F1 Measure = %s" %
          (log['precision'], log['recall'], log['F1 Measure']))

    # Statistics by class
    labels = [0.0, 1.0]
    for label in sorted(labels):
        log[label] = {}
        log[label]['precision'] = "%s" % metrics.precision(label)
        log[label]['recall'] = "%s" % metrics.recall(label)
        log[label]['F1 Measure'] = "%s" % metrics.fMeasure(label, beta=0.5)
        print("[Class %s]\tprecision = %s | recall = %s | F1 Measure = %s"
              % (label, log[label]['precision'], log[label]['recall'],
                 log[label]['F1 Measure']))

    return log
def validate_tffm(spark, sc, model, test_df, s3_metrics_path, s3_endpoint_path):
    # get predictions
    validation_df = model.transform(test_df)

    metricsSchema = StructType() \
        .add("metric", StringType()) \
        .add("value", DoubleType())
    metrics_names = []

    # apply threshold
    def thresholdScore(x):
        retval = 0.0
        if x > 0.5:
            retval = 1.0
        return retval

    thresholdScoreUdf = F.UserDefinedFunction(thresholdScore, T.FloatType())
    validation_df_round = validation_df.withColumn('rscore', thresholdScoreUdf(validation_df.score))

    predTffm = validation_df_round.select(['label', 'rscore'])
    predictionAndLabelsTffm = predTffm.rdd.map(lambda lp: (lp.rscore, lp.label))

    metricsTffm = BinaryClassificationMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Area_under_PR", metricsTffm.areaUnderPR))
    metrics_names.append(("Area_under_ROC", metricsTffm.areaUnderROC))

    mmetricsTffm = MulticlassMetrics(predictionAndLabelsTffm)
    metrics_names.append(("Precision", mmetricsTffm.precision()))
    metrics_names.append(("Recall", mmetricsTffm.recall()))
    metrics_names.append(("F1", mmetricsTffm.fMeasure()))
    metrics_names.append(("Weighted_recall", mmetricsTffm.weightedRecall))
    metrics_names.append(("Weighted_precision", mmetricsTffm.weightedPrecision))
    metrics_names.append(("Weighted_F1", mmetricsTffm.weightedFMeasure()))
    metrics_names.append(("Weighted_F05", mmetricsTffm.weightedFMeasure(beta=0.5)))
    metrics_names.append(("Weighted_FP_rate", mmetricsTffm.weightedFalsePositiveRate))

    mRdd = sc.parallelize(metrics_names).coalesce(1)
    dfMetrics = spark.createDataFrame(mRdd, metricsSchema)
    dfMetrics.write.csv("{0}/{1}".format(s3_metrics_path, model.endpointName), mode="overwrite")

    endpointSchema = StructType() \
        .add("time", StringType()) \
        .add("endpoint", StringType())
    endpoint_name = []
    endpoint_name.append((str(time.time()), str(model.endpointName)))
    eRdd = sc.parallelize(endpoint_name).coalesce(1)
    dfEndpoint = spark.createDataFrame(eRdd, endpointSchema)
    dfEndpoint.write.csv("{0}/endpoint.txt".format(s3_endpoint_path), mode="overwrite")
def test_spark_ml_model_classification(spark_context, classification_model, mnist_data):
    batch_size = 64
    nb_classes = 10
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    sgd_conf = optimizers.serialize(sgd)

    # Initialize Spark ML Estimator
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(nb_classes)

    # Fitting a model returns a Transformer
    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)

    # Evaluate Spark model by evaluating the underlying model
    prediction = fitted_pipeline.transform(test_df)
    pnl = prediction.select("label", "prediction")
    pnl.show(100)

    # since prediction in a multiclass classification problem is a vector, we need to compute argmax
    # the casting to a double is just necessary for using MulticlassMetrics
    pnl = pnl.select(
        'label', argmax('prediction').astype(DoubleType()).alias('prediction'))
    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
def printStatistics(labelsAndPredictions, data):
    metrics = MulticlassMetrics(labelsAndPredictions)
    labels = data.map(lambda lp: lp.label).distinct().collect()
    print("confusion metrics:")
    cm = metrics.confusionMatrix()
    print(cm)
    print('')
    print('accuracy: ' + str(metrics.accuracy))
    for label in labels:
        print('label: ' + str(label))
        print('fp: ' + str(metrics.falsePositiveRate(label)))
        print('tp: ' + str(metrics.truePositiveRate(label)))
    recall = metrics.recall()
    precision = metrics.precision()
    print("Recall = %s" % recall)
    print("Precision = %s" % precision)
def evaluateClassification(self, predictionAndLabels):
    metrics = MulticlassMetrics(predictionAndLabels)
    cm = metrics.confusionMatrix()

    result = {}
    result['Matrix'] = cm.toArray().tolist()
    result['Precision'] = metrics.precision()
    result['Recall'] = metrics.recall()
    result['F1 Score'] = metrics.fMeasure()
    return result
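# Hypothetical usage sketch for evaluateClassification() above: 'evaluator' stands in for
# an instance of the enclosing class (not shown here) and 'predictionAndLabels' for an RDD
# of (prediction, label) floats. The returned dict is plain Python, so it serializes directly.
import json

result = evaluator.evaluateClassification(predictionAndLabels)
print(json.dumps(result, indent=2))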
def performancerdd(self):
    self.calculator = 'RDDs'
    print('Calculating performance metrics using RDDs...')
    predictionRDD = self.predictions.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))

    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)

    self.areaUnderROC = binmetrics.areaUnderROC
    self.areaUnderPR = binmetrics.areaUnderPR
    self.confusionMatrix = metrics.confusionMatrix().toArray()
    self.accuracy = metrics.accuracy
    self.precision = metrics.precision()
    self.recall = metrics.recall()
    self.f1measure = metrics.fMeasure()
    self.falsePositive = metrics.falsePositiveRate(1.0)
    self.falseNegative = metrics.falsePositiveRate(0.0)
def generateJson(AlgorithmName, taskid, traindata, predictionAndLabels):
    jsonContent = dict()
    jsonContent['AlgorithmName'] = AlgorithmName
    jsonContent['TaskId'] = taskid

    labels = traindata.map(lambda lp: lp.label).distinct().collect()
    jsonContent['LabelNum'] = len(labels)

    metrics = MulticlassMetrics(predictionAndLabels)
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    confusion_matrix = metrics.confusionMatrix().toArray()

    jsonContent['Precision'] = precision
    jsonContent['Recall'] = recall
    jsonContent['F1Score'] = f1Score
    jsonContent['ConfusionMatrix'] = confusion_matrix.tolist()

    jsonContent['Labels'] = list()
    for label in sorted(labels):
        tempList = dict()
        tempList['Precision'] = metrics.precision(label)
        tempList['Recall'] = metrics.recall(label)
        tempList['F1Measure'] = metrics.fMeasure(label, beta=1.0)
        jsonContent['Labels'].append(tempList)

    jsonContent['WeightedStats'] = dict()
    # the original assigned weightedRecall to 'Precision'; corrected here and the
    # weighted recall is kept under its own key
    jsonContent['WeightedStats']['Precision'] = metrics.weightedPrecision
    jsonContent['WeightedStats']['Recall'] = metrics.weightedRecall
    jsonContent['WeightedStats']['F1Score'] = metrics.weightedFMeasure()
    jsonContent['WeightedStats']['FalsePositiveRate'] = metrics.weightedFalsePositiveRate

    with open(taskid + '.json', 'w') as jsonFile:
        json.dump(jsonContent, jsonFile, indent=4, separators=(',', ': '))
        jsonFile.flush()
def amazon_classification(sc, filename):
    '''
    Args:
        sc: The Spark Context
        filename: Filename of the Amazon reviews file to use, where each line represents a review
    '''
    # Load in reviews
    reviews = sc.textFile(filename).sample(False, 0.001)
    # Parse to csv
    csv_loads = reviews.map(loadcsv)
    labeled_data = (csv_loads.filter(lambda x: x != None).mapValues(lambda x: x.split()))
    labels = labeled_data.keys()
    tf = HashingTF().transform(labeled_data.map(lambda x: x[1]))
    idf = IDF(minDocFreq=7).fit(tf)
    tfidf = idf.transform(tf)
    labeled_points = (labels.zip(tfidf)
                      .map(lambda x: LabeledPoint(float(x[0]), x[1])))
    training, test = labeled_points.randomSplit([0.6, 0.4])
    model = NaiveBayes.train(training)

    # Use our model to predict
    train_preds = (training.map(lambda x: x.label)
                   .zip(model.predict(training.map(lambda x: x.features))))
    test_preds = (test.map(lambda x: x.label)
                  .zip(model.predict(test.map(lambda x: x.features))))

    # Ask PySpark for some metrics on how our model predictions performed
    trained_metrics = MulticlassMetrics(train_preds.map(lambda x: (x[0], float(x[1]))))
    test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))

    ojbk = open('./xxx.txt', 'w+')
    ojbk.write(str(trained_metrics.confusionMatrix().toArray()) + '\n')
    ojbk.write(str(trained_metrics.precision()) + '\n')
    ojbk.write(str(test_metrics.confusionMatrix().toArray()) + '\n')
    ojbk.write(str(test_metrics.precision()) + '\n')  # fixed: was misspelled 'objk'
    ojbk.close()
def predict(self):
    #print self.predictingData.show()
    predictions = self.model.transform(self.predictingData)
    #print predictions.show()
    #df = predictions.select('prediction').collect()
    #return df[0].asDict()["prediction"]
    predictions.select("URL", "prediction", "indexedLabel", "label").show(200)

    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    print("TPR: {:.3%} \tFPR: {:.3%}".format(
        metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
    print("TNR: {:.3%} \tFNR: {:.3%}".format(
        metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))
    print("Confusion Matrix:")
    for line in metrics.confusionMatrix().toArray():
        print(line)
def modelStatistics(labelsAndPredictions):
    metrics = MulticlassMetrics(labelsAndPredictions)
    print(metrics.confusionMatrix())

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
def pred_precision_kaggle(prediction, NumCluster):
    # top-1 prediction: the original indexed x.prediction here, but the probability
    # vector is what is sorted in the loop below, so it is used consistently
    pred_label = prediction.rdd.map(lambda x: (float(np.argsort(-1 * x.probability)[:1]),
                                               float(x.country_destination_indexed)))
    metrics = MulticlassMetrics(pred_label)
    avg_precision = metrics.precision()
    for i in range(1, NumCluster):
        pred_label = prediction.rdd.map(lambda x: (float(np.argsort(-1 * x.probability)[i:(i + 1)]),
                                                   float(x.country_destination_indexed)))
        metrics = MulticlassMetrics(pred_label)
        avg_precision += metrics.precision()
    return avg_precision
def performance(predictions):
    predictionRDD = predictions.select(['label', 'prediction']).rdd.map(lambda line: (line[1], line[0]))
    binmetrics = BinaryClassificationMetrics(predictionRDD)
    metrics = MulticlassMetrics(predictionRDD)

    results = {'predictions': predictions,
               'areaUnderROC': binmetrics.areaUnderROC,
               'areaUnderPR': binmetrics.areaUnderPR,
               'confusionMatrix': metrics.confusionMatrix().toArray(),
               'accuracy': metrics.accuracy,
               'precision': metrics.precision(),
               'recall': metrics.recall(),
               'f1measure': metrics.fMeasure()}
    return results
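# Hedged usage sketch for performance() above: 'predictions' is assumed to be any Spark
# DataFrame with numeric 'label' and 'prediction' columns (e.g. the output of a fitted
# pipeline's transform()); only a few of the returned keys are printed.
results = performance(predictions)
print("Accuracy: {}".format(results['accuracy']))
print("AUROC:    {}".format(results['areaUnderROC']))
print(results['confusionMatrix'])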
def test_functional_model(spark_context, classification_model_functional, mnist_data):
    batch_size = 64
    epochs = 1

    x_train, y_train, x_test, y_test = mnist_data
    x_train = x_train[:1000]
    y_train = y_train[:1000]
    df = to_data_frame(spark_context, x_train, y_train, categorical=True)
    test_df = to_data_frame(spark_context, x_test, y_test, categorical=True)

    sgd = optimizers.SGD()
    sgd_conf = optimizers.serialize(sgd)
    estimator = ElephasEstimator()
    estimator.set_keras_model_config(classification_model_functional.to_yaml())
    estimator.set_optimizer_config(sgd_conf)
    estimator.set_mode("synchronous")
    estimator.set_loss("categorical_crossentropy")
    estimator.set_metrics(['acc'])
    estimator.set_epochs(epochs)
    estimator.set_batch_size(batch_size)
    estimator.set_validation_split(0.1)
    estimator.set_categorical_labels(True)
    estimator.set_nb_classes(10)

    pipeline = Pipeline(stages=[estimator])
    fitted_pipeline = pipeline.fit(df)
    prediction = fitted_pipeline.transform(test_df)

    pnl = prediction.select("label", "prediction")
    pnl = pnl.select(
        'label', argmax('prediction').astype(DoubleType()).alias('prediction'))
    pnl.show(100)

    prediction_and_label = pnl.rdd.map(lambda row: (row.label, row.prediction))
    metrics = MulticlassMetrics(prediction_and_label)
    print(metrics.accuracy)
def eval_model(test_preds, model):
    """
    Evaluate the ml model given the predictions and test data

    Args:
        test_preds - a list of transformed prediction data
        model - the ml pipelined model

    Returns:
        A confusion matrix, along with the precision, recall and F1 score
        of the currently trained model
    """
    metrics = MulticlassMetrics(test_preds.select("prediction", "label").rdd)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    print("Confusion matrix")
    print(metrics.confusionMatrix())
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)
    lambda x: x[0] and x[1]).map(lambda x: (float(x[0]), x[1])).mapValues(
        nltk.word_tokenize))
labels = labeled_data.map(lambda x: x[0])
tfidf = produce_tfidf(labeled_data.map(lambda x: x[1]))
zipped_data = (
    labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1])).cache())

# Do a random split so we can test our model on non-trained data
training, test = zipped_data.randomSplit([0.7, 0.3])

# Train our model
model = NaiveBayes.train(training)

# Use our model to predict
train_preds = (training.map(lambda x: x.label).zip(
    model.predict(training.map(lambda x: x.features))))
test_preds = (test.map(lambda x: x.label).zip(
    model.predict(test.map(lambda x: x.features))))

# Ask PySpark for some metrics on how our model predictions performed
trained_metrics = MulticlassMetrics(
    train_preds.map(lambda x: (x[0], float(x[1]))))
test_metrics = MulticlassMetrics(test_preds.map(lambda x: (x[0], float(x[1]))))

with open('output_discrete.txt', 'w+') as f:
    f.write(str(trained_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(trained_metrics.precision()) + '\n')
    f.write(str(test_metrics.confusionMatrix().toArray()) + '\n')
    f.write(str(test_metrics.precision()) + '\n')
print("Data prepared.\n") # create models for one-vs-rest SVM binary classifiers print("Preparing models\n") models = [model_per_class(i, labelled_training_data) for i in range(1, 7)] print("Models prepared.\n") # make predictions for testing data print("Making predictions.\n") predictions = labelled_testing_data.map( lambda x: (float(np.argmax([model.predict(x.features) for model in models]) + 1), x.label)) print("Predictions completed.\n") # calculate precision, recall, and f-measure print("Calculating evaluation metrics for feature set 1.\n") metrics = MulticlassMetrics(predictions) print("F-Measure: ", metrics.fMeasure()) print("Confusion matrix\n\n") plot.plot_confusion_matrix(metrics.confusionMatrix().toArray(), "cm1_refactored.png") for i in range(1, 7): print("Precision for ", i, " is ", metrics.precision(i)) print("Recall for ", i, " is ", metrics.recall(i)) print("f-measure for ", i, " is ", metrics.fMeasure(float(i)), "\n") precision.append(metrics.precision(i)) recall.append(metrics.recall(i)) fmeasure.append(metrics.fMeasure(float(i))) plot.plot_per_activity_metric(precision, recall, fmeasure, "fs1_refactored.png") precision = [] recall = []
# evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
#
# # Create 7-fold CrossValidator
# cv = CrossValidator(estimator=lr,
#                     estimatorParamMaps=paramGrid,
#                     evaluator=evaluator,
#                     numFolds=5)

lrModel = lr.fit(train)
# cvModel = cv.fit(train)
# rfModel = rf.fit(train)

predictions = lrModel.transform(test)
# predictions = cvModel.transform(test)
# predictions = rfModel.transform(test)

results = predictions.select(['prediction', 'label'])
predictionAndLabels = results.rdd
metrics = MulticlassMetrics(predictionAndLabels)

cm = metrics.confusionMatrix().toArray()
accuracy = (cm[0][0] + cm[1][1]) / cm.sum()
precision = (cm[0][0]) / (cm[0][0] + cm[1][0])
recall = (cm[0][0]) / (cm[0][0] + cm[0][1])
f1score = 2 * ((precision * recall) / (precision + recall))
# print(evaluator.evaluate(predictions))
print("Classifier: accuracy, precision, recall, f1score", accuracy, precision, recall, f1score)
trainer = ADAG(keras_model=model, worker_optimizer='adam', loss='categorical_crossentropy',
               num_workers=1, batch_size=100, communication_window=5, num_epoch=50,
               features_col="matrix", label_col="label_encoded")

trained_model = trainer.train(training_set)

from distkeras.predictors import *
from distkeras.transformers import *
from distkeras.evaluators import *
from distkeras.utils import *

print("Training time: " + str(trainer.get_training_time()))
print("Accuracy: " + str(evaluate_accuracy(trained_model, test_set)))
print("Number of parameter server updates: " + str(trainer.parameter_server.num_updates))

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[string_indexer, scaler, trainer_model])

from pyspark.mllib.evaluation import MulticlassMetrics

fitted_pipeline = pipeline.fit(dataset_train)  # Fit model to data

prediction = fitted_pipeline.transform(dataset_train)  # Evaluate on train data.
# prediction = fitted_pipeline.transform(test_df)  # <-- The same code evaluates test data.
pnl = prediction.select("index_category", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.index_category, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.precision())
data = sc.textFile('/usr/sfd_train_one_hot.csv').map(parseLine)
# data = sc.textFile('/usr/sfd_0.csv').map(parseLine)

# Split data into training (85%) and test (15%)
training, test = data.randomSplit([0.85, 0.15], seed=11L)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=39)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
#accuracy = metrics.accuracy
accuracy = 1.0 * predictionAndLabels.filter(lambda (x, v): x == v).count() / test.count()
# print("Summary Stats")
# print("Precision = %s" % precision)
# print("Recall = %s" % recall)
# print("F1 Score = %s" % f1Score)
# print("Accuracy = %s" % accuracy)

# Statistics by class
labels = data.map(lambda lp: lp.label).distinct().collect()
# for label in sorted(labels):
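# Added note: the manual accuracy above is the same quantity as the 'accuracy' property
# that MulticlassMetrics exposes from Spark 2.0 onwards (commented out above). A tiny
# self-contained check with toy (prediction, label) pairs:
from pyspark import SparkContext
from pyspark.mllib.evaluation import MulticlassMetrics

_sc = SparkContext.getOrCreate()
_toy = _sc.parallelize([(1.0, 1.0), (0.0, 1.0), (2.0, 2.0), (2.0, 0.0)])
_manual = 1.0 * _toy.filter(lambda t: t[0] == t[1]).count() / _toy.count()
print("accuracy property = %s, manual = %s" % (MulticlassMetrics(_toy).accuracy, _manual))  # both 0.5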
indexed2.show()

# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features")
data = assembler.transform(indexed2)

# Split data into training and test data set
training, test = data.select("label", "features").randomSplit([0.6, 0.4])

# Create Decision tree model and fit the model with training dataset
dt = DecisionTreeClassifier()
model = dt.fit(training)

# Generate prediction from test dataset
predictions = model.transform(test)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator()
accuracy = evaluator.evaluate(predictions)

# Show model accuracy
print("Accuracy:", accuracy)

# Report
predictionAndLabels = predictions.select("label", "prediction").rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Confusion Matrix:", metrics.confusionMatrix())
print("Precision:", metrics.precision())
print("Recall:", metrics.recall())
print("F-measure:", metrics.fMeasure())
df = to_data_frame(sc, x_train, y_train, categorical=True)
test_df = to_data_frame(sc, x_test, y_test, categorical=True)

# Initialize Spark ML Estimator
adadelta = elephas_optimizers.Adadelta()
estimator = ElephasEstimator(sc, model,
                             nb_epoch=nb_epoch,
                             batch_size=batch_size,
                             optimizer=adadelta,
                             frequency='batch',
                             mode='asynchronous',
                             num_workers=2,
                             verbose=0,
                             validation_split=0.1,
                             categorical=True,
                             nb_classes=nb_classes)

# Fitting a model returns a Transformer
fitted_model = estimator.fit(df)

# Evaluate Spark model by evaluating the underlying model
prediction = fitted_model.transform(test_df)
pnl = prediction.select("label", "prediction")
pnl.show(100)

prediction_and_label = pnl.map(lambda row: (row.label, row.prediction))
metrics = MulticlassMetrics(prediction_and_label)
print(metrics.precision())
print(metrics.recall())
pendtsets = pendtlpoints.randomSplit([0.8, 0.2])
pendttrain = pendtsets[0].cache()
pendtvalid = pendtsets[1].cache()

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(maxDepth=20)
dtmodel = dt.fit(pendttrain)

# rootNode is not accessible in Python

dtpredicts = dtmodel.transform(pendtvalid)
dtresrdd = dtpredicts.select("prediction", "label").map(lambda row: (row.prediction, row.label))

from pyspark.mllib.evaluation import MulticlassMetrics
dtmm = MulticlassMetrics(dtresrdd)
dtmm.precision()
#0.951442968392121
print(dtmm.confusionMatrix())
#DenseMatrix([[ 205.,   0.,   3.,   0.,   0.,   3.,   1.,   0.,   0.,   0.],
#             [   0., 213.,   0.,   1.,   2.,   1.,   0.,   2.,   0.,   2.],
#             [   0.,   0., 208.,   0.,   0.,   2.,   0.,   1.,   1.,   0.],
#             [   0.,   1.,   0., 172.,   3.,   0.,   0.,   0.,   0.,   0.],
#             [   2.,   2.,   1.,   8., 197.,   0.,   0.,   2.,   3.,   1.],
#             [   1.,   0.,   1.,   0.,   2., 183.,   0.,   1.,   0.,   1.],
# Split data into training (60%) and test (40%)
traindata, testdata = data.randomSplit([0.6, 0.4], seed=11L)
traindata.cache()

# Load testing data in LIBSVM format
#testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

# Compute raw scores on the test set
predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
#confusion_matrix = metrics.confusionMatrix().toArray()

print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Statistics by class
labels = traindata.map(lambda lp: lp.label).distinct().collect()
# Predictions used after training
res = test.map(lambda data: model.predict(data.features))

# Train the model
#model = LogisticRegressionWithLBFGS.train(train, 10)   # with logistic regression (L-BFGS)
#model = SVMWithSGD.train(train, 5000)                  # with SVM
model = NaiveBayes.train(train, 10.0)
#print(len(np.array(model.weights)))

# Collect the predictions for the test data
Re_l = res.collect()

# For evaluation, build a list of (predicted label, true label) tuples
hako = []
for i, v in enumerate(test.collect()):
    hako.append((float(Re_l[i]), v.label))

# Convert the list back into an RDD and hand it to the metrics class
predictionAndLabels = sc.parallelize(hako)
metrics = MulticlassMetrics(predictionAndLabels)

# Use the metrics methods to compute and print recall and precision
print("-------------------------------------")
print("recall = {}".format(metrics.recall(0.)))        # detection rate for spam mail
print("precision = {}".format(metrics.precision(0.)))  # recognition rate for non-spam mail
print("-------------------------------------")
sc.stop()
def train_model (conf):
    sc = SparkUtil.get_spark_context (conf.spark_conf)
    conf.output_dir = conf.output_dir.replace ("file:", "")
    conf.output_dir = "file://{0}".format (conf.output_dir)

    labeled = Evaluate.load_all (sc, conf). \
              map (lambda b : LabeledPoint ( label = 1.0 if b.fact else 0.0,
                                             features = [ b.paraDist, b.sentDist, b.docDist ] ) )
#    labeled = sc.parallelize ([ round ((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
#              map (lambda b : LabeledPoint ( 1.0 if b % 2 == 0 else 0.0,
#                                             [ b, b * 2, b * 9 ] ) )
#    print (labeled.collect ())

    train, test = labeled.randomSplit (weights=[ 0.8, 0.2 ], seed=12345)

    count = train.count ()
    start = time.time ()
    model = LogisticRegressionWithLBFGS.train (train)
    elapsed = time.time () - start
    print ("Trained model on training set of size {0} in {1} seconds".format (count, elapsed))

    start = time.time ()
    model_path = os.path.join (conf.output_dir, "eval", "model")
    file_path = model_path.replace ("file://", "")
    if os.path.isdir (file_path):
        print ("Removing existing model {0}".format (file_path))
        shutil.rmtree (file_path)
    model.save(sc, model_path)
    sameModel = LogisticRegressionModel.load(sc, model_path)
    elapsed = time.time () - start
    print ("Saved and restored model to {0} in {1} seconds".format (model_path, elapsed))

    # Metrics
    labelsAndPreds = test.map (lambda p: (p.label, model.predict (p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count () / float (train.count())
    print("Training Error => {0}".format (trainErr))

    predictionsAndLabels = labelsAndPreds.map (lambda x : ( float(x[1]), float(x[0]) ))
    metrics = MulticlassMetrics (predictionsAndLabels)
    print (" --------------> {0}".format (predictionsAndLabels.take (1000)))
    #print (labelsAndPreds.collect ())

    print ("\nMETRICS:")
    try:
        print ("false positive (0.0): {0}".format (metrics.falsePositiveRate(0.0)))
        print ("false positive (1.0): {0}".format (metrics.falsePositiveRate(1.0)))
    except:
        traceback.print_exc ()
    try:
        print ("precision          : {0}".format (metrics.precision(1.0)))
    except:
        traceback.print_exc ()
    try:
        print ("recall             : {0}".format (metrics.recall(1.0)))
    except:
        traceback.print_exc ()
    try:
        print ("fMeasure           : {0}".format (metrics.fMeasure(0.0, 2.0)))
    except:
        traceback.print_exc ()

    print ("confusion matrix   : {0}".format (metrics.confusionMatrix().toArray ()))
    print ("precision          : {0}".format (metrics.precision()))
    print ("recall             : {0}".format (metrics.recall()))
    print ("weighted false pos : {0}".format (metrics.weightedFalsePositiveRate))
    print ("weighted precision : {0}".format (metrics.weightedPrecision))
    print ("weighted recall    : {0}".format (metrics.weightedRecall))
    print ("weight f measure   : {0}".format (metrics.weightedFMeasure()))
    print ("weight f measure 2 : {0}".format (metrics.weightedFMeasure(2.0)))
    print ("")

    # Regression metrics
    predictedAndObserved = test.map (lambda p: (model.predict (p.features) / 1.0, p.label / 1.0))
    regression_metrics = RegressionMetrics (predictedAndObserved)
    print ("explained variance......: {0}".format (regression_metrics.explainedVariance))
    print ("absolute error..........: {0}".format (regression_metrics.meanAbsoluteError))
    print ("mean squared error......: {0}".format (regression_metrics.meanSquaredError))
    print ("root mean squared error.: {0}".format (regression_metrics.rootMeanSquaredError))
    print ("r2......................: {0}".format (regression_metrics.r2))
    print ("")

    labelsAndPreds = test.map (lambda p: (p.label, sameModel.predict (p.features)))
    testErr = labelsAndPreds.filter (lambda (v, p): v != p).count () / float (test.count ())
    print ("Testing Error => {0}".format (testErr))
print("Usage: logistic_regression", file=sys.stderr) exit(-1) sc = SparkContext(appName="PythonLogisticRegressionExample") sqlContext = SQLContext(sc) # Load the data stored in LIBSVM format as a DataFrame. df = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt") # Map labels into an indexed column of labels in [0, numLabels) stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel") si_model = stringIndexer.fit(df) td = si_model.transform(df) [training, test] = td.randomSplit([0.7, 0.3]) lr = LogisticRegression(maxIter=100, regParam=0.3).setLabelCol("indexedLabel") lr.setElasticNetParam(0.8) # Fit the model lrModel = lr.fit(training) predictionAndLabels = lrModel.transform(test).select("prediction", "indexedLabel") \ .map(lambda x: (x.prediction, x.indexedLabel)) metrics = MulticlassMetrics(predictionAndLabels) print("weighted f-measure %.3f" % metrics.weightedFMeasure()) print("precision %s" % metrics.precision()) print("recall %s" % metrics.recall()) sc.stop()
#model = NaiveBayes.train(train, 1.)  # with Naive Bayes
#print(len(np.array(model.weights)))

# Collect the predictions for the test data
Re_l = res.collect()

# For evaluation, build a list of (predicted label, true label) tuples
#hako = []
#for i, v in enumerate(test.collect()):
#    hako.append((float(Re_l[i]), v.label))
#
# Convert that list for Spark and hand it to the metrics classes
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))  # changed 7/12
metrics = MulticlassMetrics(predictionAndLabels)
metrics2 = BinaryClassificationMetrics(predictionAndLabels)

# Use the metrics methods to compute and print recall and precision
print("-------------------------------------")
print("under PR = {}".format(metrics2.areaUnderPR))
#print("under ROC = {}".format(metrics2.areaUnderROC))
print("precision of ham = {}".format(metrics.precision(0.)))
print("precision of spam = {}".format(metrics.precision(1.)))
print("recall of ham = {}".format(metrics.recall(0.)))   # recognition rate for non-spam mail
print("recall of spam = {}".format(metrics.recall(1.)))  # detection rate for spam mail
print("-------------------------------------")
sc.stop()
# Evaluate model based on confusion matrix
from pyspark.mllib.evaluation import MulticlassMetrics

# model on training data; regParam: lasso regularisation parameter (L1)
lrModel = LogisticRegression(regParam=0.2).fit(trainData)

# make prediction on test data
pred = lrModel.transform(testData)
pred.select('label', 'prediction').show()

evaluator1 = BinaryClassificationEvaluator(labelCol='label', metricName="areaUnderROC")
evaluator2 = MulticlassClassificationEvaluator(labelCol='label', metricName="f1")
metrics = MulticlassMetrics(pred.select('label', 'prediction').rdd.map(tuple))

print('AUC ROC of Logistic Regression model is %f' % evaluator1.evaluate(pred))
print('F1 score of Logistic Regression model is %f' % evaluator2.evaluate(pred))
metrics.confusionMatrix().toArray().transpose()


# <a id="context322"></a>
# #### 3.2.2. Decision Tree

# In[14]:

from pyspark.ml.classification import DecisionTreeClassifier

# model on training data; maxDepth is the hyperparameter
dtModel = DecisionTreeClassifier(maxDepth=3).fit(trainData)
# $example on$
# Load training data in LIBSVM format
data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_multiclass_classification_data.txt")

# Split data into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=11)
training.cache()

# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Statistics by class
labels = data.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
trainLabelsAndPreds = trainParsed.map(
    lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda (v, p): v != p).count() / float(
    trainParsed.count())
print trainErr

# Test Error
testLabelsAndPreds = testParsed.map(
    lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda (v, p): v != p).count() / float(
    testParsed.count())
print testErr

metrics = BinaryClassificationMetrics(testLabelsAndPreds)
print metrics.areaUnderROC
print metrics.areaUnderPR

mcMetrics = MulticlassMetrics(testLabelsAndPreds)
#TODO: Do this for classes 1.0,0.0 and not just overall
print mcMetrics.precision()
print mcMetrics.recall()
print mcMetrics.fMeasure()

model.save(sc, "SVMModel")

### Run Model on Validation Set
## TODO: output file of zipcodes and predicted success metrics
## TODO: Use bokeh on file to make visualization of the US
# MAGIC %md
# MAGIC We can also generate a confusion matrix to inspect the predictions in more detail. confusionMatrix() works only with RDDs, so we have to convert our DataFrame of (prediction, label) into an RDD.
# MAGIC
# MAGIC confusionMatrix() returns a DenseMatrix whose columns represent the predicted class, ordered by ascending class label, and whose rows represent the actual class, ordered by ascending class label. The diagonal from top left to bottom right holds the observations that were predicted correctly.
# MAGIC
# MAGIC From the above confusion matrix, we observe that all Setosas (class 0) and Versicolors (class 1) have been classified correctly, but 10 Virginicas (class 2) have been wrongly classified as Versicolors.

# COMMAND ----------

from pyspark.mllib.evaluation import MulticlassMetrics

# Create (prediction, label) pairs
predictionAndLabel = predictions.select("prediction", "label").rdd

# Generate confusion matrix
metrics = MulticlassMetrics(predictionAndLabel)
print metrics.confusionMatrix()

# COMMAND ----------

# MAGIC %md
# MAGIC ####Experimenting with Various Smoothing Parameters
# MAGIC
# MAGIC We can experiment with various smoothing parameters to see which returns the best result. This is easily done with the ParamGridBuilder and CrossValidator.
# MAGIC
# MAGIC As we indicate 6 values for the smoothing parameter, this grid gives CrossValidator 6 parameter settings to model, evaluate and choose from.

# COMMAND ----------

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
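# COMMAND ----------

# Added cell (not part of the original notebook), related to the confusion-matrix cell
# above: labelling the DenseMatrix rows and columns makes the per-class errors easier to
# read off. It assumes the same `predictions` DataFrame with numeric "label" and
# "prediction" columns.
import pandas as pd

cm = metrics.confusionMatrix().toArray()
class_labels = sorted(r[0] for r in predictions.select("label").distinct().collect())
cm_df = pd.DataFrame(cm,
                     index=["actual %s" % l for l in class_labels],
                     columns=["predicted %s" % l for l in class_labels])
print(cm_df)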
# coding=utf-8
from pyspark import SparkContext, SparkConf
# note: the pairs below are in the (predictions, labels) list format used by
# MultilabelMetrics; the original imported MulticlassMetrics, which cannot consume this
# data (the original comment also flagged the data as wrong), so the multilabel
# evaluator is used here instead
from pyspark.mllib.evaluation import MultilabelMetrics
from pyspark.mllib.util import MLUtils

conf = SparkConf().setAppName('Multilabel Classification Evaluation').setMaster('local[2]')
sc = SparkContext(conf=conf)

scoreAndLabels = sc.parallelize([
    ([0.0, 1.0], [0.0, 2.0]),
    ([0.0, 2.0], [0.0, 1.0]),
    ([], [0.0]),
    ([2.0], [2.0]),
    ([2.0, 0.0], [2.0, 0.0]),
    ([0.0, 1.0, 2.0], [0.0, 1.0]),
    ([1.0], [1.0, 2.0])])

# instantiate metrics object
metrics = MultilabelMetrics(scoreAndLabels)

# summary stats
print('recall:', metrics.recall())
print('precision:', metrics.precision())
print('F1 measure:', metrics.f1Measure())
print('accuracy:', metrics.accuracy)  # accuracy is a property, not a method

# individual label stats

sc.stop()
model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainParsed.count())
print trainErr

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda (v, p): v != p).count() / float(testParsed.count())
print testErr

metrics = BinaryClassificationMetrics(testLabelsAndPreds)
print metrics.areaUnderROC
print metrics.areaUnderPR

mcMetrics = MulticlassMetrics(testLabelsAndPreds)
#TODO: Do this for classes 1.0,0.0 and not just overall
print mcMetrics.precision()
print mcMetrics.recall()
print mcMetrics.fMeasure()

model.save(sc, "SVMModel")

### Run Model on Validation Set
## TODO: output file of zipcodes and predicted success metrics
## TODO: Use bokeh on file to make visualization of the US
testLabels = testnewsgroups.map(lambda x: newsgroupsMap[x])
testTf = testRDD.map(lambda (file, text): hashingTF.transform(tokenize(text)))
testTfIdf = idf.transform(testTf)
zippedTest = testLabels.zip(testTfIdf)
test = zippedTest.map(lambda (topic, vector): LabeledPoint(topic, vector))

predictionAndLabel = test.map(lambda x: (model.predict(x.features), x.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda x: x[0] == x[1]).count() / test.count()
metrics = MulticlassMetrics(predictionAndLabel)
print(accuracy)
print(metrics.weightedFMeasure())

# raw features
rawTokens = rdd.map(lambda (file, text): text.split(" "))
rawTF = rawTokens.map(lambda doc: hashingTF.transform(doc))
rawTrain = newsgroups.zip(rawTF).map(lambda (topic, vector): LabeledPoint(newsgroupsMap(topic), vector))
def logisticRegression(trainFile, testFile, taskid, sc):
    # Load training data in LIBSVM format
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)

    # Split data into training (60%) and test (40%)
    # traindata, testdata = data.randomSplit([0.6, 0.4], seed = 11L)
    # traindata.cache()

    # Load testing data in LIBSVM format
    #testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

    labelNum = trainData.map(lambda lp: lp.label).distinct().count()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(trainData, numClasses=labelNum)

    # Compute raw scores on the test set
    predictionAndLabels = testData.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    Json.generateJson("LogisticRegression", taskid, trainData, predictionAndLabels)

    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)

    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1Score = metrics.fMeasure()
    #confusion_matrix = metrics.confusionMatrix().toArray()

    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

    # Statistics by class
    labels = trainData.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

    # #return model parameters
    # res = [('1','Yes','TP Rate', metrics.truePositiveRate(0.0)),
    #        ('2','Yes','FP Rate', metrics.falsePositiveRate(0.0)),
    #        ('3','Yes','Precision', metrics.precision(0.0)),
    #        ('4','Yes','Recall', metrics.recall(0.0)),
    #        ('5','Yes','F-Measure', metrics.fMeasure(0.0, beta=1.0)),
    #        ('1','Yes','TP Rate', metrics.truePositiveRate(1.0)),
    #        ('2','Yes','FP Rate', metrics.falsePositiveRate(1.0)),
    #        ('3','Yes','Precision', metrics.precision(1.0)),
    #        ('4','Yes','Recall', metrics.recall(1.0)),
    #        ('5','Yes','F-Measure', metrics.fMeasure(1.0, beta=1.0)),
    #        ('1','Yes','TP Rate', metrics.truePositiveRate(2.0)),
    #        ('2','Yes','FP Rate', metrics.falsePositiveRate(2.0)),
    #        ('3','Yes','Precision', metrics.precision(2.0)),
    #        ('4','Yes','Recall', metrics.recall(2.0)),
    #        ('5','Yes','F-Measure', metrics.fMeasure(2.0, beta=1.0))]

    # #save output file path as JSON and dump into dumpFilePath
    # rdd = sc.parallelize(res)
    # SQLContext.createDataFrame(rdd).collect()
    # df = SQLContext.createDataFrame(rdd, ['Order','CLass','Name', 'Value'])

    #tempDumpFilePath = dumpFilePath + "/part-00000"
    #if os.path.exists(tempDumpFilePath):
    #    os.remove(tempDumpFilePath)

    #df.toJSON().saveAsTextFile(hdfsFilePath)
    #tmpHdfsFilePath = hdfsFilePath + "/part-00000"
    #subprocess.call(["hadoop","fs","-copyToLocal", tmpHdfsFilePath, dumpFilePath])

    # Save and load model
    #clusters.save(sc, "myModel")
    #sameModel = KMeansModel.load(sc, "myModel")
def evaluate(self, model=None, trainingData=None, testingData=None):
    """
    Test the model; print its accuracy and the computation time to the screen.
    """
    time_train = 0
    time_test = 0
    if (not trainingData):
        trainingData = self.trainingData
    if (not testingData):
        testingData = self.testingData
    if (not model):
        # Train model
        print("Training...")
        start_train = datetime.now()
        model = self.trainModel(trainingData)
        time_train = datetime.now() - start_train
        #print("Num nodes: ", model.stages[2].totalNumNodes, "\n", model.stages[2].toDebugString, file=open("modelDebug.txt","w"))

    # Make predictions
    print("Testing...")
    start_test = datetime.now()
    predictions = model.transform(testingData)
    time_test = datetime.now() - start_test

    # Evaluation for flow
    print("{:*^100}".format(""))
    print("Training time: ", time_train)
    print("Testing time: ", time_test)

    featureImportances = {}
    fi = model.stages[2].featureImportances
    features = loadcols(self.dataset)
    index = 0
    for value in fi:
        featureImportances[features[index]] = value
        index = index + 1
    fiSorted = sorted(featureImportances.items(), key=lambda x: x[1], reverse=True)

    print("{:*^100}".format(" Feature Importances "))
    f = open("features_importance.txt", "w")
    for feature in fiSorted:
        if feature[1] > 0.000:
            print("{!s} : {:.4%}".format(feature[0].strip(), feature[1]))
            f.write("{!s}\n".format(feature[0].strip()))
    f.close()

    print("{:*^100}".format(" Evaluate for Flow "))
    print("Total predictions:", predictions.count())
    predictions.select("prediction", "indexedLabel", "label").groupBy("label").count().show()

    predictionAndLabels = predictions.select("prediction", "indexedLabel").rdd
    metrics = MulticlassMetrics(predictionAndLabels)
    print("Confusion Matrix:")
    for line in metrics.confusionMatrix().toArray():
        print(line)
    print("TPR: {:.3%} \tFPR: {:.3%}".format(
        metrics.truePositiveRate(1.0), metrics.falsePositiveRate(1.0)))
    print("TNR: {:.3%} \tFNR: {:.3%}".format(
        metrics.truePositiveRate(0.0), metrics.falsePositiveRate(0.0)))
    print("Precision: {:.3%} \tRecall: {:.3%} \tAccuracy: {:.3%}".format(
        metrics.precision(1.0), metrics.recall(1.0), metrics.accuracy))
    print(metrics.accuracy)
    print("{:*^100}".format(""))
def test_prfs():
    # TODO: revise so that it takes user inputs instead of hardcoded values
    """
    Test Precision, Recall, Fscore, and Support on multiclass classification data
    Input data: https://github.com/apache/spark/blob/master/data/mllib/sample_multiclass_classification_data.txt.
    """

    # load the schemas (if existed)

    # create a hdfs directory
    #os.system("hdfs dfs -mkdir datasets")

    # load the data file into the hdfs directory
    os.system("hdfs dfs -put sample_multiclass_classification_data.txt datasets/sample_multiclass_classification_data.txt")
    data = MLUtils.loadLibSVMFile(scsingleton.sc, "hdfs://localhost:9000/datasets/sample_multiclass_classification_data.txt")
    # print data.take(1)
    # ie. [LabeledPoint(1.0, (4,[0,1,2,3],[-0.222222,0.5,-0.762712,-0.833333]))]
    # [ ( finalClassification, (numLabels, [label0, label1, label2, ..., labelN], [prob0, prob1, prob2, ..., probN]) ) ]

    # split data into train (60%), test (40%)
    trainingRDD, testRDD = data.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    with Timer() as t:
        numTest = testRDD.count()
    print "testRDD.count(): %s seconds" % t.secs

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3)
    print "LogisticRegressionWithLBFGS.train(trainingRDD, numClasses=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredAndLabel = testRDD.map(lambda lp: (float(model.predict(lp.features)), lp.label))
    print "testPredAndLabel: %s seconds" % t.secs

    # calculate Precision, Recall, F1-score
    metrics = MulticlassMetrics(testPredAndLabel)
    print( "precision = %s" % metrics.precision() )
    print( "recall = %s" % metrics.recall() )
    print( "f1-score = %s" % metrics.fMeasure() )

    # statistics by class
    labels = data.map(lambda lp: lp.label).distinct().collect()
    for label in sorted(labels):
        print( "Class %s precision = %s" % (label, metrics.precision(label)) )
        print( "Class %s recall = %s" % (label, metrics.recall(label)) )
        print( "Class %s f1-score = %s" % (label, metrics.fMeasure(label, beta=1.0)) )

    # weighted stats
    print( "Weighted precision = %s" % metrics.weightedPrecision )
    print( "Weighted recall = %s" % metrics.weightedRecall )
    print( "Weighted f1-score = %s" % metrics.weightedFMeasure() )
    print( "Weighted f(0.5)-score = %s" % metrics.weightedFMeasure(beta=0.5) )
    print( "Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate )

    return