def label_evaluation(test_data, predicted_labels):
    """Score *predicted_labels* against the gold labels in *test_data*.

    Flattens the nested ``gold_labels`` of every test item into a single
    binary (0/1) list, runs the project ``Evaluator`` over gold vs.
    predicted, and writes the confusion matrix plus accuracy, precision,
    recall, and F1 to the module log.
    """
    # Binarize the doubly-nested gold labels in one pass: truthy -> 1, falsy -> 0.
    gold = [
        1 if value else 0
        for value in flatten([flatten(item["gold_labels"]) for item in test_data])
    ]

    evaluator = Evaluator()
    evaluator.compute_all(gold, predicted_labels)

    log.write("Confusion Matrix :")
    log.write(evaluator.confusion_matrix)
    log.write("Accuracy = %f" % evaluator.accuracy)
    log.write("Precision = %f" % evaluator.precision)
    log.write("Recall = %f" % evaluator.recall)
    log.write("F1 Score = %f" % evaluator.f1_score)