class MultiLabelMultiClassEvaluator(Evaluator): """ An evaluator for multiclass classification results, where an example can belong to one of several classes. For calculating averages over multiple classes, one of the classes, "neg"/1 is considered to be negative while the others are considered to be different types of positive instances. """ type = "multiclass" def __init__(self, examples, predictions=None, classSet=None): if type(classSet) == types.StringType: # class names are in file classSet = IdSet(filename=classSet) if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) self.classSet = classSet self.dataByClass = defaultdict(EvaluationData) #self.untypedUndirected = None self.untypedCurrentMajorId = None self.untypedPredictionQueue = [] self.untypedUndirected = EvaluationData() #self.AUC = None if predictions != None: self._calculate(examples, predictions) @classmethod def evaluate(cls, examples, predictions, classSet=None, outputFile=None): """ Enables using this class without having to manually instantiate it """ evaluator = cls(examples, predictions, classSet) print >> sys.stderr, evaluator.toStringConcise() if outputFile != None: evaluator.saveCSV(outputFile) return evaluator def compare(self, evaluation): if self.microF.fscore > evaluation.microF.fscore: return 1 elif self.microF.fscore == evaluation.microF.fscore: return 0 else: return -1 def getData(self): return self.microF def _queueUntypedUndirected(self, example, prediction): """ All examples within the same majorId (same sentence) are put in queue. Once major id (sentence) changes, these examples are processed. """ majorId, minorId = example[0].rsplit(".x", 1) if majorId != self.untypedCurrentMajorId: # new sentence self._processUntypedUndirectedQueue() # process queue self.untypedCurrentMajorId = majorId self.untypedPredictionQueue.append( (example, prediction) ) # queue example def _processUntypedUndirectedQueue(self): """ Determines the untyped undirected performance by merging example pairs. This statistic is only meaningful for examples representing directed edges where two consecutive examples are the two directed edges between a pair of nodes. """ prevExample = None prevPrediction = None for example, prediction in self.untypedPredictionQueue: majorId, minorId = example[0].rsplit(".x", 1) if prevExample != None and prevPrediction != None and int(minorId) % 2 != 0: # A positive example in either direction counts as a positive if example[1] != 1 or prevExample[1] != 1: # 1 is the multiclass "neg" class id trueClass = 1 # binary positive class else: trueClass = -1 # binary negative class # A positive prediction in either direction counts as a positive if prediction[0] != 1 or prevPrediction[0] != 1: predictedClass = 1 else: predictedClass = -1 self.untypedUndirected.addInstance(trueClass == 1, predictedClass == 1) prevExample = example prevPrediction = prediction self.untypedPredictionQueue = [] # clear the queue def _calculate(self, examples, predictions): """ The actual evaluation """ #self._calculateUntypedUndirected(examples, predictions) # First count instances self.microF = EvaluationData() self.binaryF = EvaluationData() #self.classifications = [] #assert(len(examples) == len(predictions)) #for i in range(len(examples)): falsePredictions = [] truePredictions = [] classes = set() for example, prediction in itertools.izip(examples, predictions): # self._queueUntypedUndirected(example, prediction) #example = examples[i] # examples and predictions are in matching lists #prediction = predictions[i] # examples and predictions are in matching lists trueClass = example[1] assert(trueClass > 0) # multiclass classification uses non-negative integers classes.add(trueClass) for i in range(2, len(prediction)): if prediction[i] < prediction[1]: continue predictedClass = i #print predictedClass assert(predictedClass > 0) # multiclass classification uses non-negative integers if predictedClass == trueClass: # correct classification # correctly classified for its class -> true positive for that class self.dataByClass[trueClass].addTP() if trueClass != 1: # a non-negative example -> correct = true positive self.microF.addTP() self.binaryF.addTP() else: # a negative example -> correct = true negative self.microF.addTN() self.binaryF.addTN() truePredictions.append(trueClass) else: # predictedClass != trueClass: # prediction was incorrect -> false positive for the predicted class self.dataByClass[predictedClass].addFP() if predictedClass == 1: # non-negative example, negative prediction -> incorrect = false negative self.microF.addFN() self.binaryF.addFN() else: # non-negative incorrect prediction -> false positive self.microF.addFP() if trueClass == 1: self.binaryF.addFP() else: self.binaryF.addTP() falsePredictions.append((trueClass, predictedClass)) # add negatives for other classes classes = sorted(list(classes)) for falsePrediction in falsePredictions: for cls in classes: if cls == falsePrediction[0]: # example not found -> false negative self.dataByClass[cls].addFN() elif cls != falsePrediction[1]: self.dataByClass[cls].addTN() for truePrediction in truePredictions: for cls in classes: if cls != truePrediction: self.dataByClass[cls].addTN() # Process remaining untyped undirected examples and calculate untyped undirected f-score # self._processUntypedUndirectedQueue() # self.untypedUndirected.calculateFScore() # Then calculate statistics for cls in self.dataByClass: #self.classes: self.dataByClass[cls].calculateFScore() self.microF.calculateFScore() self.binaryF.calculateFScore() # Finally calculate macro-f-score # macro-average is simply the unweighted average of per-class f-scores numClassesWithInstances = 0 self.macroF = EvaluationData() self.macroF.precision = 0.0 self.macroF.recall = 0.0 self.macroF.fscore = 0.0 for cls in classes: if (self.dataByClass[cls].getNumInstances() > 0 or self.dataByClass[cls].getFP() > 0) and cls != self.getNegativeClassId(): numClassesWithInstances += 1 self.macroF.precision += self.dataByClass[cls].precision self.macroF.recall += self.dataByClass[cls].recall if self.dataByClass[cls].fscore != "N/A": self.macroF.fscore += self.dataByClass[cls].fscore if numClassesWithInstances > 0: if self.macroF.precision != 0: self.macroF.precision /= float(numClassesWithInstances) if self.macroF.recall != 0: self.macroF.recall /= float(numClassesWithInstances) if self.macroF.fscore != 0: self.macroF.fscore /= float(numClassesWithInstances) def getNegativeClassId(self): negativeClassId = None if self.classSet != None: return self.classSet.getId("neg", False) else: classIds = sorted(self.dataByClass.keys()) if -1 in classIds: negativeClassId = -1 elif 1 in classIds: assert negativeClassId != -1 negativeClassId = 1 return negativeClassId def toStringConcise(self, indent="", title=None): """ Evaluation results in a human readable string format """ if title != None: string = indent + title + "\n" indent += " " string += indent else: string = indent negativeClassId = self.getNegativeClassId() if self.classSet != None: classNames = sorted(self.classSet.Ids.keys()) for className in classNames: if className != "neg": string += className string += " " + self.dataByClass[self.classSet.getId(className, False)].toStringConcise() + "\n" + indent else: classIds = sorted(self.dataByClass.keys()) for classId in classIds: if classId != negativeClassId: string += str(classId) string += " " + self.dataByClass[classId].toStringConcise() + "\n" + indent if negativeClassId != None: string += "(neg " + self.dataByClass[negativeClassId].toStringConcise() + ")\n" + indent string += "averages:\n" + indent # Micro results string += "micro " + self.microF.toStringConcise() + "\n" + indent # Macro results string += "macro " + self.macroF.prfToString() + "\n" + indent # Binary results string += "untyped " + self.binaryF.toStringConcise() # Untyped undirected results if self.untypedUndirected != None: string += "\n" + indent string += "untyped undirected " + self.untypedUndirected.toStringConcise() return string # def __addClassToCSV(self, csvWriter, cls): # values = [] # values.append( self.classSet.getName(cls) ) # values.append( self.truePositivesByClass[cls]+self.falseNegativesByClass[cls] ) # values.append( self.trueNegativesByClass[cls]+self.falsePositivesByClass[cls] ) # values.append(self.truePositivesByClass[cls]) # values.append(self.falsePositivesByClass[cls]) # values.append(self.trueNegativesByClass[cls]) # values.append(self.falseNegativesByClass[cls]) # if self.instancesByClass[cls] > 0 or self.falsePositivesByClass[cls] > 0: # values.append(self.precisionByClass[cls]) # values.append(self.recallByClass[cls]) # values.append(self.fScoreByClass[cls]) # else: # values.extend(["N/A","N/A","N/A"]) # csvWriter.writerow(values) # def toDict(self): """ Evaluation results in a computationally easy to process dictionary format """ dicts = [] if len(self.classes) > 0: assert(not ("1" in self.classSet.getNames() and "neg" in self.classSet.getNames())) negativeClassId = None for cls in self.classes: if cls != self.classSet.getId("neg", False) and cls != self.classSet.getId("1", False): values = self.dataByClass[cls].toDict() values["class"] = self.classSet.getName(cls) dicts.append(values) else: assert(negativeClassId == None) negativeClassId = cls if negativeClassId != None: values = self.dataByClass[negativeClassId].toDict() values["class"] = "neg" dicts.append(values) dicts.append( self.microF.toDict() ) dicts[-1]["class"] = "micro" dicts.append( self.macroF.toDict() ) dicts[-1]["class"] = "macro" dicts.append( self.binaryF.toDict() ) dicts[-1]["class"] = "untyped" if self.untypedUndirected != None: dicts.append(self.untypedUndirected.toDict()) dicts[-1]["class"] = "untyped undirected" return dicts
class AveragingMultiClassEvaluator(Evaluator): """ An evaluator for multiclass classification results, where an example can belong to one of several classes. For calculating averages over multiple classes, one of the classes, "neg"/1 is considered to be negative while the others are considered to be different types of positive instances. """ type = "multiclass" def __init__(self, examples, predictions=None, classSet=None): if type(classSet) == types.StringType: # class names are in file classSet = IdSet(filename=classSet) if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) self.classSet = classSet # define class ids in alphabetical order self.classSet = classSet if classSet != None: classNames = sorted(classSet.Ids.keys()) else: classNames = [] # make an ordered list of class ids self.classes = [] for className in classNames: self.classes.append(classSet.getId(className)) # create data structures for per-class evaluation self.dataByClass = {} for cls in self.classes: self.dataByClass[cls] = EvaluationData() # hack for unnamed classes if len(self.dataByClass) == 0: self.dataByClass[1] = EvaluationData() self.dataByClass[2] = EvaluationData() #self.untypedUndirected = None self.untypedCurrentMajorId = None self.untypedPredictionQueue = [] self.untypedUndirected = EvaluationData() #self.AUC = None if predictions != None: self._calculate(examples, predictions) @classmethod def evaluate(cls, examples, predictions, classSet=None, outputFile=None): """ Enables using this class without having to manually instantiate it """ evaluator = cls(examples, predictions, classSet) print >> sys.stderr, evaluator.toStringConcise() if outputFile != None: evaluator.saveCSV(outputFile) return evaluator def compare(self, evaluation): if self.microF.fscore > evaluation.microF.fscore: return 1 elif self.microF.fscore == evaluation.microF.fscore: return 0 else: return -1 def getData(self): return self.microF # def pool(evaluators): # predictions = [] # for evaluator in evaluators: # assert(isinstance(evaluator,AveragingMultiClassEvaluator)) # predictions.extend(evaluator.predictions) # return AveragingMultiClassEvaluator(predictions, evaluators[0].classSet) # pool = staticmethod(pool) # # def average(evaluators): # averageEvaluator = AveragingMultiClassEvaluator(None, None) # averageEvaluator.microPrecision = 0 # averageEvaluator.microRecall = 0 # averageEvaluator.microFScore = 0 # averageEvaluator.macroPrecision = 0 # averageEvaluator.macroRecall = 0 # averageEvaluator.macroFScore = 0 # averageEvaluator.truePositives = "-" # averageEvaluator.falsePositives = "-" # averageEvaluator.trueNegatives = "-" # averageEvaluator.falseNegatives = "-" # sumWeight = 0.0 # for evaluator in evaluators: # assert(isinstance(evaluator,AveragingMultiClassEvaluator)) # weight = float(len(evaluator.predictions)) # sumWeight += weight # averageEvaluator.macroPrecision += weight * evaluator.macroPrecision # averageEvaluator.macroRecall += weight * evaluator.macroRecall # averageEvaluator.macroFScore += weight * evaluator.macroFScore # averageEvaluator.microPrecision += weight * evaluator.microPrecision # averageEvaluator.microRecall += weight * evaluator.microRecall # averageEvaluator.microFScore += weight * evaluator.microFScore # averageEvaluator.macroPrecision /= sumWeight # averageEvaluator.macroRecall /= sumWeight # averageEvaluator.macroFScore /= sumWeight # averageEvaluator.microPrecision /= sumWeight # averageEvaluator.microRecall /= sumWeight # averageEvaluator.microFScore /= sumWeight # return averageEvaluator # average = staticmethod(average) def _queueUntypedUndirected(self, example, prediction): """ All examples within the same majorId (same sentence) are put in queue. Once major id (sentence) changes, these examples are processed. """ majorId, minorId = example[0].rsplit(".x", 1) if majorId != self.untypedCurrentMajorId: # new sentence self._processUntypedUndirectedQueue() # process queue self.untypedCurrentMajorId = majorId self.untypedPredictionQueue.append( (example, prediction)) # queue example def _processUntypedUndirectedQueue(self): """ Determines the untyped undirected performance by merging example pairs. This statistic is only meaningful for examples representing directed edges where two consecutive examples are the two directed edges between a pair of nodes. """ prevExample = None prevPrediction = None for example, prediction in self.untypedPredictionQueue: majorId, minorId = example[0].rsplit(".x", 1) if prevExample != None and prevPrediction != None and int( minorId) % 2 != 0: # A positive example in either direction counts as a positive if example[1] != 1 or prevExample[ 1] != 1: # 1 is the multiclass "neg" class id trueClass = 1 # binary positive class else: trueClass = -1 # binary negative class # A positive prediction in either direction counts as a positive if prediction[0] != 1 or prevPrediction[0] != 1: predictedClass = 1 else: predictedClass = -1 self.untypedUndirected.addInstance(trueClass == 1, predictedClass == 1) prevExample = example prevPrediction = prediction self.untypedPredictionQueue = [] # clear the queue # def _calculateUntypedUndirected(self, examples, predictions): # untypedUndirectedPredictions = [] # predictionsById = {} # for i in range(len(examples)): # id = examples[i][0] # if id != None and id != "": # majorId, minorId = id.rsplit(".x", 1) # if not predictionsById.has_key(majorId): # predictionsById[majorId] = {} # predictionsById[majorId][int(minorId)] = (examples[i], predictions[i]) # for majorId in sorted(predictionsById.keys()): # prevPrediction = None # for minorId in sorted(predictionsById[majorId]): # prediction = predictionsById[majorId][minorId] # if prevPrediction != None and minorId % 2 != 0: # if prediction[0][1] != 1 or prevPrediction[0][1] != 1: # trueClass = 1 # else: # trueClass = -1 # if prediction[1][0] != 1 or prevPrediction[1][0] != 1: # predictedClass = 1 # else: # predictedClass = -1 # untypedUndirectedPredictions.append( ((None,trueClass),predictedClass) ) # prevPrediction = prediction # if len(untypedUndirectedPredictions) > 0: # self.untypedUndirected = BinaryEvaluator(untypedUndirectedPredictions) def _calculate(self, examples, predictions): """ The actual evaluation """ #self._calculateUntypedUndirected(examples, predictions) # First count instances self.microF = EvaluationData() self.binaryF = EvaluationData() #self.classifications = [] #assert(len(examples) == len(predictions)) #for i in range(len(examples)): for example, prediction in itertools.izip(examples, predictions): # self._queueUntypedUndirected(example, prediction) #example = examples[i] # examples and predictions are in matching lists #prediction = predictions[i] # examples and predictions are in matching lists trueClass = example[1] assert (trueClass > 0 ) # multiclass classification uses non-negative integers predictedClass = prediction[0] #print predictedClass assert (predictedClass > 0 ) # multiclass classification uses non-negative integers if predictedClass == trueClass: # correct classification # correctly classified for its class -> true positive for that class self.dataByClass[trueClass].addTP() if trueClass != 1: # a non-negative example -> correct = true positive #self.classifications.append("tp") #self.classifications.append((prediction[0],"tp",self.type,prediction[1],prediction[3])) self.microF.addTP() self.binaryF.addTP() else: # a negative example -> correct = true negative #self.classifications.append((prediction[0],"tn",self.type,prediction[1],prediction[3])) #self.classifications.append("tn") self.microF.addTN() self.binaryF.addTN() for cls in self.classes: # this example was correctly classified for its class, # so it is also correctly classified for each class, # i.e. true negative for them if cls != trueClass: self.dataByClass[cls].addTN() else: # predictedClass != trueClass: # prediction was incorrect -> false positive for the predicted class self.dataByClass[predictedClass].addFP() if predictedClass == 1: # non-negative example, negative prediction -> incorrect = false negative #self.classifications.append("fn") #self.classifications.append((prediction[0],"fn",self.type,prediction[1],prediction[3])) self.microF.addFN() self.binaryF.addFN() else: # non-negative incorrect prediction -> false positive #self.classifications.append("fp") #self.classifications.append((prediction[0],"fp",self.type,prediction[1],prediction[3])) self.microF.addFP() if trueClass == 1: self.binaryF.addFP() else: self.binaryF.addTP() for cls in self.classes: if cls == trueClass: # example not found -> false negative self.dataByClass[cls].addFN() elif cls != predictedClass: self.dataByClass[cls].addTN() # Process remaining untyped undirected examples and calculate untyped undirected f-score # self._processUntypedUndirectedQueue() # self.untypedUndirected.calculateFScore() # Then calculate statistics for cls in self.classes: self.dataByClass[cls].calculateFScore() self.microF.calculateFScore() self.binaryF.calculateFScore() # Finally calculate macro-f-score # macro-average is simply the unweighted average of per-class f-scores numClassesWithInstances = 0 self.macroF = EvaluationData() self.macroF.precision = 0.0 self.macroF.recall = 0.0 self.macroF.fscore = 0.0 for cls in self.classes: if (self.dataByClass[cls].getNumInstances() > 0 or self.dataByClass[cls].getFP() > 0 ) and cls != self.classSet.getId("neg", False): numClassesWithInstances += 1 self.macroF.precision += self.dataByClass[cls].precision self.macroF.recall += self.dataByClass[cls].recall if self.dataByClass[cls].fscore != "N/A": self.macroF.fscore += self.dataByClass[cls].fscore if numClassesWithInstances > 0: if self.macroF.precision != 0: self.macroF.precision /= float(numClassesWithInstances) if self.macroF.recall != 0: self.macroF.recall /= float(numClassesWithInstances) if self.macroF.fscore != 0: self.macroF.fscore /= float(numClassesWithInstances) def toStringConcise(self, indent="", title=None): """ Evaluation results in a human readable string format """ if title != None: string = indent + title + "\n" indent += " " string += indent else: string = indent negativeClassId = None for cls in self.classes: if cls != self.classSet.getId("neg", False): string += self.classSet.getName(cls) string += " " + self.dataByClass[cls].toStringConcise( ) + "\n" + indent else: negativeClassId = cls if negativeClassId != None: cls = negativeClassId string += "(neg " + self.dataByClass[cls].toStringConcise( ) + ")\n" + indent string += "averages:\n" + indent # Micro results string += "micro " + self.microF.toStringConcise() + "\n" + indent # Macro results string += "macro " + self.macroF.prfToString() + "\n" + indent # Binary results string += "untyped " + self.binaryF.toStringConcise() # Untyped undirected results if self.untypedUndirected != None: string += "\n" + indent string += "untyped undirected " + self.untypedUndirected.toStringConcise( ) return string # def __addClassToCSV(self, csvWriter, cls): # values = [] # values.append( self.classSet.getName(cls) ) # values.append( self.truePositivesByClass[cls]+self.falseNegativesByClass[cls] ) # values.append( self.trueNegativesByClass[cls]+self.falsePositivesByClass[cls] ) # values.append(self.truePositivesByClass[cls]) # values.append(self.falsePositivesByClass[cls]) # values.append(self.trueNegativesByClass[cls]) # values.append(self.falseNegativesByClass[cls]) # if self.instancesByClass[cls] > 0 or self.falsePositivesByClass[cls] > 0: # values.append(self.precisionByClass[cls]) # values.append(self.recallByClass[cls]) # values.append(self.fScoreByClass[cls]) # else: # values.extend(["N/A","N/A","N/A"]) # csvWriter.writerow(values) # def toDict(self): """ Evaluation results in a computationally easy to process dictionary format """ dicts = [] if len(self.classes) > 0: assert (not ("1" in self.classSet.getNames() and "neg" in self.classSet.getNames())) negativeClassId = None for cls in self.classes: if cls != self.classSet.getId( "neg", False) and cls != self.classSet.getId("1", False): values = self.dataByClass[cls].toDict() values["class"] = self.classSet.getName(cls) dicts.append(values) else: assert (negativeClassId == None) negativeClassId = cls if negativeClassId != None: values = self.dataByClass[negativeClassId].toDict() values["class"] = "neg" dicts.append(values) dicts.append(self.microF.toDict()) dicts[-1]["class"] = "micro" dicts.append(self.macroF.toDict()) dicts[-1]["class"] = "macro" dicts.append(self.binaryF.toDict()) dicts[-1]["class"] = "untyped" if self.untypedUndirected != None: dicts.append(self.untypedUndirected.toDict()) dicts[-1]["class"] = "untyped undirected" return dicts
class MultiLabelMultiClassEvaluator(Evaluator): """ An evaluator for multiclass classification results, where an example can belong to one of several classes. For calculating averages over multiple classes, one of the classes, "neg"/1 is considered to be negative while the others are considered to be different types of positive instances. """ type = "multiclass" def __init__(self, examples, predictions=None, classSet=None): if type(classSet) == types.StringType: # class names are in file classSet = IdSet(filename=classSet) if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) self.classSet = classSet self.dataByClass = defaultdict(EvaluationData) #self.untypedUndirected = None self.untypedCurrentMajorId = None self.untypedPredictionQueue = [] self.untypedUndirected = EvaluationData() #self.AUC = None if predictions != None: self._calculate(examples, predictions) @classmethod def evaluate(cls, examples, predictions, classSet=None, outputFile=None): """ Enables using this class without having to manually instantiate it """ evaluator = cls(examples, predictions, classSet) print >> sys.stderr, evaluator.toStringConcise() if outputFile != None: evaluator.saveCSV(outputFile) return evaluator def compare(self, evaluation): if self.microF.fscore > evaluation.microF.fscore: return 1 elif self.microF.fscore == evaluation.microF.fscore: return 0 else: return -1 def getData(self): return self.microF def _queueUntypedUndirected(self, example, prediction): """ All examples within the same majorId (same sentence) are put in queue. Once major id (sentence) changes, these examples are processed. """ majorId, minorId = example[0].rsplit(".x", 1) if majorId != self.untypedCurrentMajorId: # new sentence self._processUntypedUndirectedQueue() # process queue self.untypedCurrentMajorId = majorId self.untypedPredictionQueue.append( (example, prediction)) # queue example def _processUntypedUndirectedQueue(self): """ Determines the untyped undirected performance by merging example pairs. This statistic is only meaningful for examples representing directed edges where two consecutive examples are the two directed edges between a pair of nodes. """ prevExample = None prevPrediction = None for example, prediction in self.untypedPredictionQueue: majorId, minorId = example[0].rsplit(".x", 1) if prevExample != None and prevPrediction != None and int( minorId) % 2 != 0: # A positive example in either direction counts as a positive if example[1] != 1 or prevExample[ 1] != 1: # 1 is the multiclass "neg" class id trueClass = 1 # binary positive class else: trueClass = -1 # binary negative class # A positive prediction in either direction counts as a positive if prediction[0] != 1 or prevPrediction[0] != 1: predictedClass = 1 else: predictedClass = -1 self.untypedUndirected.addInstance(trueClass == 1, predictedClass == 1) prevExample = example prevPrediction = prediction self.untypedPredictionQueue = [] # clear the queue def _calculate(self, examples, predictions): """ The actual evaluation """ #self._calculateUntypedUndirected(examples, predictions) # First count instances self.microF = EvaluationData() self.binaryF = EvaluationData() #self.classifications = [] #assert(len(examples) == len(predictions)) #for i in range(len(examples)): falsePredictions = [] truePredictions = [] classes = set() for example, prediction in itertools.izip(examples, predictions): # self._queueUntypedUndirected(example, prediction) #example = examples[i] # examples and predictions are in matching lists #prediction = predictions[i] # examples and predictions are in matching lists trueClass = example[1] assert (trueClass > 0 ) # multiclass classification uses non-negative integers classes.add(trueClass) for i in range(2, len(prediction)): if prediction[i] < prediction[1]: continue predictedClass = i #print predictedClass assert ( predictedClass > 0 ) # multiclass classification uses non-negative integers if predictedClass == trueClass: # correct classification # correctly classified for its class -> true positive for that class self.dataByClass[trueClass].addTP() if trueClass != 1: # a non-negative example -> correct = true positive self.microF.addTP() self.binaryF.addTP() else: # a negative example -> correct = true negative self.microF.addTN() self.binaryF.addTN() truePredictions.append(trueClass) else: # predictedClass != trueClass: # prediction was incorrect -> false positive for the predicted class self.dataByClass[predictedClass].addFP() if predictedClass == 1: # non-negative example, negative prediction -> incorrect = false negative self.microF.addFN() self.binaryF.addFN() else: # non-negative incorrect prediction -> false positive self.microF.addFP() if trueClass == 1: self.binaryF.addFP() else: self.binaryF.addTP() falsePredictions.append((trueClass, predictedClass)) # add negatives for other classes classes = sorted(list(classes)) for falsePrediction in falsePredictions: for cls in classes: if cls == falsePrediction[ 0]: # example not found -> false negative self.dataByClass[cls].addFN() elif cls != falsePrediction[1]: self.dataByClass[cls].addTN() for truePrediction in truePredictions: for cls in classes: if cls != truePrediction: self.dataByClass[cls].addTN() # Process remaining untyped undirected examples and calculate untyped undirected f-score # self._processUntypedUndirectedQueue() # self.untypedUndirected.calculateFScore() # Then calculate statistics for cls in self.dataByClass: #self.classes: self.dataByClass[cls].calculateFScore() self.microF.calculateFScore() self.binaryF.calculateFScore() # Finally calculate macro-f-score # macro-average is simply the unweighted average of per-class f-scores numClassesWithInstances = 0 self.macroF = EvaluationData() self.macroF.precision = 0.0 self.macroF.recall = 0.0 self.macroF.fscore = 0.0 for cls in classes: if (self.dataByClass[cls].getNumInstances() > 0 or self.dataByClass[cls].getFP() > 0 ) and cls != self.getNegativeClassId(): numClassesWithInstances += 1 self.macroF.precision += self.dataByClass[cls].precision self.macroF.recall += self.dataByClass[cls].recall if self.dataByClass[cls].fscore != "N/A": self.macroF.fscore += self.dataByClass[cls].fscore if numClassesWithInstances > 0: if self.macroF.precision != 0: self.macroF.precision /= float(numClassesWithInstances) if self.macroF.recall != 0: self.macroF.recall /= float(numClassesWithInstances) if self.macroF.fscore != 0: self.macroF.fscore /= float(numClassesWithInstances) def getNegativeClassId(self): negativeClassId = None if self.classSet != None: return self.classSet.getId("neg", False) else: classIds = sorted(self.dataByClass.keys()) if -1 in classIds: negativeClassId = -1 elif 1 in classIds: assert negativeClassId != -1 negativeClassId = 1 return negativeClassId def toStringConcise(self, indent="", title=None): """ Evaluation results in a human readable string format """ if title != None: string = indent + title + "\n" indent += " " string += indent else: string = indent negativeClassId = self.getNegativeClassId() if self.classSet != None: classNames = sorted(self.classSet.Ids.keys()) for className in classNames: if className != "neg": string += className string += " " + self.dataByClass[self.classSet.getId( className, False)].toStringConcise() + "\n" + indent else: classIds = sorted(self.dataByClass.keys()) for classId in classIds: if classId != negativeClassId: string += str(classId) string += " " + self.dataByClass[classId].toStringConcise( ) + "\n" + indent if negativeClassId != None: string += "(neg " + self.dataByClass[ negativeClassId].toStringConcise() + ")\n" + indent string += "averages:\n" + indent # Micro results string += "micro " + self.microF.toStringConcise() + "\n" + indent # Macro results string += "macro " + self.macroF.prfToString() + "\n" + indent # Binary results string += "untyped " + self.binaryF.toStringConcise() # Untyped undirected results if self.untypedUndirected != None: string += "\n" + indent string += "untyped undirected " + self.untypedUndirected.toStringConcise( ) return string # def __addClassToCSV(self, csvWriter, cls): # values = [] # values.append( self.classSet.getName(cls) ) # values.append( self.truePositivesByClass[cls]+self.falseNegativesByClass[cls] ) # values.append( self.trueNegativesByClass[cls]+self.falsePositivesByClass[cls] ) # values.append(self.truePositivesByClass[cls]) # values.append(self.falsePositivesByClass[cls]) # values.append(self.trueNegativesByClass[cls]) # values.append(self.falseNegativesByClass[cls]) # if self.instancesByClass[cls] > 0 or self.falsePositivesByClass[cls] > 0: # values.append(self.precisionByClass[cls]) # values.append(self.recallByClass[cls]) # values.append(self.fScoreByClass[cls]) # else: # values.extend(["N/A","N/A","N/A"]) # csvWriter.writerow(values) # def toDict(self): """ Evaluation results in a computationally easy to process dictionary format """ dicts = [] if len(self.classes) > 0: assert (not ("1" in self.classSet.getNames() and "neg" in self.classSet.getNames())) negativeClassId = None for cls in self.classes: if cls != self.classSet.getId( "neg", False) and cls != self.classSet.getId("1", False): values = self.dataByClass[cls].toDict() values["class"] = self.classSet.getName(cls) dicts.append(values) else: assert (negativeClassId == None) negativeClassId = cls if negativeClassId != None: values = self.dataByClass[negativeClassId].toDict() values["class"] = "neg" dicts.append(values) dicts.append(self.microF.toDict()) dicts[-1]["class"] = "micro" dicts.append(self.macroF.toDict()) dicts[-1]["class"] = "macro" dicts.append(self.binaryF.toDict()) dicts[-1]["class"] = "untyped" if self.untypedUndirected != None: dicts.append(self.untypedUndirected.toDict()) dicts[-1]["class"] = "untyped undirected" return dicts
class MultiLabelEvaluator(Evaluator): """ An evaluator for multiclass classification results, where an example can belong to one of several classes. For calculating averages over multiple classes, one of the classes, "neg"/1 is considered to be negative while the others are considered to be different types of positive instances. """ type = "multiclass" def __init__(self, examples, predictions=None, classSet=None): if type(classSet) == types.StringType: # class names are in file classSet = IdSet(filename=classSet) if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) self.classSet = classSet # define class ids in alphabetical order self.classSet = classSet if classSet != None: classNames = [] for className in sorted(classSet.Ids.keys()): if "---" not in className: classNames.append(className) else: classNames = [] # make an ordered list of class ids self.classes = [] for className in classNames: self.classes.append(classSet.getId(className)) # create data structures for per-class evaluation self.dataByClass = {} for cls in self.classes: self.dataByClass[cls] = EvaluationData() #self.untypedUndirected = None self.untypedCurrentMajorId = None self.untypedPredictionQueue = [] self.untypedUndirected = None # EvaluationData() #self.AUC = None if predictions != None: self._calculate(examples, predictions) self.thresholds = None @classmethod def evaluate(cls, examples, predictions, classSet=None, outputFile=None): """ Enables using this class without having to manually instantiate it """ evaluator = cls(examples, predictions, classSet) print >> sys.stderr, evaluator.toStringConcise() if outputFile != None: evaluator.saveCSV(outputFile) return evaluator def compare(self, evaluation): if self.microF.fscore > evaluation.microF.fscore: return 1 elif self.microF.fscore == evaluation.microF.fscore: return 0 else: return -1 def getData(self): return self.microF def _queueUntypedUndirected(self, example, prediction): """ All examples within the same majorId (same sentence) are put in queue. Once major id (sentence) changes, these examples are processed. """ majorId, minorId = example[0].rsplit(".x", 1) if majorId != self.untypedCurrentMajorId: # new sentence self._processUntypedUndirectedQueue() # process queue self.untypedCurrentMajorId = majorId self.untypedPredictionQueue.append( (example, prediction)) # queue example def _processUntypedUndirectedQueue(self): """ Determines the untyped undirected performance by merging example pairs. This statistic is only meaningful for examples representing directed edges where two consecutive examples are the two directed edges between a pair of nodes. """ prevExample = None prevPrediction = None for example, prediction in self.untypedPredictionQueue: majorId, minorId = example[0].rsplit(".x", 1) if prevExample != None and prevPrediction != None and int( minorId) % 2 != 0: # A positive example in either direction counts as a positive if example[1] != 1 or prevExample[ 1] != 1: # 1 is the multiclass "neg" class id trueClass = 1 # binary positive class else: trueClass = -1 # binary negative class # A positive prediction in either direction counts as a positive if prediction[0] != 1 or prevPrediction[0] != 1: predictedClass = 1 else: predictedClass = -1 self.untypedUndirected.addInstance(trueClass == 1, predictedClass == 1) prevExample = example prevPrediction = prediction self.untypedPredictionQueue = [] # clear the queue def determineThreshold(self, examples, predictions): if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) examplesByClass = {} for cls in self.classes: examplesByClass[cls] = [] # prepare examples for example, prediction in itertools.izip(examples, predictions): # Check true class for multilabel trueClass = example[1] trueClassName = self.classSet.getName(trueClass) assert (trueClass > 0 ) # multiclass classification uses non-negative integers if "---" in trueClassName: trueClass = set() for name in trueClassName.split("---"): trueClass.add(self.classSet.getId(name)) else: trueClass = [trueClass] # Check prediction for multilabel predictedClasses = prediction[0] if type(predictedClasses) == types.IntType: predictedClasses = [predictedClasses] for predType in predictedClasses: if predType != 1: exTrueClass = 1 if predType in trueClass: exTrueClass = 2 examplesByClass[predType].append( (prediction[predType], exTrueClass, 2)) # positives are negatives for other classes for cls in self.classes: if cls not in predictedClasses: exTrueClass = 1 if cls in trueClass: exTrueClass = 2 examplesByClass[cls].append( (prediction[cls], exTrueClass, 1)) # do the thresholding thresholdByClass = {} for cls in self.classes: if cls == 1: continue thresholdByClass[cls] = 0.0 examplesByClass[cls].sort() # Start with all below zero being negative, and all above it being what is predicted ev = EvaluationData() for example in examplesByClass[cls]: #print example if example[0] < 0.0: updateF(ev, example[1], 2, 1) # always negative else: updateF(ev, example[1], example[2], 1) # what is predicted count = 0 bestF = [self.dataByClass[cls].fscore, None, (0.0, None), None] for example in examplesByClass[cls]: if example[0] < 0.0: # Remove original example updateF(ev, example[1], 2, -1) # Add new example updateF(ev, example[1], example[2], 1) # Calculate F for this point else: # Remove original example updateF(ev, example[1], example[2], -1) # Add new example updateF(ev, example[1], 1, 1) # Calculate F for this point ev.calculateFScore() #print example, ev.toStringConcise() count += 1 #if self.classSet.getName(cls) == "Binding": # print count, example, ev.toStringConcise() if ev.fscore > bestF[0]: bestF = (ev.fscore, count, example, ev.toStringConcise()) self.dataByClass[cls] = copy.copy(ev) print >> sys.stderr, "Threshold", self.classSet.getName(cls), bestF if bestF[2][0] != 0.0: thresholdByClass[cls] = bestF[2][0] + 0.00000001 else: thresholdByClass[cls] = 0.0 #print thresholdByClass self.thresholds = thresholdByClass #self._calculate(examples, predictions, thresholdByClass) #print >> sys.stderr, "Optimal", self.toStringConcise() return thresholdByClass def _calculate(self, examples, predictions, thresholds=None): """ The actual evaluation """ for cls in self.classes: self.dataByClass[cls] = EvaluationData() #self._calculateUntypedUndirected(examples, predictions) # First count instances self.microF = EvaluationData() self.binaryF = EvaluationData() #self.classifications = [] #assert(len(examples) == len(predictions)) #for i in range(len(examples)): # Prepare offsets for thresholding self.thresholds = thresholds offsets = [None] + len(self.classSet.Ids) * [0.0] for cls in self.classSet.Ids.keys(): if thresholds != None and cls in thresholds: offsets[cls] = thresholds[cls] #print self.classes, offsets # Calculate results for example, prediction in itertools.izip(examples, predictions): #self._queueUntypedUndirected(example, prediction) # Check true class for multilabel trueClass = example[1] trueClassName = self.classSet.getName(trueClass) assert (trueClass > 0 ) # multiclass classification uses non-negative integers if "---" in trueClassName: trueClass = set() for name in trueClassName.split("---"): trueClass.add(self.classSet.getId(name)) else: trueClass = [trueClass] # Check prediction for multilabel predictedClasses = prediction[0] if type(predictedClasses) == types.IntType: predictedClasses = [predictedClasses] # Thresholding if thresholds != None: for i in range(2, len(prediction)): if prediction[i] != "N/A": if prediction[ i] < 0.0 and prediction[i] - offsets[i] > 0.0: if predictedClasses == [1]: predictedClasses = [] predictedClasses.append(i) elif prediction[ i] > 0.0 and prediction[i] - offsets[i] < 0.0: predictedClasses.remove(i) if len(predictedClasses) == 0: predictedClasses = [1] for predictedClass in predictedClasses: #print predictedClass assert ( predictedClass > 0 ) # multiclass classification uses non-negative integers if predictedClass in trueClass: # correct classification # correctly classified for its class -> true positive for that class self.dataByClass[predictedClass].addTP() if predictedClass != 1: # a non-negative example -> correct = true positive #self.classifications.append("tp") #self.classifications.append((prediction[0],"tp",self.type,prediction[1],prediction[3])) self.microF.addTP() self.binaryF.addTP() else: # a negative example -> correct = true negative #self.classifications.append((prediction[0],"tn",self.type,prediction[1],prediction[3])) #self.classifications.append("tn") self.microF.addTN() self.binaryF.addTN() for cls in self.classes: # this example was correctly classified for its class, # so it is also correctly classified for each class, # i.e. true negative for them if cls != predictedClass: if cls not in predictedClasses: self.dataByClass[cls].addTN() else: # predictedClass != trueClass: # prediction was incorrect -> false positive for the predicted class self.dataByClass[predictedClass].addFP() if predictedClass == 1: # non-negative example, negative prediction -> incorrect = false negative #self.classifications.append("fn") #self.classifications.append((prediction[0],"fn",self.type,prediction[1],prediction[3])) self.microF.addFN() self.binaryF.addFN() else: # non-negative incorrect prediction -> false positive #self.classifications.append("fp") #self.classifications.append((prediction[0],"fp",self.type,prediction[1],prediction[3])) self.microF.addFP() if 1 in trueClass: self.binaryF.addFP() else: self.binaryF.addTP() for cls in self.classes: if cls in trueClass: # example not found -> false negative if cls not in predictedClasses: self.dataByClass[cls].addFN() elif cls != predictedClass: self.dataByClass[cls].addTN() # Process remaining untyped undirected examples and calculate untyped undirected f-score #self._processUntypedUndirectedQueue() #self.untypedUndirected.calculateFScore() # Then calculate statistics for cls in self.classes: self.dataByClass[cls].calculateFScore() self.microF.calculateFScore() self.binaryF.calculateFScore() # Finally calculate macro-f-score # macro-average is simply the unweighted average of per-class f-scores numClassesWithInstances = 0 self.macroF = EvaluationData() self.macroF.precision = 0.0 self.macroF.recall = 0.0 self.macroF.fscore = 0.0 for cls in self.classes: if (self.dataByClass[cls].getNumInstances() > 0 or self.dataByClass[cls].getFP() > 0 ) and cls != self.classSet.getId("neg", False): numClassesWithInstances += 1 self.macroF.precision += self.dataByClass[cls].precision self.macroF.recall += self.dataByClass[cls].recall if self.dataByClass[cls].fscore != "N/A": self.macroF.fscore += self.dataByClass[cls].fscore if numClassesWithInstances > 0: if self.macroF.precision != 0: self.macroF.precision /= float(numClassesWithInstances) if self.macroF.recall != 0: self.macroF.recall /= float(numClassesWithInstances) if self.macroF.fscore != 0: self.macroF.fscore /= float(numClassesWithInstances) def toStringConcise(self, indent="", title=None): """ Evaluation results in a human readable string format """ if title != None: string = indent + title + "\n" indent += " " string += indent else: string = indent negativeClassId = None for cls in self.classes: if cls != self.classSet.getId("neg", False): tString = "" if self.thresholds != None and cls in self.thresholds: tString = " t:" + str(self.thresholds[cls]) string += self.classSet.getName(cls) string += " " + self.dataByClass[cls].toStringConcise( ) + tString + "\n" + indent else: negativeClassId = cls if negativeClassId != None: cls = negativeClassId string += "(neg " + self.dataByClass[cls].toStringConcise( ) + ")\n" + indent string += "averages:\n" + indent # Micro results string += "micro " + self.microF.toStringConcise() + "\n" + indent # Macro results string += "macro " + self.macroF.prfToString() + "\n" + indent # Binary results string += "untyped " + self.binaryF.toStringConcise() # Untyped undirected results if self.untypedUndirected != None: string += "\n" + indent string += "untyped undirected " + self.untypedUndirected.toStringConcise( ) return string # def __addClassToCSV(self, csvWriter, cls): # values = [] # values.append( self.classSet.getName(cls) ) # values.append( self.truePositivesByClass[cls]+self.falseNegativesByClass[cls] ) # values.append( self.trueNegativesByClass[cls]+self.falsePositivesByClass[cls] ) # values.append(self.truePositivesByClass[cls]) # values.append(self.falsePositivesByClass[cls]) # values.append(self.trueNegativesByClass[cls]) # values.append(self.falseNegativesByClass[cls]) # if self.instancesByClass[cls] > 0 or self.falsePositivesByClass[cls] > 0: # values.append(self.precisionByClass[cls]) # values.append(self.recallByClass[cls]) # values.append(self.fScoreByClass[cls]) # else: # values.extend(["N/A","N/A","N/A"]) # csvWriter.writerow(values) # def toDict(self): """ Evaluation results in a computationally easy to process dictionary format """ dicts = [] if len(self.classes) > 0: assert (not ("1" in self.classSet.getNames() and "neg" in self.classSet.getNames())) negativeClassId = None for cls in self.classes: if cls != self.classSet.getId( "neg", False) and cls != self.classSet.getId("1", False): values = self.dataByClass[cls].toDict() values["class"] = self.classSet.getName(cls) dicts.append(values) else: assert (negativeClassId == None) negativeClassId = cls if negativeClassId != None: values = self.dataByClass[negativeClassId].toDict() values["class"] = "neg" dicts.append(values) dicts.append(self.microF.toDict()) dicts[-1]["class"] = "micro" dicts.append(self.macroF.toDict()) dicts[-1]["class"] = "macro" dicts.append(self.binaryF.toDict()) dicts[-1]["class"] = "untyped" if self.untypedUndirected != None: dicts.append(self.untypedUndirected.toDict()) dicts[-1]["class"] = "untyped undirected" return dicts
class AveragingMultiClassEvaluator(Evaluator): """ An evaluator for multiclass classification results, where an example can belong to one of several classes. For calculating averages over multiple classes, one of the classes, "neg"/1 is considered to be negative while the others are considered to be different types of positive instances. """ type = "multiclass" def __init__(self, examples, predictions=None, classSet=None): if type(classSet) == types.StringType: # class names are in file classSet = IdSet(filename=classSet) if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) self.classSet = classSet # define class ids in alphabetical order self.classSet = classSet if classSet != None: classNames = sorted(classSet.Ids.keys()) else: classNames = [] # make an ordered list of class ids self.classes = [] for className in classNames: self.classes.append(classSet.getId(className)) # create data structures for per-class evaluation self.dataByClass = {} for cls in self.classes: self.dataByClass[cls] = EvaluationData() # hack for unnamed classes if len(self.dataByClass) == 0: self.dataByClass[1] = EvaluationData() self.dataByClass[2] = EvaluationData() #self.untypedUndirected = None self.untypedCurrentMajorId = None self.untypedPredictionQueue = [] self.untypedUndirected = EvaluationData() #self.AUC = None if predictions != None: self._calculate(examples, predictions) @classmethod def evaluate(cls, examples, predictions, classSet=None, outputFile=None, verbose=True): """ Enables using this class without having to manually instantiate it """ evaluator = cls(examples, predictions, classSet) if verbose: print >> sys.stderr, evaluator.toStringConcise() if outputFile != None: evaluator.saveCSV(outputFile) return evaluator def compare(self, evaluation): if self.microF.fscore > evaluation.microF.fscore: return 1 elif self.microF.fscore == evaluation.microF.fscore: return 0 else: return -1 def getData(self): return self.microF @classmethod def threshold(cls, examples, predictions): # Make negative confidence score / true class pairs if type(examples) in types.StringTypes: examples = ExampleUtils.readExamples(examples, False) if type(predictions) in types.StringTypes: predictions = ExampleUtils.loadPredictions(predictions) pairs = [] realPositives = 0 for example, prediction in itertools.izip(examples, predictions): trueClass = example[1] assert(trueClass > 0) # multiclass classification uses non-negative integers if trueClass > 1: realPositives += 1 negClassValue = prediction[1] pairs.append( (negClassValue, trueClass) ) pairs.sort(reverse=True) realNegatives = len(pairs) - realPositives # When starting thresholding, all examples are considered positive binaryF = EvaluationData() binaryF._tp = realPositives binaryF._fp = realNegatives binaryF._fn = 0 binaryF.calculateFScore() fscore = binaryF.fscore threshold = pairs[0][0]-1. # Turn one example negative at a time for pair in pairs: if pair[1] == 1: # the real class is negative binaryF._fp -= 1 # false positive -> true negative else: # the real class is a positive class binaryF._tp -= 1 # true positive -> ... binaryF._fn += 1 # ... false negative binaryF.calculateFScore() if binaryF.fscore > fscore: fscore = binaryF.fscore threshold = pair[0]+0.00000001 return threshold, fscore # def pool(evaluators): # predictions = [] # for evaluator in evaluators: # assert(isinstance(evaluator,AveragingMultiClassEvaluator)) # predictions.extend(evaluator.predictions) # return AveragingMultiClassEvaluator(predictions, evaluators[0].classSet) # pool = staticmethod(pool) # # def average(evaluators): # averageEvaluator = AveragingMultiClassEvaluator(None, None) # averageEvaluator.microPrecision = 0 # averageEvaluator.microRecall = 0 # averageEvaluator.microFScore = 0 # averageEvaluator.macroPrecision = 0 # averageEvaluator.macroRecall = 0 # averageEvaluator.macroFScore = 0 # averageEvaluator.truePositives = "-" # averageEvaluator.falsePositives = "-" # averageEvaluator.trueNegatives = "-" # averageEvaluator.falseNegatives = "-" # sumWeight = 0.0 # for evaluator in evaluators: # assert(isinstance(evaluator,AveragingMultiClassEvaluator)) # weight = float(len(evaluator.predictions)) # sumWeight += weight # averageEvaluator.macroPrecision += weight * evaluator.macroPrecision # averageEvaluator.macroRecall += weight * evaluator.macroRecall # averageEvaluator.macroFScore += weight * evaluator.macroFScore # averageEvaluator.microPrecision += weight * evaluator.microPrecision # averageEvaluator.microRecall += weight * evaluator.microRecall # averageEvaluator.microFScore += weight * evaluator.microFScore # averageEvaluator.macroPrecision /= sumWeight # averageEvaluator.macroRecall /= sumWeight # averageEvaluator.macroFScore /= sumWeight # averageEvaluator.microPrecision /= sumWeight # averageEvaluator.microRecall /= sumWeight # averageEvaluator.microFScore /= sumWeight # return averageEvaluator # average = staticmethod(average) def _queueUntypedUndirected(self, example, prediction): """ All examples within the same majorId (same sentence) are put in queue. Once major id (sentence) changes, these examples are processed. """ majorId, minorId = example[0].rsplit(".x", 1) if majorId != self.untypedCurrentMajorId: # new sentence self._processUntypedUndirectedQueue() # process queue self.untypedCurrentMajorId = majorId self.untypedPredictionQueue.append( (example, prediction) ) # queue example def _processUntypedUndirectedQueue(self): """ Determines the untyped undirected performance by merging example pairs. This statistic is only meaningful for examples representing directed edges where two consecutive examples are the two directed edges between a pair of nodes. """ prevExample = None prevPrediction = None for example, prediction in self.untypedPredictionQueue: majorId, minorId = example[0].rsplit(".x", 1) if prevExample != None and prevPrediction != None and int(minorId) % 2 != 0: # A positive example in either direction counts as a positive if example[1] != 1 or prevExample[1] != 1: # 1 is the multiclass "neg" class id trueClass = 1 # binary positive class else: trueClass = -1 # binary negative class # A positive prediction in either direction counts as a positive if prediction[0] != 1 or prevPrediction[0] != 1: predictedClass = 1 else: predictedClass = -1 self.untypedUndirected.addInstance(trueClass == 1, predictedClass == 1) prevExample = example prevPrediction = prediction self.untypedPredictionQueue = [] # clear the queue # def _calculateUntypedUndirected(self, examples, predictions): # untypedUndirectedPredictions = [] # predictionsById = {} # for i in range(len(examples)): # id = examples[i][0] # if id != None and id != "": # majorId, minorId = id.rsplit(".x", 1) # if not predictionsById.has_key(majorId): # predictionsById[majorId] = {} # predictionsById[majorId][int(minorId)] = (examples[i], predictions[i]) # for majorId in sorted(predictionsById.keys()): # prevPrediction = None # for minorId in sorted(predictionsById[majorId]): # prediction = predictionsById[majorId][minorId] # if prevPrediction != None and minorId % 2 != 0: # if prediction[0][1] != 1 or prevPrediction[0][1] != 1: # trueClass = 1 # else: # trueClass = -1 # if prediction[1][0] != 1 or prevPrediction[1][0] != 1: # predictedClass = 1 # else: # predictedClass = -1 # untypedUndirectedPredictions.append( ((None,trueClass),predictedClass) ) # prevPrediction = prediction # if len(untypedUndirectedPredictions) > 0: # self.untypedUndirected = BinaryEvaluator(untypedUndirectedPredictions) def _calculate(self, examples, predictions): """ The actual evaluation """ #self._calculateUntypedUndirected(examples, predictions) # First count instances self.microF = EvaluationData() self.binaryF = EvaluationData() #self.classifications = [] #assert(len(examples) == len(predictions)) #for i in range(len(examples)): for example, prediction in itertools.izip(examples, predictions): # self._queueUntypedUndirected(example, prediction) #example = examples[i] # examples and predictions are in matching lists #prediction = predictions[i] # examples and predictions are in matching lists trueClass = example[1] assert(trueClass > 0) # multiclass classification uses non-negative integers predictedClass = prediction[0] #print predictedClass assert(predictedClass > 0) # multiclass classification uses non-negative integers if predictedClass == trueClass: # correct classification # correctly classified for its class -> true positive for that class self.dataByClass[trueClass].addTP() if trueClass != 1: # a non-negative example -> correct = true positive #self.classifications.append("tp") #self.classifications.append((prediction[0],"tp",self.type,prediction[1],prediction[3])) self.microF.addTP() self.binaryF.addTP() else: # a negative example -> correct = true negative #self.classifications.append((prediction[0],"tn",self.type,prediction[1],prediction[3])) #self.classifications.append("tn") self.microF.addTN() self.binaryF.addTN() for cls in self.classes: # this example was correctly classified for its class, # so it is also correctly classified for each class, # i.e. true negative for them if cls != trueClass: self.dataByClass[cls].addTN() else: # predictedClass != trueClass: # prediction was incorrect -> false positive for the predicted class self.dataByClass[predictedClass].addFP() if predictedClass == 1: # non-negative example, negative prediction -> incorrect = false negative #self.classifications.append("fn") #self.classifications.append((prediction[0],"fn",self.type,prediction[1],prediction[3])) self.microF.addFN() self.binaryF.addFN() else: # non-negative incorrect prediction -> false positive #self.classifications.append("fp") #self.classifications.append((prediction[0],"fp",self.type,prediction[1],prediction[3])) self.microF.addFP() if trueClass == 1: self.binaryF.addFP() else: self.binaryF.addTP() for cls in self.classes: if cls == trueClass: # example not found -> false negative self.dataByClass[cls].addFN() elif cls != predictedClass: self.dataByClass[cls].addTN() # Process remaining untyped undirected examples and calculate untyped undirected f-score # self._processUntypedUndirectedQueue() # self.untypedUndirected.calculateFScore() # Then calculate statistics for cls in self.classes: self.dataByClass[cls].calculateFScore() self.microF.calculateFScore() self.binaryF.calculateFScore() # Finally calculate macro-f-score # macro-average is simply the unweighted average of per-class f-scores numClassesWithInstances = 0 self.macroF = EvaluationData() self.macroF.precision = 0.0 self.macroF.recall = 0.0 self.macroF.fscore = 0.0 for cls in self.classes: if (self.dataByClass[cls].getNumInstances() > 0 or self.dataByClass[cls].getFP() > 0) and cls != self.classSet.getId("neg", False): numClassesWithInstances += 1 self.macroF.precision += self.dataByClass[cls].precision self.macroF.recall += self.dataByClass[cls].recall if self.dataByClass[cls].fscore != "N/A": self.macroF.fscore += self.dataByClass[cls].fscore if numClassesWithInstances > 0: if self.macroF.precision != 0: self.macroF.precision /= float(numClassesWithInstances) if self.macroF.recall != 0: self.macroF.recall /= float(numClassesWithInstances) if self.macroF.fscore != 0: self.macroF.fscore /= float(numClassesWithInstances) def toStringConcise(self, indent="", title=None): """ Evaluation results in a human readable string format """ if title != None: string = indent + title + "\n" indent += " " string += indent else: string = indent negativeClassId = None for cls in self.classes: if cls != self.classSet.getId("neg", False): string += self.classSet.getName(cls) string += " " + self.dataByClass[cls].toStringConcise() + "\n" + indent else: negativeClassId = cls if negativeClassId != None: cls = negativeClassId string += "(neg " + self.dataByClass[cls].toStringConcise() + ")\n" + indent string += "averages:\n" + indent # Micro results string += "micro " + self.microF.toStringConcise() + "\n" + indent # Macro results string += "macro " + self.macroF.prfToString() + "\n" + indent # Binary results string += "untyped " + self.binaryF.toStringConcise() # Untyped undirected results if self.untypedUndirected != None: string += "\n" + indent string += "untyped undirected " + self.untypedUndirected.toStringConcise() return string # def __addClassToCSV(self, csvWriter, cls): # values = [] # values.append( self.classSet.getName(cls) ) # values.append( self.truePositivesByClass[cls]+self.falseNegativesByClass[cls] ) # values.append( self.trueNegativesByClass[cls]+self.falsePositivesByClass[cls] ) # values.append(self.truePositivesByClass[cls]) # values.append(self.falsePositivesByClass[cls]) # values.append(self.trueNegativesByClass[cls]) # values.append(self.falseNegativesByClass[cls]) # if self.instancesByClass[cls] > 0 or self.falsePositivesByClass[cls] > 0: # values.append(self.precisionByClass[cls]) # values.append(self.recallByClass[cls]) # values.append(self.fScoreByClass[cls]) # else: # values.extend(["N/A","N/A","N/A"]) # csvWriter.writerow(values) # def toDict(self): """ Evaluation results in a computationally easy to process dictionary format """ dicts = [] if len(self.classes) > 0: assert(not ("1" in self.classSet.getNames() and "neg" in self.classSet.getNames())) negativeClassId = None for cls in self.classes: if cls != self.classSet.getId("neg", False) and cls != self.classSet.getId("1", False): values = self.dataByClass[cls].toDict() values["class"] = self.classSet.getName(cls) dicts.append(values) else: assert(negativeClassId == None) negativeClassId = cls if negativeClassId != None: values = self.dataByClass[negativeClassId].toDict() values["class"] = "neg" dicts.append(values) dicts.append( self.microF.toDict() ) dicts[-1]["class"] = "micro" dicts.append( self.macroF.toDict() ) dicts[-1]["class"] = "macro" dicts.append( self.binaryF.toDict() ) dicts[-1]["class"] = "untyped" if self.untypedUndirected != None: dicts.append(self.untypedUndirected.toDict()) dicts[-1]["class"] = "untyped undirected" return dicts
class MultiLabelEvaluator(Evaluator): """ An evaluator for multiclass classification results, where an example can belong to one of several classes. For calculating averages over multiple classes, one of the classes, "neg"/1 is considered to be negative while the others are considered to be different types of positive instances. """ type = "multiclass" def __init__(self, examples, predictions=None, classSet=None): if type(classSet) == types.StringType: # class names are in file classSet = IdSet(filename=classSet) if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) self.classSet = classSet # define class ids in alphabetical order self.classSet = classSet if classSet != None: classNames = [] for className in sorted(classSet.Ids.keys()): if "---" not in className: classNames.append(className) else: classNames = [] # make an ordered list of class ids self.classes = [] for className in classNames: self.classes.append(classSet.getId(className)) # create data structures for per-class evaluation self.dataByClass = {} for cls in self.classes: self.dataByClass[cls] = EvaluationData() #self.untypedUndirected = None self.untypedCurrentMajorId = None self.untypedPredictionQueue = [] self.untypedUndirected = None # EvaluationData() #self.AUC = None if predictions != None: self._calculate(examples, predictions) self.thresholds = None @classmethod def evaluate(cls, examples, predictions, classSet=None, outputFile=None): """ Enables using this class without having to manually instantiate it """ evaluator = cls(examples, predictions, classSet) print >> sys.stderr, evaluator.toStringConcise() if outputFile != None: evaluator.saveCSV(outputFile) return evaluator def compare(self, evaluation): if self.microF.fscore > evaluation.microF.fscore: return 1 elif self.microF.fscore == evaluation.microF.fscore: return 0 else: return -1 def getData(self): return self.microF def _queueUntypedUndirected(self, example, prediction): """ All examples within the same majorId (same sentence) are put in queue. Once major id (sentence) changes, these examples are processed. """ majorId, minorId = example[0].rsplit(".x", 1) if majorId != self.untypedCurrentMajorId: # new sentence self._processUntypedUndirectedQueue() # process queue self.untypedCurrentMajorId = majorId self.untypedPredictionQueue.append( (example, prediction) ) # queue example def _processUntypedUndirectedQueue(self): """ Determines the untyped undirected performance by merging example pairs. This statistic is only meaningful for examples representing directed edges where two consecutive examples are the two directed edges between a pair of nodes. """ prevExample = None prevPrediction = None for example, prediction in self.untypedPredictionQueue: majorId, minorId = example[0].rsplit(".x", 1) if prevExample != None and prevPrediction != None and int(minorId) % 2 != 0: # A positive example in either direction counts as a positive if example[1] != 1 or prevExample[1] != 1: # 1 is the multiclass "neg" class id trueClass = 1 # binary positive class else: trueClass = -1 # binary negative class # A positive prediction in either direction counts as a positive if prediction[0] != 1 or prevPrediction[0] != 1: predictedClass = 1 else: predictedClass = -1 self.untypedUndirected.addInstance(trueClass == 1, predictedClass == 1) prevExample = example prevPrediction = prediction self.untypedPredictionQueue = [] # clear the queue def determineThreshold(self, examples, predictions): if type(predictions) == types.StringType: # predictions are in file predictions = ExampleUtils.loadPredictions(predictions) if type(examples) == types.StringType: # examples are in file examples = ExampleUtils.readExamples(examples, False) examplesByClass = {} for cls in self.classes: examplesByClass[cls] = [] # prepare examples for example, prediction in itertools.izip(examples, predictions): # Check true class for multilabel trueClass = example[1] trueClassName = self.classSet.getName(trueClass) assert(trueClass > 0) # multiclass classification uses non-negative integers if "---" in trueClassName: trueClass = set() for name in trueClassName.split("---"): trueClass.add(self.classSet.getId(name)) else: trueClass = [trueClass] # Check prediction for multilabel predictedClasses = prediction[0] if type(predictedClasses) == types.IntType: predictedClasses = [predictedClasses] for predType in predictedClasses: if predType != 1: exTrueClass = 1 if predType in trueClass: exTrueClass = 2 examplesByClass[predType].append( (prediction[predType], exTrueClass, 2) ) # positives are negatives for other classes for cls in self.classes: if cls not in predictedClasses: exTrueClass = 1 if cls in trueClass: exTrueClass = 2 examplesByClass[cls].append( (prediction[cls], exTrueClass, 1) ) # do the thresholding thresholdByClass = {} for cls in self.classes: if cls == 1: continue thresholdByClass[cls] = 0.0 examplesByClass[cls].sort() # Start with all below zero being negative, and all above it being what is predicted ev = EvaluationData() for example in examplesByClass[cls]: #print example if example[0] < 0.0: updateF(ev, example[1], 2, 1) # always negative else: updateF(ev, example[1], example[2], 1) # what is predicted count = 0 bestF = [self.dataByClass[cls].fscore, None, (0.0, None), None] for example in examplesByClass[cls]: if example[0] < 0.0: # Remove original example updateF(ev, example[1], 2, -1) # Add new example updateF(ev, example[1], example[2], 1) # Calculate F for this point else: # Remove original example updateF(ev, example[1], example[2], -1) # Add new example updateF(ev, example[1], 1, 1) # Calculate F for this point ev.calculateFScore() #print example, ev.toStringConcise() count += 1 #if self.classSet.getName(cls) == "Binding": # print count, example, ev.toStringConcise() if ev.fscore > bestF[0]: bestF = (ev.fscore, count, example, ev.toStringConcise()) self.dataByClass[cls] = copy.copy(ev) print >> sys.stderr, "Threshold", self.classSet.getName(cls), bestF if bestF[2][0] != 0.0: thresholdByClass[cls] = bestF[2][0] + 0.00000001 else: thresholdByClass[cls] = 0.0 #print thresholdByClass self.thresholds = thresholdByClass #self._calculate(examples, predictions, thresholdByClass) #print >> sys.stderr, "Optimal", self.toStringConcise() return thresholdByClass def _calculate(self, examples, predictions, thresholds=None): """ The actual evaluation """ for cls in self.classes: self.dataByClass[cls] = EvaluationData() #self._calculateUntypedUndirected(examples, predictions) # First count instances self.microF = EvaluationData() self.binaryF = EvaluationData() #self.classifications = [] #assert(len(examples) == len(predictions)) #for i in range(len(examples)): # Prepare offsets for thresholding self.thresholds = thresholds offsets = [None] + len(self.classSet.Ids) * [0.0] for cls in self.classSet.Ids.keys(): if thresholds != None and cls in thresholds: offsets[cls] = thresholds[cls] #print self.classes, offsets # Calculate results for example, prediction in itertools.izip(examples, predictions): #self._queueUntypedUndirected(example, prediction) # Check true class for multilabel trueClass = example[1] trueClassName = self.classSet.getName(trueClass) assert(trueClass > 0) # multiclass classification uses non-negative integers if "---" in trueClassName: trueClass = set() for name in trueClassName.split("---"): trueClass.add(self.classSet.getId(name)) else: trueClass = [trueClass] # Check prediction for multilabel predictedClasses = prediction[0] if type(predictedClasses) == types.IntType: predictedClasses = [predictedClasses] # Thresholding if thresholds != None: for i in range(2, len(prediction)): if prediction[i] != "N/A": if prediction[i] < 0.0 and prediction[i] - offsets[i] > 0.0: if predictedClasses == [1]: predictedClasses = [] predictedClasses.append(i) elif prediction[i] > 0.0 and prediction[i] - offsets[i] < 0.0: predictedClasses.remove(i) if len(predictedClasses) == 0: predictedClasses = [1] for predictedClass in predictedClasses: #print predictedClass assert(predictedClass > 0) # multiclass classification uses non-negative integers if predictedClass in trueClass: # correct classification # correctly classified for its class -> true positive for that class self.dataByClass[predictedClass].addTP() if predictedClass != 1: # a non-negative example -> correct = true positive #self.classifications.append("tp") #self.classifications.append((prediction[0],"tp",self.type,prediction[1],prediction[3])) self.microF.addTP() self.binaryF.addTP() else: # a negative example -> correct = true negative #self.classifications.append((prediction[0],"tn",self.type,prediction[1],prediction[3])) #self.classifications.append("tn") self.microF.addTN() self.binaryF.addTN() for cls in self.classes: # this example was correctly classified for its class, # so it is also correctly classified for each class, # i.e. true negative for them if cls != predictedClass: if cls not in predictedClasses: self.dataByClass[cls].addTN() else: # predictedClass != trueClass: # prediction was incorrect -> false positive for the predicted class self.dataByClass[predictedClass].addFP() if predictedClass == 1: # non-negative example, negative prediction -> incorrect = false negative #self.classifications.append("fn") #self.classifications.append((prediction[0],"fn",self.type,prediction[1],prediction[3])) self.microF.addFN() self.binaryF.addFN() else: # non-negative incorrect prediction -> false positive #self.classifications.append("fp") #self.classifications.append((prediction[0],"fp",self.type,prediction[1],prediction[3])) self.microF.addFP() if 1 in trueClass: self.binaryF.addFP() else: self.binaryF.addTP() for cls in self.classes: if cls in trueClass: # example not found -> false negative if cls not in predictedClasses: self.dataByClass[cls].addFN() elif cls != predictedClass: self.dataByClass[cls].addTN() # Process remaining untyped undirected examples and calculate untyped undirected f-score #self._processUntypedUndirectedQueue() #self.untypedUndirected.calculateFScore() # Then calculate statistics for cls in self.classes: self.dataByClass[cls].calculateFScore() self.microF.calculateFScore() self.binaryF.calculateFScore() # Finally calculate macro-f-score # macro-average is simply the unweighted average of per-class f-scores numClassesWithInstances = 0 self.macroF = EvaluationData() self.macroF.precision = 0.0 self.macroF.recall = 0.0 self.macroF.fscore = 0.0 for cls in self.classes: if (self.dataByClass[cls].getNumInstances() > 0 or self.dataByClass[cls].getFP() > 0) and cls != self.classSet.getId("neg", False): numClassesWithInstances += 1 self.macroF.precision += self.dataByClass[cls].precision self.macroF.recall += self.dataByClass[cls].recall if self.dataByClass[cls].fscore != "N/A": self.macroF.fscore += self.dataByClass[cls].fscore if numClassesWithInstances > 0: if self.macroF.precision != 0: self.macroF.precision /= float(numClassesWithInstances) if self.macroF.recall != 0: self.macroF.recall /= float(numClassesWithInstances) if self.macroF.fscore != 0: self.macroF.fscore /= float(numClassesWithInstances) def toStringConcise(self, indent="", title=None): """ Evaluation results in a human readable string format """ if title != None: string = indent + title + "\n" indent += " " string += indent else: string = indent negativeClassId = None for cls in self.classes: if cls != self.classSet.getId("neg", False): tString = "" if self.thresholds != None and cls in self.thresholds: tString = " t:" + str(self.thresholds[cls]) string += self.classSet.getName(cls) string += " " + self.dataByClass[cls].toStringConcise() + tString + "\n" + indent else: negativeClassId = cls if negativeClassId != None: cls = negativeClassId string += "(neg " + self.dataByClass[cls].toStringConcise() + ")\n" + indent string += "averages:\n" + indent # Micro results string += "micro " + self.microF.toStringConcise() + "\n" + indent # Macro results string += "macro " + self.macroF.prfToString() + "\n" + indent # Binary results string += "untyped " + self.binaryF.toStringConcise() # Untyped undirected results if self.untypedUndirected != None: string += "\n" + indent string += "untyped undirected " + self.untypedUndirected.toStringConcise() return string # def __addClassToCSV(self, csvWriter, cls): # values = [] # values.append( self.classSet.getName(cls) ) # values.append( self.truePositivesByClass[cls]+self.falseNegativesByClass[cls] ) # values.append( self.trueNegativesByClass[cls]+self.falsePositivesByClass[cls] ) # values.append(self.truePositivesByClass[cls]) # values.append(self.falsePositivesByClass[cls]) # values.append(self.trueNegativesByClass[cls]) # values.append(self.falseNegativesByClass[cls]) # if self.instancesByClass[cls] > 0 or self.falsePositivesByClass[cls] > 0: # values.append(self.precisionByClass[cls]) # values.append(self.recallByClass[cls]) # values.append(self.fScoreByClass[cls]) # else: # values.extend(["N/A","N/A","N/A"]) # csvWriter.writerow(values) # def toDict(self): """ Evaluation results in a computationally easy to process dictionary format """ dicts = [] if len(self.classes) > 0: assert(not ("1" in self.classSet.getNames() and "neg" in self.classSet.getNames())) negativeClassId = None for cls in self.classes: if cls != self.classSet.getId("neg", False) and cls != self.classSet.getId("1", False): values = self.dataByClass[cls].toDict() values["class"] = self.classSet.getName(cls) dicts.append(values) else: assert(negativeClassId == None) negativeClassId = cls if negativeClassId != None: values = self.dataByClass[negativeClassId].toDict() values["class"] = "neg" dicts.append(values) dicts.append( self.microF.toDict() ) dicts[-1]["class"] = "micro" dicts.append( self.macroF.toDict() ) dicts[-1]["class"] = "macro" dicts.append( self.binaryF.toDict() ) dicts[-1]["class"] = "untyped" if self.untypedUndirected != None: dicts.append(self.untypedUndirected.toDict()) dicts[-1]["class"] = "untyped undirected" return dicts