Example #1
0
 def test_confusion_matrix(self):
     """The confusion matrix must be square: one row and one column per entity.

     The original computed the (potentially expensive) matrix twice, once per
     assertion; compute it once and assert on the bound result instead.
     """
     gold = Annotations(self.ann_path_1)
     predicted = Annotations(self.ann_path_2)
     # copy one predicted entity into the gold annotations so the two overlap
     gold.add_entity(*predicted.annotations[0])
     matrix = gold.compute_confusion_matrix(predicted, self.entities)
     # square matrix: len(matrix) rows, each of len(self.entities) columns
     self.assertEqual(len(matrix[0]), len(self.entities))
     self.assertEqual(len(matrix), len(self.entities))
Example #2
0
    def compute_confusion_matrix(self, other, leniency=0):
        """
        Generates a confusion matrix where this Dataset serves as the gold standard annotations and `other` serves
        as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction
        directory outputted by a model and then passing it into this method.

        :param other: a Dataset object containing a predicted version of this dataset.
        :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
        :return: two element tuple containing a label array (of entity names) and a matrix where rows are gold labels and columns are predicted labels. matrix[i][j] indicates that entities[i] in this dataset was predicted as entities[j] in 'annotation' matrix[i][j] times
        :raises ValueError: if `other` is not a Dataset or is missing files present in this dataset.
        """
        if not isinstance(other, Dataset):
            raise ValueError("other must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in other}
        if diff:
            raise ValueError(
                f"Dataset of predictions is missing the files: {repr(diff)}")

        # sort entities in ascending order by count.
        entities = [
            key for key, _ in sorted(self.compute_counts().items(),
                                     key=lambda x: x[1])
        ]
        # BUG FIX: the original `[[0 * len(entities)] * len(entities)]` built a
        # single-row (1 x N) matrix, since `0 * len(entities)` is just 0 — so
        # only row 0 was ever accumulated. Build a true N x N matrix; the
        # comprehension creates an independent list per row (plain `*` would
        # alias one shared row object).
        confusion_matrix = [[0] * len(entities) for _ in range(len(entities))]

        for gold_data_file in self:
            # find the prediction file matching this gold file; raises
            # StopIteration if absent, as the original manual scan did
            prediction_data_file = next(
                d for d in other if str(d) == str(gold_data_file))

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level, then accumulate it
            # into the dataset-level matrix (fixed to cover all N rows/cols,
            # not just len(confusion_matrix) == 1 of the buggy original)
            ann_confusion_matrix = gold_annotation.compute_confusion_matrix(
                pred_annotation, entities, leniency=leniency)
            for i in range(len(entities)):
                for j in range(len(entities)):
                    confusion_matrix[i][j] += ann_confusion_matrix[i][j]

        return entities, confusion_matrix