    def __call__(self, doc):
        """
        Overlays gold entity annotations onto the tokens of a Doc object. Requires that the Doc have the custom
        'gold_annotation_file' and 'file_name' extensions set.
        :param doc: a spaCy Doc object.
        :return: the same Doc object, but it now has 'gold_label' annotations.
        """

        if hasattr(doc._, 'file_name'):
            logging.debug("%s: Called GoldAnnotator Component", doc._.file_name)

        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            # print document tokenization
            for token in doc:
                logging.debug(str(token))

        # check if gold annotation file path has been set.
        if not hasattr(doc._, 'gold_annotation_file'):
            logging.warning("No extension doc._.gold_annotation_file is present; it will not be possible to fit a model with this Doc")
            return doc

        gold_annotations = Annotations(doc._.gold_annotation_file)

        for e_label, e_start, e_end, _ in gold_annotations.get_entity_annotations():
            if e_start > e_end:
                logging.critical("%s: Broken annotation - start is greater than end: (%i,%i,%s)",
                                 doc._.file_name, e_start, e_end, e_label)
                continue
            span = doc.char_span(e_start, e_end)

            if span is None:
                self.failed_overlay_count += 1
                self.failed_identifying_span_count += 1
                logging.warning("%s: Number of failed annotation overlays with current tokenizer: %i (%i,%i,%s)",
                                doc._.file_name, self.failed_overlay_count, e_start, e_end, e_label)

            fixed_span = self.find_span(e_start, e_end, doc)
            if fixed_span is not None:
                if span is None:
                    logging.warning("%s: Fixed span (%i,%i,%s) into: %s",
                                    doc._.file_name, e_start, e_end, e_label, fixed_span.text)
                    self.failed_identifying_span_count -= 1
                for token in fixed_span:
                    if e_label in self.labels or not self.labels:
                        token._.set('gold_label', e_label)

            else:  # the annotation could not be fixed and will be ignored; this degrades evaluation.
                logging.warning("%s: Could not fix annotation: (%i,%i,%s)", doc._.file_name, e_start, e_end, e_label)
                logging.warning("%s: Total Failed Annotations: %i", doc._.file_name, self.failed_identifying_span_count)

        if self.failed_overlay_count > .3 * len(gold_annotations):
            logging.warning("%s: Annotations may mis-aligned as more than 30 percent failed to overlay: %s",
                            doc._.file_name, doc._.gold_annotation_file)

        return doc
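
A minimal usage sketch for this component, for context: the extension names ('file_name', 'gold_annotation_file', 'gold_label') come from the code above, but how GoldAnnotator is constructed and added to a pipeline is assumed here rather than taken from the library.

import spacy
from spacy.tokens import Doc, Token

# Register the custom extensions the component reads and writes (default values are assumptions).
Doc.set_extension('file_name', default=None, force=True)
Doc.set_extension('gold_annotation_file', default=None, force=True)
Token.set_extension('gold_label', default='O', force=True)

nlp = spacy.blank('en')
doc = nlp("Patient was given 50 mg of ibuprofen daily.")
doc._.file_name = 'example.txt'             # hypothetical file name
doc._.gold_annotation_file = 'example.ann'  # hypothetical annotation path

annotator = GoldAnnotator()  # constructor arguments depend on the class definition (not shown here)
doc = annotator(doc)         # tokens covered by a gold span now carry token._.gold_label
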
Example #2
    def test_confusion_matrix(self):
        ann_1 = Annotations(self.ann_path_1)
        ann_2 = Annotations(self.ann_path_2)
        ann_1.add_entity(*ann_2.get_entity_annotations()[0])
        self.assertEqual(len(ann_1.compute_confusion_matrix(ann_2, self.entities)[0]), len(self.entities))
        self.assertEqual(len(ann_1.compute_confusion_matrix(ann_2, self.entities)), len(self.entities))
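
The test above implies that compute_confusion_matrix(other, entities) returns a square matrix with one row and one column per entity label. A small sketch of inspecting such a matrix, with placeholder file paths and labels, and with the import path and row/column orientation treated as assumptions:

from medacy.data.annotations import Annotations  # import path is an assumption

entities = ['Drug', 'Dose', 'Reason']            # placeholder label set
gold = Annotations('gold_example.ann')           # hypothetical .ann files
pred = Annotations('predicted_example.ann')

matrix = gold.compute_confusion_matrix(pred, entities)
for label, row in zip(entities, matrix):
    # each row is a list of counts, one per label; which axis is gold vs. predicted is assumed
    print(label, row)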