Example #1
0
 def test_ann_conversions(self):
     """Round-trips an Annotations object through an ANN file and checks the entities are unchanged."""
     intermediary_path = join(self.test_dir, "intermediary.ann")

     original = Annotations(self.ann_file_path_one, annotation_type='ann')
     original.to_ann(write_location=intermediary_path)
     reloaded = Annotations(intermediary_path, annotation_type='ann')

     entities_before = original.get_entity_annotations(return_dictionary=True)
     entities_after = reloaded.get_entity_annotations(return_dictionary=True)
     self.assertEqual(entities_before, entities_after)
Example #2
0
 def test_intersection(self):
     """Copies two entities from one annotation set into another and expects exactly those to intersect."""
     data_dir = self.dataset.get_data_directory()
     annotations1 = Annotations(join(data_dir, self.ann_files[0]), annotation_type='ann')
     annotations2 = Annotations(join(data_dir, self.ann_files[1]), annotation_type='ann')

     # The first two entities of the second file are the ones shared with the first.
     other_entities = annotations2.get_entity_annotations()
     first_shared = other_entities[0]
     second_shared = other_entities[1]
     annotations1.add_entity(*first_shared)
     annotations1.add_entity(*second_shared)

     common = annotations1.intersection(annotations2)
     expected = {first_shared, second_shared}
     self.assertEqual(common, expected)
Example #3
0
 def test_get_entity_annotations_dict(self):
     """
     Checks that entity annotations requested as a dictionary come back as a dict
     :return:
     """
     ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
     annotations = Annotations(ann_path, annotation_type='ann')
     entities = annotations.get_entity_annotations(return_dictionary=True)
     self.assertIsInstance(entities, dict)
Example #4
0
 def test_init_from_ann_file(self):
     """
     Checks that an Annotations object built from an ann file exposes entity annotations
     :return:
     """
     ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
     annotations = Annotations(ann_path, annotation_type='ann')
     entities = annotations.get_entity_annotations()
     self.assertIsNotNone(entities)
Example #5
0
    def test_confusion_matrix(self):
        """Tests that the confusion matrix has one row and one column per entity in ``self.entities``."""
        data_dir = self.dataset.get_data_directory()
        annotations1 = Annotations(join(data_dir, self.ann_files[0]), annotation_type='ann')
        annotations2 = Annotations(join(data_dir, self.ann_files[1]), annotation_type='ann')
        annotations1.add_entity(*annotations2.get_entity_annotations()[0])

        # Build the matrix once and reuse it for both dimension checks.
        matrix = annotations1.compute_confusion_matrix(annotations2, self.entities)
        expected_size = len(self.entities)

        # Check the row count before touching any row, so an empty matrix is reported
        # as an assertion failure instead of an IndexError.
        self.assertEqual(len(matrix), expected_size)
        # Every row must have one column per entity, not only the first one.
        for row in matrix:
            self.assertEqual(len(row), expected_size)
Example #6
0
 def test_get_entity_annotations_list(self):
     """
     Checks that entity annotations are returned as a list by default
     :return:
     """
     ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
     annotations = Annotations(ann_path, annotation_type='ann')
     entities = annotations.get_entity_annotations()
     self.assertIsInstance(entities, list)
Example #7
0
    def test_good_con_data(self):
        """Tests to see if valid con data can be used to instantiate an Annotations object."""
        con_path = join(self.test_dir, "test_con.con")
        text_path = join(self.test_dir, "test_con_text.txt")

        # Write and close both fixtures before they are parsed. Annotations is given the
        # paths, so it reads from disk; while the writing handles are still open the
        # buffered content may not have been flushed, and empty files would be parsed.
        with open(con_path, 'w') as c, open(text_path, 'w') as t:
            c.write(con_text)
            t.write(con_source_text)

        annotations = Annotations(con_path, annotation_type='con', source_text_path=text_path)
        self.assertIsInstance(annotations.get_entity_annotations(), list)
Example #8
0
 def test_compute_ambiguity(self):
     """
     Loads the same ann file twice, adds a differently-labelled entity over the first
     entity's span in the second copy, and expects exactly one ambiguity to be reported.
     """
     ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
     annotations1 = Annotations(ann_path, annotation_type='ann')
     annotations2 = Annotations(ann_path, annotation_type='ann')

     # Reuse the span and text of the first entity, but under a different label.
     first_entity = annotations2.get_entity_annotations()[0]
     label, start, end, text = first_entity
     annotations2.add_entity('incorrect_label', start, end, text)

     ambiguities = annotations1.compute_ambiguity(annotations2)
     self.assertEqual(len(ambiguities), 1)
Example #9
0
    def get_training_data(self, data_format='spacy'):
        """
        Get training data in a specified format.

        :param data_format: The specified format as a string. Only ``'spacy'`` is supported.

        :return: A list with one entry per file in ``self.all_data_files``; each entry is
            whatever ``Annotations.get_entity_annotations(format='spacy')`` returns for that file.

        :raises TypeError: If ``data_format`` is anything other than ``'spacy'``.
        """
        # Only spaCy format is currently supported.
        if data_format != 'spacy':
            # Interpolate the requested format (not the ``format`` builtin) so the
            # message names what the caller actually asked for.
            raise TypeError("Format %s not supported" % (data_format,))

        training_data = []

        # Add each entry in dataset with annotation to train_data
        for data_file in self.all_data_files:
            txt_path = data_file.get_text_path()
            ann_path = data_file.get_annotation_path()
            annotations = Annotations(ann_path, source_text_path=txt_path)
            training_data.append(
                annotations.get_entity_annotations(format='spacy'))

        return training_data
    def __call__(self, doc):
        """
        Overlay gold entity labels from an annotation file onto the tokens of ``doc``.

        The annotation file named by ``doc._.gold_annotation_file`` is loaded as an 'ann'
        Annotations object. For each entity, ``self.find_span`` is asked for a span; when it
        returns one, every token in that span gets its ``gold_label`` extension set to the
        entity label (only if the label is in ``self.labels``, or ``self.labels`` is empty).
        Failures are counted on ``self.failed_overlay_count`` and
        ``self.failed_identifying_span_count`` and reported through ``logging``.

        :param doc: The document to label; must expose ``doc._.gold_annotation_file``.
            ``doc._.file_name`` is optional and is used only to prefix log messages.
        :return: The same ``doc`` that was passed in.
        :raises ValueError: If ``doc._.gold_annotation_file`` is not present.
        """
        # file_name is optional (it is guarded with hasattr below), so resolve it once with a
        # placeholder instead of dereferencing doc._.file_name in every log call.
        has_file_name = hasattr(doc._, 'file_name')
        file_name = doc._.file_name if has_file_name else '<unknown file>'

        if has_file_name:
            logging.debug("%s: Called GoldAnnotator Component", file_name)

        if logging.getLogger().getEffectiveLevel(
        ) == logging.DEBUG:  # print document tokenization
            for token in doc:
                logging.debug(str(token))

        # check if gold annotation file path has been set.
        if not hasattr(doc._, 'gold_annotation_file'):
            raise ValueError(
                "No extension doc._.gold_annotation_file is present.")

        gold_annotations = Annotations(doc._.gold_annotation_file,
                                       annotation_type='ann')
        # Fetch the entity list once; it is needed for the loop and for the final threshold.
        entities = gold_annotations.get_entity_annotations()

        for e_label, e_start, e_end, _ in entities:
            if e_start > e_end:
                logging.critical(
                    "%s: Broken annotation - start is greater than end: (%i,%i,%s)",
                    file_name, e_start, e_end, e_label)
                continue
            span = doc.char_span(e_start, e_end)

            if span is None:
                # The offsets do not line up with the tokenization; count it as a failure
                # for now and let find_span try to recover a usable span below.
                self.failed_overlay_count += 1
                self.failed_identifying_span_count += 1
                logging.warning(
                    "%s: Number of failed annotation overlays with current tokenizer: %i (%i,%i,%s)",
                    file_name, self.failed_overlay_count, e_start, e_end,
                    e_label)
            fixed_span = self.find_span(e_start, e_end, e_label, span, doc)
            if fixed_span is not None:
                if span is None:
                    logging.warning("%s: Fixed span (%i,%i,%s) into: %s",
                                    file_name, e_start, e_end, e_label,
                                    fixed_span.text)
                    # Recovered, so undo the provisional "could not identify" count.
                    self.failed_identifying_span_count -= 1
                for token in fixed_span:
                    if e_label in self.labels or not self.labels:
                        token._.set('gold_label', e_label)

            else:  # annotation was not able to be fixed, it will be ignored - this is bad in evaluation.
                logging.warning("%s: Could not fix annotation: (%i,%i,%s)",
                                file_name, e_start, e_end, e_label)
                logging.warning("%s: Total Failed Annotations: %i",
                                file_name,
                                self.failed_identifying_span_count)

        # NOTE(review): failed_overlay_count is never reset in this method, yet it is compared
        # against this document's entity count - confirm whether a per-document count is intended.
        if self.failed_overlay_count > .3 * len(entities):
            logging.warning(
                "%s: Annotations may mis-aligned as more than 30 percent failed to overlay: %s",
                file_name, doc._.gold_annotation_file)

        return doc
Example #11
0
 def test_init_from_ann_file(self):
     """Checks that an Annotations object built from an ann file exposes entity annotations"""
     loaded = Annotations(self.ann_file_path_one, annotation_type='ann')
     entities = loaded.get_entity_annotations()
     self.assertIsNotNone(entities)
Example #12
0
 def test_get_entity_annotations_list(self):
     """Checks that entity annotations are returned as a list by default"""
     loaded = Annotations(self.ann_file_path_one, annotation_type='ann')
     entities = loaded.get_entity_annotations()
     self.assertIsInstance(entities, list)
Example #13
0
 def test_get_entity_annotations_dict(self):
     """Checks that entity annotations requested as a dictionary come back as a dict."""
     loaded = Annotations(self.ann_file_path_one, annotation_type='ann')
     entities = loaded.get_entity_annotations(return_dictionary=True)
     self.assertIsInstance(entities, dict)