def test_recursive_entity():
    """Check self-reference detection on a direct loop and on a diamond graph.

    The first scenario links an entity to itself through a property and expects
    both ``is_self_referential`` and ``find_self_referential`` to report it.
    The second scenario builds D -> B -> A and D -> C -> A (A is reachable by
    two paths, but there is no cycle) and expects D not to be reported.
    """
    # Scenario 1: an entity whose property points back at itself.
    loop_data = anafora.AnaforaData()
    looped = anafora.AnaforaEntity()
    looped.id = "@1@"
    loop_data.annotations.append(looped)
    looped.properties["self"] = looped
    assert looped.is_self_referential()
    found = loop_data.annotations.find_self_referential()
    assert found.id == looped.id

    # Scenario 2: a diamond-shaped reference graph without any cycle.
    diamond_data = anafora.AnaforaData()
    nodes = {}
    for name in ("A", "B", "C", "D"):
        node = anafora.AnaforaEntity()
        node.id = name
        diamond_data.annotations.append(node)
        nodes[name] = node
    nodes["B"].properties["x"] = nodes["A"]
    nodes["C"].properties["y"] = nodes["A"]
    nodes["D"].properties["1"] = nodes["B"]
    nodes["D"].properties["2"] = nodes["C"]
    assert not nodes["D"].is_self_referential()
def test_empty():
    """A bare ``<data/>`` and an empty ``<annotations>`` both yield no annotations."""
    for xml_text in ('<data/>',
                     '<data><annotations></annotations></data>'):
        root = anafora.ElementTree.fromstring(xml_text)
        parsed = anafora.AnaforaData(root)
        assert list(parsed.annotations) == []
def test_missing_ignored_properties():
    """Score a prediction that lacks a property which is present (empty) in the reference.

    The reference entity has properties A, B and an empty C; the predicted
    entity only has B and A. Without exclusions the entity is expected to count
    as incorrect; once ("Z", "C") is excluded it is expected to count as
    correct.
    """
    reference_xml = """
    <data>
        <annotations>
            <entity>
                <id>4</id>
                <type>Z</type>
                <properties>
                    <A>1</A>
                    <B>2</B>
                    <C></C>
                </properties>
            </entity>
        </annotations>
    </data>
    """
    predicted_xml = """
    <data>
        <annotations>
            <entity>
                <id>4</id>
                <type>Z</type>
                <properties>
                    <B>2</B>
                    <A>1</A>
                </properties>
            </entity>
        </annotations>
    </data>
    """
    reference = anafora.AnaforaData(
        anafora.ElementTree.fromstring(reference_xml))
    predicted = anafora.AnaforaData(
        anafora.ElementTree.fromstring(predicted_xml))

    # Strict comparison: the C property takes part in the match.
    strict_scores = anafora.evaluate.score_data(reference, predicted)
    strict_z = strict_scores["Z"]
    assert strict_z.correct == 0
    assert strict_z.reference == 1
    assert strict_z.predicted == 1

    # make sure no exceptions are thrown
    anafora.evaluate._print_document_scores([("temp", strict_scores)])

    # Lenient comparison: property C of type Z is excluded from scoring.
    lenient_scores = anafora.evaluate.score_data(
        reference, predicted, exclude=[("Z", "C")])
    lenient_z = lenient_scores["Z"]
    assert lenient_z.correct == 1
    assert lenient_z.reference == 1
    assert lenient_z.predicted == 1
Esempio n. 4
0
  def write(self, predictions):
    """Write predictions in anafora XML format.

    Recreates ``self.out_dir`` from scratch, then, for every reference file
    returned by ``anafora.walk(self.xml_dir, self.xml_regex)``, writes a new
    document that contains one entity per reference ``EVENT`` annotation
    (same id, spans and type) with its ``DocTimeRel`` property set from
    ``predictions``.

    Args:
      predictions: indexable sequence of label ids. Entries are consumed
        sequentially, one per EVENT, in the order the reference files and
        their events are iterated. Each id is translated through
        ``int2label`` (a name defined outside this method).
    """

    if os.path.isdir(self.out_dir):
      shutil.rmtree(self.out_dir)
    os.mkdir(self.out_dir)

    # running position in `predictions`, shared across all documents
    index = 0

    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_dir, self.xml_regex):

      xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
      ref_data = anafora.AnaforaData.from_file(xml_path)

      data = anafora.AnaforaData()

      for event in ref_data.annotations.select_type('EVENT'):
        entity = anafora.AnaforaEntity()

        entity.id = event.id
        entity.spans = event.spans
        entity.type = event.type
        entity.properties['DocTimeRel'] = int2label[predictions[index]]

        data.annotations.append(entity)
        index = index + 1

      # makedirs (not mkdir): sub_dir may be nested, empty, or shared by
      # several documents, all of which make a plain mkdir raise.
      os.makedirs(os.path.join(self.out_dir, sub_dir), exist_ok=True)
      out_path = os.path.join(self.out_dir, sub_dir, file_names[0])

      data.indent()
      data.to_file(out_path)
def test_spans():
    """Spans of relations are derived, with one extra nesting level, from their arguments.

    Relation 1 refers to relation 2, which refers to entity 3 with span (5, 7);
    each level of indirection is expected to wrap the spans in one more tuple.
    """
    xml_text = '''
        <data>
            <annotations>
                <relation>
                    <id>1</id>
                    <type>R1</type>
                    <properties>
                        <relation>2</relation>
                    </properties>
                </relation>
                <relation>
                    <id>2</id>
                    <type>R2</type>
                    <properties>
                        <entity>3</entity>
                    </properties>
                </relation>
                <entity>
                    <id>3</id>
                    <type>E1</type>
                    <span>5,7</span>
                </entity>
            </annotations>
        </data>'''
    data = anafora.AnaforaData(anafora.ElementTree.fromstring(xml_text))

    expected_spans = (
        ("1", ((((5, 7), ), ), )),
        ("2", (((5, 7), ), )),
        ("3", ((5, 7), )),
    )
    for annotation_id, spans in expected_spans:
        assert data.annotations.select_id(annotation_id).spans == spans
def test_add_entity():
    """Exercise adding an entity and watch the serialized XML follow each change.

    Covers: rejection of an entity without an id, serialization after the id,
    type, parents type and a property are set, removal of the property, and
    the error raised when deleting a property that does not exist.
    """
    doc = anafora.AnaforaData()
    assert str(doc) == '<data />'

    # An entity without an id is rejected and the document stays untouched.
    added = anafora.AnaforaEntity()
    with pytest.raises(ValueError) as missing_id_error:
        doc.annotations.append(added)
    assert "id" in str(missing_id_error.value)
    assert str(doc) == '<data />'

    # With an id the entity is accepted.
    added.id = "1"
    doc.annotations.append(added)
    only_id = '<data><annotations><entity><id>1</id></entity></annotations></data>'
    assert str(doc) == only_id

    # Later attribute changes are reflected in the document's XML.
    added.type = "X"
    added.parents_type = "Y"
    added.properties["name1"] = "value1"
    opening = ('<data><annotations><entity>' + '<id>1</id>' +
               '<type>X</type>' + '<parentsType>Y</parentsType>')
    closing = '</entity></annotations></data>'
    assert str(doc) == (
        opening + '<properties><name1>value1</name1></properties>' + closing)

    # Removing the only property removes the <properties> element as well.
    del added.properties["name1"]
    assert str(doc) == opening + closing

    # Deleting an unknown property is an error.
    with pytest.raises(ValueError):
        del added.properties["name2"]
def prediction_to_anafora(model, labels, features, doc_name="dummy"):
    """Convert per-token label predictions into an ``AnaforaData`` document.

    For every sentence the special tokens are stripped, then the remaining
    token labels are scanned left to right. Consecutive tokens that belong to
    the same entity (as decided by ``is_b_label`` / ``is_i_label``) are merged
    into one character span, and each finished span is handed to
    ``add_entity``.

    Args:
        model: object exposing ``tokenizer.get_special_tokens_mask`` and a
            ``config`` with ``id2label``, ``pad_labels`` and ``bio_mode``.
        labels: iterable of per-sentence label-id sequences.
        features: iterable of per-sentence features exposing ``input_ids`` and
            ``offset_mapping``; iterated in lockstep with ``labels``.
        doc_name: document name forwarded to ``add_entity``.

    Returns:
        The populated ``anafora.AnaforaData``.
    """
    data = anafora.AnaforaData()
    for sent_labels, sent_features in zip(labels, features):
        # Remove padding and <s> </s>
        special_mask = model.tokenizer.get_special_tokens_mask(
            sent_features.input_ids, already_has_special_tokens=True)
        non_specials = np.count_nonzero(np.array(special_mask) == 0)
        sent_labels = sent_labels[1:non_specials + 1]
        sent_offsets = sent_features.offset_mapping[1:non_specials + 1]

        # Label id 0 means "no entity currently open".
        previous_label = 0
        previous_offset = [None, None]
        for token_label, token_offset in zip(sent_labels, sent_offsets):
            entity_label = model.config.id2label[
                previous_label] if previous_label > 0 else None
            new_word = token_offset[0] != previous_offset[
                1] if model.config.pad_labels else True
            label_diff = token_label - previous_label
            if is_b_label(token_label, label_diff, model.config.bio_mode):
                # A new entity starts: flush whatever was open before it.
                add_entity(data, doc_name, entity_label, previous_offset)
                previous_label = token_label
                # Copy the offset: the span end is extended in place below,
                # which must not modify (or fail on an immutable entry of)
                # the caller's offset_mapping.
                previous_offset = list(token_offset)
            elif is_i_label(token_label, label_diff, new_word,
                            model.config.bio_mode):
                # Continuation token: stretch the open span to its end.
                previous_offset[1] = token_offset[1]
            elif previous_label > 0:
                # Neither a start nor a continuation: close the open entity.
                add_entity(data, doc_name, entity_label, previous_offset)
                previous_label = 0
                previous_offset = [None, None]
        if previous_label > 0:
            # The sentence ended while an entity was still open.
            entity_label = model.config.id2label[previous_label]
            add_entity(data, doc_name, entity_label, previous_offset)

    return data
Esempio n. 8
0
def parse_json(input_dict):
    """Build an indented ``AnaforaData`` from ``input_dict``.

    ``get_entity_index`` supplies the positions at which entities start; every
    consecutive pair of positions delimits one entity and the records that
    carry its properties. A record directly after the entity whose type is
    "labels" and whose id prefix ends in "_continued" contributes additional
    spans to the same entity.
    """
    data = anafora.AnaforaData()
    boundaries = get_entity_index(input_dict)

    for position in range(len(boundaries) - 1):
        start = boundaries[position]
        stop = boundaries[position + 1]
        entity = build_an_entity(input_dict[start])

        # when an entity has more than one group of span
        has_continuation = (
            position < len(boundaries) - 2
            and input_dict[start + 1]["type"] == "labels"
            and input_dict[start + 1]["id"].split("@")[0].endswith("_continued"))

        if has_continuation:
            multiple_spans(entity, input_dict[start + 1])
            # the continuation record is not a property record
            first_property = start + 2
        else:
            first_property = start + 1

        entity.properties = set_up_properties(input_dict, first_property,
                                              stop, entity)
        data.annotations.append(entity)

    data.indent()
    return data
def test_duplicate_id():
    """Duplicate ids are rejected both when loading XML and when appending."""
    duplicated_xml = '''
        <data>
            <annotations>
                <entity><id>1</id></entity>
                <entity><id>1</id></entity>
            </annotations>
        </data>'''
    # Loading a document that repeats an id must fail.
    with pytest.raises(ValueError):
        anafora.AnaforaData(anafora.ElementTree.fromstring(duplicated_xml))

    # Appending a second entity with an id already in use must fail as well.
    doc = anafora.AnaforaData()
    first = anafora.AnaforaEntity()
    first.id = "1"
    second = anafora.AnaforaEntity()
    second.id = "1"
    doc.annotations.append(first)
    with pytest.raises(ValueError):
        doc.annotations.append(second)
Esempio n. 10
0
def test_add_entity():
    """Build a one-event document and write it, indented, to ``temp.xml``."""

    document = anafora.AnaforaData()
    event = anafora.AnaforaEntity()
    event.id = '1@e@ID025_path_074@gold'
    document.annotations.append(event)

    # The remaining fields are filled in after the entity is attached.
    for attribute, value in (('type', 'EVENT'),
                             ('parents_type', 'TemporalEntities')):
        setattr(event, attribute, value)
    event.properties['DocTimeRel'] = 'AFTER'

    document.indent()
    document.to_file('temp.xml')
Esempio n. 11
0
def span2xmlfiles(data_spans, file_name_simple):
    """Turn span records into an indented ``AnaforaData`` document.

    Each record is indexed as ``(start, last, type, ...)``. The stored span
    ends at ``last + 1``, and ids are ``"<running number>@e@<file name>"``
    with numbering starting at 0.
    """
    import anafora
    document = anafora.AnaforaData()
    for number, record in enumerate(data_spans):
        entity = anafora.AnaforaEntity()
        begin = int(record[0])
        end = int(record[1]) + 1
        entity.spans = ((begin, end), )
        entity.type = record[2]
        entity.id = str(number) + "@e@" + file_name_simple
        document.annotations.append(entity)
    document.indent()
    return document
Esempio n. 12
0
def write_anafora(output_dir, dataset, predictions, tokenizer, config):
    """Decode token-level predictions into entities and write one XML per document.

    For every ``(doc_subdir, doc_start, doc_end)`` in ``dataset.doc_indices``
    the matching slice of ``predictions`` is reduced with ``argmax`` over axis
    2 to per-token label ids. Label ids are interpreted as follows: 0 is the
    outside label, odd ids start an entity, and an even id that is exactly one
    greater than the currently open label continues it. Each finished entity
    is added to an ``AnaforaData`` that is written to
    ``<output_dir>/<doc_subdir>/<doc_name>.TimeNorm.system.completed.xml``.

    Args:
        output_dir: root directory for the generated files.
        dataset: object with ``doc_indices`` and ``features``; each feature
            exposes ``input_ids`` and ``offset_mapping``.
        predictions: per-sentence, per-token label scores, sliceable by
            sentence index.
        tokenizer: provides ``get_special_tokens_mask``.
        config: provides ``id2label`` mapping label ids to label names.
    """

    def add_entity(data, doc_name, label, offset):
        """Append an entity for integer label id ``label``; ids <= 0 are ignored."""
        entity_label = config.id2label[label] if label > 0 else None
        if entity_label is None:
            return
        entity = anafora.AnaforaEntity()
        # ids are numbered by how many entities the document already holds
        num_entities = len(data.xml.findall("annotations/entity"))
        entity.id = "%s@%s" % (num_entities, doc_name)
        entity.spans = ((offset[0], offset[1]),)
        entity.type = entity_label.replace("B-", "")
        data.annotations.append(entity)

    for doc_index in dataset.doc_indices:
        doc_subdir, doc_start, doc_end = doc_index
        doc_name = os.path.basename(doc_subdir)
        doc_features = dataset.features[doc_start:doc_end]
        doc_predictions = predictions[doc_start:doc_end]
        doc_predictions = np.argmax(doc_predictions, axis=2)
        data = anafora.AnaforaData()
        for sent_labels, sent_features in zip(doc_predictions, doc_features):
            # Remove padding and <s> </s>
            special_mask = tokenizer.get_special_tokens_mask(sent_features.input_ids,
                                                             already_has_special_tokens=True)
            non_specials = np.count_nonzero(np.array(special_mask) == 0)
            sent_labels = sent_labels[1: non_specials + 1]
            sent_offsets = sent_features.offset_mapping[1: non_specials + 1]

            previous_label = 0
            previous_offset = [None, None]  # (start, end)
            for token_label, token_offset in zip(sent_labels, sent_offsets):
                label_diff = token_label - previous_label
                if token_label % 2 != 0:  # If odd number, it is B label
                    add_entity(data, doc_name, previous_label, previous_offset)
                    previous_label = token_label
                    # Copy: the end is extended in place below and must not
                    # alter (or fail on) the dataset's own offset entry.
                    previous_offset = list(token_offset)
                elif label_diff == 1:  # If even number and diff with previous is 1, it is I label
                    previous_offset[1] = token_offset[1]
                elif previous_label > 0:  # If current is O label and previous not O we must write it.
                    add_entity(data, doc_name, previous_label, previous_offset)
                    previous_label = 0
                    previous_offset = [None, None]
            if previous_label > 0:  # If remaining previous not O we must write it.
                # add_entity expects the integer label id, not the label name.
                add_entity(data, doc_name, previous_label, previous_offset)
        doc_path = os.path.join(output_dir, doc_subdir)
        os.makedirs(doc_path, exist_ok=True)
        doc_path = os.path.join(doc_path,
                                "%s.TimeNorm.system.completed.xml" % doc_name)
        print(doc_path)
        data.to_file(doc_path)
def test_many_groups():
    """An annotator holding 999 patterns still yields a single match on the text.

    Patterns are 'a' repeated 1..999 times, each mapped to the equally long
    run of 'A' as its type; on a ten-character text the only annotation
    expected is the ten-character one covering the whole text.
    """
    patterns = {
        'a' * length: ('A' * length, {})
        for length in range(1, 1000)
    }
    annotator = anafora.regex.RegexAnnotator(patterns)
    text = "aaaaaaaaaa"
    data = anafora.AnaforaData()
    annotator.annotate(text, data)

    found = list(data.annotations)
    assert len(found) == 1
    annotation = found[0]
    assert annotation.type == "AAAAAAAAAA"
    assert annotation.spans == ((0, 10), )
    assert dict(annotation.properties.items()) == {}
Esempio n. 14
0
    def write_xml(self, predicted_relations):
        """Write predictions in anafora XML format.

        Recreates ``self.out_dir`` from scratch. For every reference file found
        by ``anafora.walk(self.xml_dir, self.xml_regex)`` a new document is
        written that holds the EVENT, TIMEX3, SECTIONTIME and DOCTIME
        annotations copied by ``copy_annotations`` plus one ``TLINK`` relation
        of type ``CONTAINS`` per predicted pair belonging to that note.

        Args:
            predicted_relations: iterable of ``(contained_id, container_id)``
                pairs. A pair is assigned to the note named by the third
                ``@``-separated field of ``container_id``; a reference file is
                matched by the part of its name before the first ``.``.
        """

        # make a directory to write anafora xml
        if os.path.isdir(self.out_dir):
            shutil.rmtree(self.out_dir)
        os.mkdir(self.out_dir)

        # key: note, value: list of rel arg tuples
        note2rels = defaultdict(list)

        # map notes to relations in these notes
        # for container_id, contained_id in predicted_relations:
        for contained_id, container_id in predicted_relations:
            note_name = container_id.split('@')[2]
            note2rels[note_name].append((container_id, contained_id))

        # iterate over reference anafora xml files
        for sub_dir, text_name, file_names in anafora.walk(
                self.xml_dir, self.xml_regex):

            path = os.path.join(self.xml_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)

            # make a new XML file
            generated_data = anafora.AnaforaData()

            # copy gold events and times
            copy_annotations(ref_data, generated_data, 'EVENT')
            copy_annotations(ref_data, generated_data, 'TIMEX3')
            copy_annotations(ref_data, generated_data, 'SECTIONTIME')
            copy_annotations(ref_data, generated_data, 'DOCTIME')

            # add generated relations
            note_name = file_names[0].split('.')[0]
            for container_id, contained_id in note2rels[note_name]:
                relation = anafora.AnaforaRelation()
                # id is the fractional digits of a random float
                relation.id = str(random.random())[2:]
                relation.type = 'TLINK'
                relation.parents_type = 'TemporalRelations'
                relation.properties['Source'] = container_id
                relation.properties['Type'] = 'CONTAINS'
                relation.properties['Target'] = contained_id
                generated_data.annotations.append(relation)

            # write xml to file
            generated_data.indent()
            # makedirs (not mkdir): sub_dir may be nested, empty, or shared by
            # several documents, all of which make a plain mkdir raise.
            os.makedirs(os.path.join(self.out_dir, sub_dir), exist_ok=True)
            out_path = os.path.join(self.out_dir, sub_dir, file_names[0])
            generated_data.to_file(out_path)
Esempio n. 15
0
def _annotate(model_file,
              text_dir,
              output_dir,
              data_dir=None,
              xml_name_regex="[.]xml$",
              text_encoding="utf-8",
              extension=".system.completed.xml"):
    """Annotate texts with a saved ``RegexAnnotator`` and write the results.

    The texts come from ``text_dir`` when it is given, otherwise from
    ``data_dir``, otherwise from ``output_dir``. When ``data_dir`` is given,
    the existing XML files found there are loaded and annotated further, and
    the output name is the input name with its last four dot-separated parts
    replaced by ``extension``; otherwise annotation starts from an empty
    document named ``text_name + extension``. Results are written, indented,
    below ``output_dir``.
    """
    # Pick the directory walker that matches the input layout.
    if text_dir is not None:
        iterator = anafora.walk_flat_to_anafora(text_dir)
    else:
        walk_root = output_dir if data_dir is None else data_dir
        iterator = anafora.walk_anafora_to_anafora(walk_root, xml_name_regex)

    # load a model from the file
    model = RegexAnnotator.from_file(model_file)

    # annotate each text
    for input_sub_dir, output_sub_dir, text_name, xml_names in iterator:
        if data_dir is None:
            # nothing to extend: start from a fresh document
            data_iter = [(anafora.AnaforaData(), text_name + extension)]
        else:
            data_iter = []
            for xml_name in xml_names:
                xml_path = os.path.join(data_dir, input_sub_dir, xml_name)
                existing = anafora.AnaforaData.from_file(xml_path)
                renamed = regex.sub(r'[.][^.]*[.][^.]*[.][^.]*[.]xml',
                                    extension, xml_name)
                data_iter.append((existing, renamed))

        for data, output_name in data_iter:
            # read in the text
            if text_dir is not None:
                text_path = os.path.join(text_dir, text_name)
            else:
                text_root = output_dir if data_dir is None else data_dir
                text_path = os.path.join(text_root, input_sub_dir, text_name)
            with codecs.open(text_path, 'r', text_encoding) as text_file:
                text = text_file.read()

            # annotate the text
            model.annotate(text, data)

            # save the annotated data to the output directory
            data_output_dir = os.path.join(output_dir, output_sub_dir)
            if not os.path.exists(data_output_dir):
                os.makedirs(data_output_dir)
            data.indent()
            data.to_file(os.path.join(data_output_dir, output_name))
Esempio n. 16
0
    def write_xml(self, prediction_lookup):
        """Write predictions in anafora XML format.

        Recreates ``self.xml_out_dir`` from scratch. For every reference file
        found by ``anafora.walk(self.xml_ref_dir, xml_regex)`` (``xml_regex``
        is a name defined outside this method) a document is written with one
        entity per reference ``EVENT`` (same id, spans and type) whose
        ``DocTimeRel`` property comes from ``prediction_lookup``. Finally the
        number of events without a prediction is printed.

        Args:
            prediction_lookup: mapping from ``"<sub_dir>|<start>|<end>"``
                (start/end taken from the event's first span) to the
                ``DocTimeRel`` value. Events missing from the mapping are
                written with ``'OVERLAP'``.
        """

        # make a directory to write anafora xml
        if os.path.isdir(self.xml_out_dir):
            shutil.rmtree(self.xml_out_dir)
        os.mkdir(self.xml_out_dir)

        # t5 occasionally fails to predict
        missing_predictions = []

        # iterate over reference xml files
        # look up the DTR prediction for each event
        # and write it in anafora format to the specified dir
        for sub_dir, text_name, file_names in \
                anafora.walk(self.xml_ref_dir, xml_regex):

            path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)
            data = anafora.AnaforaData()

            for event in ref_data.annotations.select_type('EVENT'):

                # make a new entity and copy some ref info
                entity = anafora.AnaforaEntity()
                entity.id = event.id
                start, end = event.spans[0]
                entity.spans = event.spans
                entity.type = event.type

                # lookup the prediction
                key = '|'.join((sub_dir, str(start), str(end)))
                if key not in prediction_lookup:
                    # use majority class for now
                    entity.properties['DocTimeRel'] = 'OVERLAP'
                    missing_predictions.append(key)
                else:
                    entity.properties['DocTimeRel'] = prediction_lookup[key]

                data.annotations.append(entity)

            data.indent()
            # makedirs (not mkdir): sub_dir may be nested, empty, or shared by
            # several documents, all of which make a plain mkdir raise.
            os.makedirs(os.path.join(self.xml_out_dir, sub_dir), exist_ok=True)
            out_path = os.path.join(self.xml_out_dir, sub_dir, file_names[0])
            data.to_file(out_path)

        print('number of missing predictions:', len(missing_predictions))
def test_filter_by_precision():
    """Pruning by precision drops patterns in two steps on the same example.

    With a threshold of 0.6 only ``yer\\b`` is expected to be removed; raising
    the threshold to 1.0 is expected to remove the unanchored ``the`` as well.
    """
    annotator = anafora.regex.RegexAnnotator({
        r'the': ("THE", {}),
        r'\bthe\b': ("THE", {}),
        r'yer\b': ("ER", {}),
        r'er\b': ("ER", {}),
    })
    text = "the theater near the record player"
    gold_xml = """
    <data>
        <annotations>
            <entity>
                <id>1</id>
                <type>THE</type>
                <span>0,3</span><!-- "the" -->
            </entity>
            <entity>
                <id>2</id>
                <type>THE</type>
                <span>17,20</span><!-- "the" -->
            </entity>
            <entity>
                <id>3</id>
                <type>ER</type>
                <span>9,11</span><!-- "er" -->
            </entity>
            <entity>
                <id>4</id>
                <type>ER</type>
                <span>32,34</span><!-- "." -->
            </entity>
        </annotations>
    </data>
    """
    data = anafora.AnaforaData(anafora.ElementTree.fromstring(gold_xml))

    # First pass: moderate threshold.
    annotator.prune_by_precision(0.6, [(text, data)])
    after_moderate = anafora.regex.RegexAnnotator({
        r'the': ("THE", {}),
        r'\bthe\b': ("THE", {}),
        r'er\b': ("ER", {}),
    })
    assert annotator == after_moderate

    # Second pass: only patterns that are always right survive.
    annotator.prune_by_precision(1.0, [(text, data)])
    after_strict = anafora.regex.RegexAnnotator({
        r'\bthe\b': ("THE", {}),
        r'er\b': ("ER", {}),
    })
    assert annotator == after_strict
def test_sort():
    """Sorted annotations put entity 2 (span 3,4) ahead of entity 1 (span 5,7)."""
    xml_text = '''
        <data>
            <annotations>
                <entity>
                    <id>1</id>
                    <type>E</type>
                    <span>5,7</span>
                </entity>
                <entity>
                    <id>2</id>
                    <type>E</type>
                    <span>3,4</span>
                </entity>
            </annotations>
        </data>'''
    data = anafora.AnaforaData(anafora.ElementTree.fromstring(xml_text))

    ordered_ids = []
    for annotation in sorted(data.annotations):
        ordered_ids.append(annotation.id)
    assert ordered_ids == ['2', '1']
Esempio n. 19
0
    def write(self, predictions):
        """Write predictions in anafora XML format.

        Recreates ``self.xml_out_dir`` from scratch. For every reference file
        found by ``anafora.walk(self.xml_ref_dir, xml_regex)`` a document is
        written with one entity per reference ``EVENT`` that has a prediction
        (same id, spans and type, plus a ``DocTimeRel`` property). Events
        without a prediction are reported on stdout and left out.
        ``xml_regex`` and ``int2label`` are names defined outside this method.

        Args:
            predictions: label ids, paired positionally with ``self.offsets``
                whose entries are ``(sub_dir, start, end)`` keys.
        """

        # predictions are in the same order in which they were read
        prediction_lookup = dict(zip(self.offsets, predictions))

        # make a directory to write anafora xml
        if os.path.isdir(self.xml_out_dir):
            shutil.rmtree(self.xml_out_dir)
        os.mkdir(self.xml_out_dir)

        # iterate over reference xml files
        # look up the DTR prediction for each event
        # and write it in anafora format to the specified dir
        for sub_dir, text_name, file_names in \
                anafora.walk(self.xml_ref_dir, xml_regex):

            path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)
            data = anafora.AnaforaData()

            for event in ref_data.annotations.select_type('EVENT'):

                # make a new entity and copy some ref info
                entity = anafora.AnaforaEntity()
                entity.id = event.id
                start, end = event.spans[0]
                entity.spans = event.spans
                entity.type = event.type

                # lookup the prediction
                if (sub_dir, start, end) not in prediction_lookup:
                    print('missing key:', (sub_dir, start, end))
                    continue

                label = prediction_lookup[(sub_dir, start, end)]
                entity.properties['DocTimeRel'] = int2label[label]

                data.annotations.append(entity)

            data.indent()
            # makedirs (not mkdir): sub_dir may be nested, empty, or shared by
            # several documents, all of which make a plain mkdir raise.
            os.makedirs(os.path.join(self.xml_out_dir, sub_dir), exist_ok=True)
            out_path = os.path.join(self.xml_out_dir, sub_dir, file_names[0])
            data.to_file(out_path)
def test_add_reference():
    """Linking entities through a property requires both to be in the document.

    Assigning ``entity1`` as a property value of ``entity2`` must raise
    ``ValueError`` (mentioning ``<annotations``) while neither entity has been
    added, and again while only ``entity1`` has been added. Once both are in
    the document the link is stored and serialized as the target's id.
    """
    data = anafora.AnaforaData()
    entity1 = anafora.AnaforaEntity()
    entity1.id = "@1@"
    entity2 = anafora.AnaforaEntity()
    entity2.id = "@2@"

    # Neither entity belongs to a document yet.
    with pytest.raises(ValueError) as exception_info:
        entity2.properties["link"] = entity1
    assert "<annotations" in str(exception_info.value)

    # Only the link target belongs to the document. Capture this failure on
    # its own so the message check below inspects this exception rather than
    # the one from the previous block.
    data.annotations.append(entity1)
    with pytest.raises(ValueError) as exception_info:
        entity2.properties["link"] = entity1
    assert "<annotations" in str(exception_info.value)

    # Both belong to the document: the assignment now succeeds.
    data.annotations.append(entity2)
    entity2.properties["link"] = entity1
    assert str(data) == (
        '<data><annotations>' + '<entity><id>@1@</id></entity>' +
        '<entity><id>@2@</id><properties><link>@1@</link></properties></entity>'
        + '</annotations></data>')
def test_regex_annotator():
    """Annotating "bb aaa" yields one B entity and one A entity.

    The A entity is expected to come from the ``aa+`` pattern (span 3-6, no
    properties) rather than from the single-character ``a`` pattern.
    """
    patterns = {
        'aa+': ('A', {}),
        'a': ('A', {'X': '2'}),
        'bb': ('B', {'Y': '1'}),
    }
    annotator = anafora.regex.RegexAnnotator(patterns)
    text = "bb aaa"
    data = anafora.AnaforaData()
    annotator.annotate(text, data)

    assert len(list(data.annotations)) == 2
    b_annotation, a_annotation = data.annotations

    expected = (
        (b_annotation, "B", ((0, 2), ), {'Y': '1'}),
        (a_annotation, "A", ((3, 6), ), {}),
    )
    for annotation, type_name, spans, properties in expected:
        assert annotation.type == type_name
        assert annotation.spans == spans
        assert dict(annotation.properties.items()) == properties
def test_preannotated():
    """Annotate a document that already contains entities.

    The document starts with a B entity on "bb" and a C entity on "aaa", both
    without properties. After annotation three entities are expected: B with
    Y=1, C with Z=3, and a newly added A entity on "aaa" with X=2.
    """
    patterns = {
        'aa+': ('A', {'X': '2'}),
        'a': ('A', {}),
        'bb': ('B', {'Y': '1'}),
    }
    # second constructor argument: per-type properties (here for type C)
    type_properties = {'C': {'Z': '3'}}
    annotator = anafora.regex.RegexAnnotator(patterns, type_properties)
    text = "bb aaa"

    data = anafora.AnaforaData()
    for entity_id, entity_type, entity_spans in (
            ("1@preannotated", "B", ((0, 2), )),
            ("2@preannotated", "C", ((3, 6), ))):
        entity = anafora.AnaforaEntity()
        entity.id = entity_id
        entity.type = entity_type
        entity.spans = entity_spans
        data.annotations.append(entity)

    annotator.annotate(text, data)

    assert len(list(data.annotations)) == 3
    b_annotation, c_annotation, a_annotation = data.annotations

    expected = (
        (b_annotation, "B", ((0, 2), ), {'Y': '1'}),
        (c_annotation, "C", ((3, 6), ), {'Z': '3'}),
        (a_annotation, "A", ((3, 6), ), {'X': '2'}),
    )
    for annotation, type_name, spans, properties in expected:
        assert annotation.type == type_name
        assert annotation.spans == spans
        assert dict(annotation.properties.items()) == properties
def test_remove():
    """Removing annotations updates both the annotation list and the XML.

    Two entities are added and then removed one at a time; after the last
    removal an empty ``<annotations />`` element is expected to remain.
    """
    doc = anafora.AnaforaData()
    assert str(doc) == '<data />'

    first = anafora.AnaforaEntity()
    first.id = "@1@"
    doc.annotations.append(first)
    second = anafora.AnaforaEntity()
    second.id = "@2@"
    second.properties["name"] = "value"
    doc.annotations.append(second)

    first_xml = '<entity><id>@1@</id></entity>'
    second_xml = (
        '<entity><id>@2@</id><properties><name>value</name></properties></entity>'
    )

    # Both entities present.
    assert list(doc.annotations) == [first, second]
    assert str(doc) == (
        '<data><annotations>' + first_xml + second_xml + '</annotations></data>')

    # Only the second entity is left.
    doc.annotations.remove(first)
    assert list(doc.annotations) == [second]
    assert str(doc) == (
        '<data><annotations>' + second_xml + '</annotations></data>')

    # Nothing is left, but the container element stays.
    doc.annotations.remove(second)
    assert list(doc.annotations) == []
    assert str(doc) == '<data><annotations /></data>'
Esempio n. 24
0
def span2xmlfiles(exp, target):
    """Write predicted spans of the first ten documents as Anafora XML files.

    Document names come from ``read1.read_from_json('raw_dir_simple')`` and the
    span records from the JSON file ``exp + "\\span_label_all" + target``. Each
    record is indexed as ``(start, last, type, ...)``; the stored span ends at
    ``last + 1`` and ids are ``"<running number>@e@<document name>"``. Every
    document is printed and then written to
    ``exp\\<name>\\<name>.TimeNorm.gold.completed.xml``.

    NOTE(review): paths are assembled with literal backslashes, so the layout
    is only meaningful on Windows; left unchanged to keep existing outputs
    where callers expect them.
    """
    import anafora

    raw_dir_simple = read1.read_from_json('raw_dir_simple')
    # The span file does not change inside the loop: read it once.
    all_spans = read1.read_json(exp + "\\span_label_all" + target)
    for data_id in range(0, 10):
        data_spans = all_spans[data_id]
        data = anafora.AnaforaData()
        # enumerate replaces a manual counter that shadowed the builtin `id`
        for entity_number, data_span in enumerate(data_spans):
            e = anafora.AnaforaEntity()
            e.spans = ((int(data_span[0]), int(data_span[1]) + 1), )
            e.type = data_span[2]
            e.id = str(entity_number) + "@e@" + raw_dir_simple[data_id]
            data.annotations.append(e)
        # print as a function call: valid on both Python 2 and Python 3
        print(data)
        data.indent()

        outputfile = exp + "\\" + raw_dir_simple[data_id] + "\\"
        if not os.path.exists(outputfile):
            os.makedirs(outputfile)
        data.to_file(outputfile + raw_dir_simple[data_id] +
                     ".TimeNorm.gold.completed.xml")
Esempio n. 25
0
def test_score_data():
    """Check score_data on exact-span data, with and without include/exclude filters."""

    def check_all(name_to_scores, expectations):
        # each expectation is (name, correct, reference, predicted)
        for name, correct, n_reference, n_predicted in expectations:
            scores = name_to_scores[name]
            assert scores.correct == correct
            assert scores.reference == n_reference
            assert scores.predicted == n_predicted

    reference = anafora.AnaforaData(anafora.ElementTree.fromstring("""
    <data>
        <annotations>
            <entity>
                <id>1</id>
                <span>0,5</span>
                <type>X</type>
            </entity>
            <entity>
                <id>2</id>
                <span>5,10</span>
                <type>Y</type>
            </entity>
            <entity>
                <id>3</id>
                <span>15,20</span>
                <type>Y</type>
            </entity>
            <relation>
                <id>4</id>
                <type>Z</type>
                <properties>
                    <Source>1</Source>
                    <Target>2</Target>
                    <Prop1>T</Prop1>
                    <Prop2>A</Prop2>
                </properties>
            </relation>
            <relation>
                <id>5</id>
                <type>Z</type>
                <properties>
                    <Source>2</Source>
                    <Target>3</Target>
                    <Prop1>T</Prop1>
                    <Prop2>B</Prop2>
                </properties>
            </relation>
        </annotations>
    </data>
    """))
    predicted = anafora.AnaforaData(anafora.ElementTree.fromstring("""
    <data>
        <annotations>
            <entity>
                <id>6</id><!-- different -->
                <span>0,5</span>
                <type>X</type>
            </entity>
            <entity>
                <id>7</id><!-- different -->
                <span>5,10</span>
                <type>X</type><!-- different -->
            </entity>
            <entity>
                <id>8</id><!-- different -->
                <span>15,20</span>
                <type>Y</type>
            </entity>
            <relation>
                <id>9</id><!-- different -->
                <type>Z</type>
                <properties>
                    <Source>6</Source>
                    <Target>7</Target>
                    <Prop1>T</Prop1>
                    <Prop2>A</Prop2>
                </properties>
            </relation>
            <relation>
                <id>10</id><!-- different -->
                <type>Z</type>
                <properties>
                    <Source>7</Source>
                    <Target>8</Target>
                    <Prop1>F</Prop1><!-- different -->
                    <Prop2>B</Prop2>
                </properties>
            </relation>
        </annotations>
    </data>
    """))

    # names expected when only "Z" (without its span scores) is included
    z_only_names = {
        "*",
        "Z",
        ("Z", "Source"),
        ("Z", "Target"),
        ("Z", "Prop1"),
        ("Z", "Prop2"),
        ("Z", "Prop1", "T"),
        ("Z", "Prop1", "F"),
        ("Z", "Prop2", "A"),
        ("Z", "Prop2", "B"),
    }
    # names expected when "X" and "Y" are excluded
    without_xy_names = z_only_names | {("*", "<span>"), ("Z", "<span>")}
    # names expected with no filtering at all
    all_names = without_xy_names | {
        "X",
        ("X", "<span>"),
        "Y",
        ("Y", "<span>"),
    }

    # expectations for the "Z" relations, which are the same under every filter
    z_span_expected = (("Z", "<span>"), 2, 2, 2)
    z_expected = [
        ("Z", 0, 2, 2),
        (("Z", "Prop1"), 1, 2, 2),
        (("Z", "Prop1", "T"), 1, 2, 1),
        (("Z", "Prop1", "F"), 0, 0, 1),
        (("Z", "Prop2"), 2, 2, 2),
        (("Z", "Prop2", "A"), 1, 1, 1),
        (("Z", "Prop2", "B"), 1, 1, 1),
    ]
    z_expected_with_span = z_expected[:1] + [z_span_expected] + z_expected[1:]

    # no filtering
    named_scores = anafora.evaluate.score_data(reference, predicted)
    assert set(named_scores.keys()) == all_names
    check_all(named_scores, [
        ("X", 1, 1, 2),
        (("X", "<span>"), 1, 1, 2),
        ("Y", 1, 2, 1),
        (("Y", "<span>"), 1, 2, 1),
    ])
    check_all(named_scores, z_expected_with_span)
    check_all(named_scores, [
        ("*", 1 + 1 + 0, 1 + 2 + 2, 2 + 1 + 2),
        (("*", "<span>"), 1 + 1 + 2, 1 + 2 + 2, 2 + 1 + 2),
    ])

    # excluding the entity types leaves only the relations
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               exclude=["X", "Y"])
    assert set(named_scores.keys()) == without_xy_names
    check_all(named_scores, z_expected_with_span)
    check_all(named_scores, [
        ("*", 0, 2, 2),
        (("*", "<span>"), 2, 2, 2),
    ])

    # including a single (type, property, value) name yields just that name
    prop1_true = ("Z", "Prop1", "T")
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               include=[prop1_true])
    assert set(named_scores.keys()) == {prop1_true}
    check_all(named_scores, [(prop1_true, 1, 2, 1)])

    prop1_false = ("Z", "Prop1", "F")
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               include=[prop1_false])
    assert set(named_scores.keys()) == {prop1_false}
    check_all(named_scores, [(prop1_false, 0, 0, 1)])

    # include and exclude combined
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               include=["Z"],
                                               exclude=[("Z", "<span>")])
    assert set(named_scores.keys()) == z_only_names
    check_all(named_scores, z_expected)
    check_all(named_scores, [("*", 0, 2, 2)])
# Example no. 26
# 0
def test_score_data_overlap():
    """
    Check score_data with the overlapping-span annotation wrapper.

    Mirrors test_score_data, except that the predicted spans merely overlap the
    reference spans instead of being exactly equal, and the reference contains
    two additional "Ref" relations.
    """
    wrapper = anafora.evaluate._OverlappingWrapper

    def check_all(name_to_scores, expectations):
        # each expectation is (name, correct, reference, predicted)
        for name, correct, n_reference, n_predicted in expectations:
            scores = name_to_scores[name]
            assert scores.correct == correct
            assert scores.reference == n_reference
            assert scores.predicted == n_predicted

    reference = anafora.AnaforaData(anafora.ElementTree.fromstring("""
    <data>
        <annotations>
            <entity>
                <id>1</id>
                <span>0,5</span>
                <type>X</type>
            </entity>
            <entity>
                <id>2</id>
                <span>5,10</span>
                <type>Y</type>
            </entity>
            <entity>
                <id>3</id>
                <span>15,20</span>
                <type>Y</type>
            </entity>
            <relation>
                <id>4</id>
                <type>Z</type>
                <properties>
                    <Source>1</Source>
                    <Target>2</Target>
                    <Prop1>T</Prop1>
                    <Prop2>A</Prop2>
                </properties>
            </relation>
            <relation>
                <id>5</id>
                <type>Z</type>
                <properties>
                    <Source>2</Source>
                    <Target>3</Target>
                    <Prop1>T</Prop1>
                    <Prop2>B</Prop2>
                </properties>
            </relation>
            <relation>
                <id>6</id>
                <type>Ref</type>
                <properties>
                    <Ref>1</Ref>
                </properties>
            </relation>
            <relation>
                <id>7</id>
                <type>Ref</type>
                <properties>
                    <Ref>6</Ref>
                </properties>
            </relation>
        </annotations>
    </data>
    """))
    predicted = anafora.AnaforaData(anafora.ElementTree.fromstring("""
    <data>
        <annotations>
            <entity>
                <id>6</id><!-- different -->
                <span>0,4</span>
                <type>X</type>
            </entity>
            <entity>
                <id>7</id><!-- different -->
                <span>6,10</span>
                <type>X</type><!-- different -->
            </entity>
            <entity>
                <id>8</id><!-- different -->
                <span>19,20</span>
                <type>Y</type>
            </entity>
            <relation>
                <id>9</id><!-- different -->
                <type>Z</type>
                <properties>
                    <Source>6</Source>
                    <Target>7</Target>
                    <Prop1>T</Prop1>
                    <Prop2>A</Prop2>
                </properties>
            </relation>
            <relation>
                <id>10</id><!-- different -->
                <type>Z</type>
                <properties>
                    <Source>7</Source>
                    <Target>8</Target>
                    <Prop1>F</Prop1><!-- different -->
                    <Prop2>B</Prop2>
                </properties>
            </relation>
        </annotations>
    </data>
    """))

    # names expected when only "Z" (without its span scores) is included
    z_only_names = {
        "*",
        "Z",
        ("Z", "Source"),
        ("Z", "Target"),
        ("Z", "Prop1"),
        ("Z", "Prop2"),
        ("Z", "Prop1", "T"),
        ("Z", "Prop1", "F"),
        ("Z", "Prop2", "A"),
        ("Z", "Prop2", "B"),
    }
    # names expected when "X" and "Y" are excluded
    without_xy_names = z_only_names | {
        ("*", "<span>"),
        ("Z", "<span>"),
        "Ref",
        ("Ref", "<span>"),
        ("Ref", "Ref"),
    }
    # names expected with no filtering at all
    all_names = without_xy_names | {
        "X",
        ("X", "<span>"),
        "Y",
        ("Y", "<span>"),
    }

    # expectations for the "Z" relations, which are the same under every filter
    z_span_expected = (("Z", "<span>"), 2, 2, 2)
    z_expected = [
        ("Z", 0, 2, 2),
        (("Z", "Prop1"), 1, 2, 2),
        (("Z", "Prop1", "T"), 1, 2, 1),
        (("Z", "Prop1", "F"), 0, 0, 1),
        (("Z", "Prop2"), 2, 2, 2),
        (("Z", "Prop2", "A"), 1, 1, 1),
        (("Z", "Prop2", "B"), 1, 1, 1),
    ]
    z_expected_with_span = z_expected[:1] + [z_span_expected] + z_expected[1:]

    # no filtering
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               annotation_wrapper=wrapper)
    assert set(named_scores.keys()) == all_names
    check_all(named_scores, [
        ("X", 1, 1, 2),
        (("X", "<span>"), 1, 1, 2),
        ("Y", 1, 2, 1),
        (("Y", "<span>"), 1, 2, 1),
    ])
    check_all(named_scores, z_expected_with_span)
    check_all(named_scores, [
        ("*", 1 + 1 + 0, 1 + 2 + 2 + 2, 2 + 1 + 2),
        (("*", "<span>"), 1 + 1 + 2, 1 + 2 + 2 + 1, 2 + 1 + 2),
    ])

    # excluding the entity types leaves only the relations
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               exclude=["X", "Y"],
                                               annotation_wrapper=wrapper)
    assert set(named_scores.keys()) == without_xy_names
    check_all(named_scores, z_expected_with_span)
    check_all(named_scores, [
        ("*", 0, 2 + 2, 2),
        (("*", "<span>"), 2, 2 + 1, 2),
    ])

    # including a single (type, property, value) name yields just that name
    prop1_true = ("Z", "Prop1", "T")
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               include=[prop1_true],
                                               annotation_wrapper=wrapper)
    assert set(named_scores.keys()) == {prop1_true}
    check_all(named_scores, [(prop1_true, 1, 2, 1)])

    prop1_false = ("Z", "Prop1", "F")
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               include=[prop1_false],
                                               annotation_wrapper=wrapper)
    assert set(named_scores.keys()) == {prop1_false}
    check_all(named_scores, [(prop1_false, 0, 0, 1)])

    # include and exclude combined
    named_scores = anafora.evaluate.score_data(reference,
                                               predicted,
                                               include=["Z"],
                                               exclude=[("Z", "<span>")],
                                               annotation_wrapper=wrapper)
    assert set(named_scores.keys()) == z_only_names
    check_all(named_scores, z_expected)
    check_all(named_scores, [("*", 0, 2, 2)])
# Example no. 27
# 0
def score_dirs(reference_dir,
               predicted_dir,
               xml_name_regex="[.]xml$",
               text_dir=None,
               include=None,
               exclude=None,
               scores_type=Scores,
               annotation_wrapper=None):
    """
    Score each predicted Anafora XML file against its reference file.

    A reference file containing a self-referential annotation is skipped entirely;
    a missing or self-referential predicted file is scored as empty data.

    :param string reference_dir: directory containing reference ("gold standard") Anafora XML directories
    :param string predicted_dir: directory containing predicted (system-generated) Anafora XML directories
    :param xml_name_regex: regular expression matching the files to be compared
    :param string text_dir: directory containing the raw texts corresponding to the Anafora XML
        (if None, texts are assumed to be in the reference dir)
    :param set include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names, (type-name, property-name) tuples,
        (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type annotation_wrapper: wrapper object to apply to AnaforaAnnotations
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    :raises RuntimeError: if errors need to be logged for a file but no raw text file exists for it
    """

    # walks through the reference Anafora XML directories, scoring each and adding those to the overall scores
    for sub_dir, text_name, reference_xml_names in anafora.walk(
            reference_dir, xml_name_regex):

        # load the reference data from its Anafora XML
        try:
            [reference_xml_name] = reference_xml_names
        except ValueError:
            logging.warning("expected one reference file for %s, found %s",
                            text_name, reference_xml_names)
            if not reference_xml_names:
                continue
            # more than one candidate: fall back to the first
            reference_xml_name = reference_xml_names[0]
        reference_xml_path = os.path.join(reference_dir, sub_dir,
                                          reference_xml_name)
        reference_data = _load(reference_xml_path)

        # check for self-references in the annotations, which cause equality and hashing to fail
        self_reference = reference_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping reference file %s with self-referential annotation %s"
            logging.warning(msg, reference_xml_path, self_reference.id)
            continue

        # find and load the corresponding predicted data from its Anafora XML
        predicted_xml_glob = os.path.join(predicted_dir, sub_dir,
                                          text_name + "*.xml")
        predicted_xml_paths = [
            f for f in glob.glob(predicted_xml_glob)
            if re.search(xml_name_regex, f) is not None
        ]
        try:
            [predicted_xml_path] = predicted_xml_paths
            predicted_data = _load(predicted_xml_path)
        except ValueError:
            logging.warning("expected one predicted file at %s, found %s",
                            predicted_xml_glob, predicted_xml_paths)
            if not predicted_xml_paths:
                # no prediction at all: score against empty data
                predicted_xml_path = None
                predicted_data = anafora.AnaforaData()
            else:
                predicted_xml_path = predicted_xml_paths[0]
                predicted_data = _load(predicted_xml_path)

        # check for self-references in the annotations, which cause equality and hashing to fail
        self_reference = predicted_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping predicted file %s with self-referential annotation %s"
            logging.warning(msg, predicted_xml_path, self_reference.id)
            predicted_data = anafora.AnaforaData()

        # determine the path for the raw text source file
        if text_dir is None:
            text_path = os.path.join(reference_dir, sub_dir, text_name)
        else:
            text_path = os.path.join(text_dir, text_name)

        # if no raw text was found, then asking for the text of an annotation is an error
        # (isfile is already False for paths that do not exist)
        if not os.path.isfile(text_path):

            def _span_text(_):
                raise RuntimeError(
                    "no text file found at {0}".format(text_path))

        # otherwise, the text of an annotation can be extracted based on its spans
        else:
            with open(text_path) as text_file:
                text = text_file.read()

            def _flatten(items):
                # a (start, end) pair is a leaf; anything else is a container of spans.
                # The emptiness check keeps an empty tuple from raising IndexError.
                if isinstance(items, tuple) and items and isinstance(
                        items[0], int):
                    yield items
                else:
                    for item in items:
                        for flattened_items in _flatten(item):
                            yield flattened_items

            def _span_text(spans):
                return "...".join(text[start:end]
                                  for start, end in _flatten(spans))

        # score this data and update the overall scores
        named_scores = score_data(reference_data,
                                  predicted_data,
                                  include,
                                  exclude,
                                  scores_type=scores_type,
                                  annotation_wrapper=annotation_wrapper)

        # if there were some predictions, and if we're using scores that keep track of errors, log the errors
        if predicted_xml_paths:
            for scores in named_scores.values():
                for annotation, message in getattr(scores, "errors", []):
                    logging.debug('%s: %s: "%s" %s"', text_name, message,
                                  _span_text(annotation.spans), annotation)

        # generate the file name and the resulting scores
        yield text_name, named_scores
def test_schema_validate():
    """Walk a document through invalid and valid states and check Schema.errors each time."""
    schema_xml = """
        <schema>
        <defaultattribute>
            <required>True</required>
        </defaultattribute>
        <definition>
            <entities>
                <entity type="X">
                        <properties>
                                <property type="A" input="choice">x,y</property>
                                <property type="B" />
                                <property type="C" instanceOf="Y,Z" />
                        </properties>
                </entity>
                <entity type="Y" />
                <entity type="Z" />
            </entities>
            <relations>
                <relation type="R">
                    <properties>
                        <property type="D" instanceOf="X" required="False" />
                        <property type="E" instanceOf="Y,Z" required="False" />
                    </properties>
                </relation>
            </relations>
        </definition>
        </schema>
        """
    schema = anafora.validate.Schema(
        anafora.ElementTree.fromstring(schema_xml))
    document = anafora.AnaforaData()

    def problems():
        # re-validate the document in its current state
        return schema.errors(document)

    # an X entity with only property A set
    x_entity = anafora.AnaforaEntity()
    x_entity.id = "@1@"
    x_entity.type = "X"
    x_entity.properties["A"] = "x"
    document.annotations.append(x_entity)
    assert problems()

    # adding B, and then a plain-string C, still leaves errors
    x_entity.properties["B"] = "y"
    assert problems()
    x_entity.properties["C"] = "z"
    assert problems()

    # pointing C at another X entity is an error; retyping that entity to Y is not
    target_entity = anafora.AnaforaEntity()
    target_entity.id = "@2@"
    target_entity.type = "X"
    document.annotations.append(target_entity)
    x_entity.properties["C"] = target_entity
    assert problems()
    target_entity.type = "Y"
    assert not problems()

    # A accepts "x" and "y" but not "z"
    x_entity.properties["A"] = "y"
    assert not problems()
    x_entity.properties["A"] = "z"
    assert problems()
    x_entity.properties["A"] = "x"
    assert not problems()

    # a relation with an empty type is an error; type R is accepted
    link = anafora.AnaforaRelation()
    link.id = "@3@"
    link.type = ""
    document.annotations.append(link)
    assert problems()
    link.type = "R"
    assert not problems()

    # D may reference the X entity; E may reference the Y entity but not the X one
    link.properties["D"] = x_entity
    assert not problems()
    link.properties["E"] = x_entity
    assert problems()
    link.properties["E"] = target_entity
    assert not problems()

    # a property that the schema does not declare for R is an error
    link.properties["X"] = "Y"
    assert problems()
# Example no. 29
# 0
def to_anafora_data(timeml_path):
    """
    Build Anafora annotations from the annotations of a TimeML XML file.

    :param string timeml_path: path of the TimeML XML
    :return anafora.AnaforaData: an Anafora version of the TimeML annotations
    :raises ValueError: if a TimeML id is not a non-digit prefix followed by digits, or if
        the text of an entity element differs from the document text at its computed span
    """
    # tags that become entities; every other identified tag becomes a relation
    entity_tags = {"TIMEX3", "EVENT", "SIGNAL"}
    # for each annotated tag, the attribute holding its TimeML id
    id_attr_by_tag = {
        "TIMEX3": "tid",
        "EVENT": "eid",
        "SIGNAL": "sid",
        "MAKEINSTANCE": "eiid",
        "TLINK": "lid",
        "SLINK": "lid",
        "ALINK": "lid",
    }
    # attributes whose values are TimeML ids of other annotations
    ref_id_attrs = {
        "eventID", "signalID", "beginPoint", "endPoint", "valueFromFunction",
        "anchorTimeID", "eventInstanceID", "timeID",
        "relatedToEventInstance", "relatedToTime", "subordinatedEventInstance",
        "tagID"
    }
    # TimeML id prefix -> character used in the Anafora id
    prefix_to_char = {'t': 'e', 'e': 'e', 's': 'e', 'ei': 'r', 'l': 'r'}

    text = to_text(timeml_path)
    data = anafora.AnaforaData()
    root = anafora.ElementTree.parse(timeml_path).getroot()

    # first pass: assign an Anafora id to every identified element, numbering from 1,
    # so that references can be resolved regardless of document order
    file_base = os.path.splitext(os.path.basename(timeml_path))[0]
    anafora_id_for = {}
    identified = (elem for elem in root.iter() if elem.tag in id_attr_by_tag)
    for number, elem in enumerate(identified, 1):
        timeml_id = elem.attrib[id_attr_by_tag[elem.tag]]
        [(prefix, _digits)] = re.findall(r'^(\D+)(\d+)$', timeml_id)
        anafora_id_for[timeml_id] = '{0:d}@{1}@{2}@gold'.format(
            number, prefix_to_char[prefix], file_base)

    def add_annotations_from(elem, offset=0):
        # returns the character offset just past elem (including its tail text)
        start = offset
        is_entity = False
        annotation = None
        if elem.tag in id_attr_by_tag:
            if elem.tag in entity_tags:
                annotation = anafora.AnaforaEntity()
            else:
                annotation = anafora.AnaforaRelation()
            id_attr = id_attr_by_tag[elem.tag]
            annotation.id = anafora_id_for[elem.attrib[id_attr]]
            annotation.type = elem.tag
            is_entity = isinstance(annotation, anafora.AnaforaEntity)
            if is_entity:
                # placeholder span; the end is filled in once the content has been measured
                annotation.spans = ((start, start), )
            for name, value in elem.attrib.items():
                if name == id_attr:
                    continue
                if name in ref_id_attrs:
                    value = anafora_id_for[value]
                annotation.properties[name] = value
            data.annotations.append(annotation)

        # advance past the element's own text and everything nested inside it
        if elem.text is not None:
            offset += len(elem.text)
        for child in elem:
            offset = add_annotations_from(child, offset)

        if is_entity:
            annotation.spans = ((start, offset), )
            covered = text[start:offset]
            if elem.text != covered:
                raise ValueError('{0}: "{1}" != "{2}"'.format(
                    timeml_path, elem.text, covered))

        # the tail text follows the element, so it lies outside the span
        if elem.tail is not None:
            offset += len(elem.tail)
        return offset

    add_annotations_from(root)
    return data
def test_delete_excluded():
    """Check scoring of cross-referencing entities under exclude/include.

    In both documents Z points at Y through property ``A`` and Y points at
    X through property ``B``; W also carries a ``B`` property. The X
    entities have different spans, and the predicted W has an empty ``B``.
    The assertions pin the expected (correct, reference, predicted) counts
    for an unfiltered run, for runs excluding X, Y and Z in turn, and for
    a run restricted to spans only.
    """
    reference_xml = """
    <data>
        <annotations>
            <entity>
                <id>1@e</id>
                <type>Z</type>
                <span>1, 3</span>
                <properties>
                    <A>2@e</A>
                </properties>
            </entity>
            <entity>
                <id>2@e</id>
                <type>Y</type>
                <span>4, 6</span>
                <properties>
                    <B>3@e</B>
                </properties>
            </entity>
            <entity>
                <id>3@e</id>
                <type>X</type>
                <span>7, 9</span>
            </entity>
            <entity>
                <id>4@e</id>
                <type>W</type>
                <span>20, 30</span>
                <properties>
                    <B>3@e</B>
                </properties>
            </entity>
        </annotations>
    </data>
    """
    predicted_xml = """
    <data>
        <annotations>
            <entity>
                <id>4@e</id>
                <type>Z</type>
                <span>1, 3</span>
                <properties>
                    <A>5@e</A>
                </properties>
            </entity>
            <entity>
                <id>5@e</id>
                <type>Y</type>
                <span>4, 6</span>
                <properties>
                    <B>6@e</B>
                </properties>
            </entity>
            <entity>
                <id>6@e</id>
                <type>X</type>
                <span>10, 15</span>
            </entity>
            <entity>
                <id>7@e</id>
                <type>W</type>
                <span>20, 30</span>
                <properties>
                    <B></B>
                </properties>
            </entity>
        </annotations>
    </data>
    """
    reference = anafora.AnaforaData(
        anafora.ElementTree.fromstring(reference_xml))
    predicted = anafora.AnaforaData(
        anafora.ElementTree.fromstring(predicted_xml))

    def check(named_scores, expectations):
        # expectations: ordered (key, (correct, reference, predicted)) pairs
        for key, (n_correct, n_reference, n_predicted) in expectations:
            scores = named_scores[key]
            assert scores.correct == n_correct
            assert scores.reference == n_reference
            assert scores.predicted == n_predicted

    # no filtering: nothing is expected to match
    check(anafora.evaluate.score_data(reference, predicted), [
        ("X", (0, 1, 1)),
        ("Y", (0, 1, 1)),
        ("Z", (0, 1, 1)),
        ("W", (0, 1, 1)),
    ])

    # excluding X: Y, Z and W are all expected to match
    check(anafora.evaluate.score_data(reference, predicted, exclude={"X"}), [
        ("Y", (1, 1, 1)),
        ("Z", (1, 1, 1)),
        ("W", (1, 1, 1)),
    ])

    # excluding Y: Z is expected to match, X still is not
    check(anafora.evaluate.score_data(reference, predicted, exclude={"Y"}), [
        ("X", (0, 1, 1)),
        ("Z", (1, 1, 1)),
    ])

    # excluding Z: neither X nor Y is expected to match
    check(anafora.evaluate.score_data(reference, predicted, exclude={"Z"}), [
        ("X", (0, 1, 1)),
        ("Y", (0, 1, 1)),
    ])

    # spans only: every type but X has identical spans in both documents
    span_scores = anafora.evaluate.score_data(
        reference, predicted, include={("*", "<span>")})
    check(span_scores, [
        (("X", "<span>"), (0, 1, 1)),
        (("Y", "<span>"), (1, 1, 1)),
        (("Z", "<span>"), (1, 1, 1)),
        (("W", "<span>"), (1, 1, 1)),
    ])