Beispiel #1
0
    def add_annotations_from(elem, offset=0):
        """
        Recursively turns `elem` and its descendants into Anafora annotations.

        The element tree is walked depth-first while a running character offset is advanced by the length of each
        element's `text` and `tail`. An element whose tag is a key of `tag_id_attrs` produces an entity (tag in
        `entity_tags`) or a relation (any other tag), which is appended to `data.annotations`.

        :param elem: the XML element to process
        :param int offset: the character offset at which `elem` starts
        :return: the character offset immediately after `elem` and its tail
        :raises ValueError: if an entity element's `text` differs from the slice of `text` covered by its span
        """
        begin = offset
        created = None
        is_entity = False
        tag = elem.tag

        if tag in tag_id_attrs:
            if tag in entity_tags:
                created = anafora.AnaforaEntity()
            else:
                created = anafora.AnaforaRelation()
            is_entity = isinstance(created, anafora.AnaforaEntity)
            id_attr = tag_id_attrs[tag]
            created.id = timeml_id_to_anafora_id[elem.attrib[id_attr]]
            created.type = tag
            if is_entity:
                # provisional zero-width span; the end is only known once the children have been walked
                created.spans = ((begin, begin), )

            # every attribute other than the id becomes a property; id references are translated
            for attr_name, attr_value in elem.attrib.items():
                if attr_name == id_attr:
                    continue
                if attr_name in ref_id_attrs:
                    attr_value = timeml_id_to_anafora_id[attr_value]
                created.properties[attr_name] = attr_value
            data.annotations.append(created)

        # advance past this element's own text, then past each child (including the child's tail)
        if elem.text is not None:
            offset += len(elem.text)
        for child in elem:
            offset = add_annotations_from(child, offset)

        if is_entity:
            created.spans = ((begin, offset), )
            covered = text[begin:offset]
            if elem.text != covered:
                raise ValueError('{0}: "{1}" != "{2}"'.format(
                    timeml_path, elem.text, covered))

        # the tail belongs to the parent's text flow, so it is counted after the span is fixed
        if elem.tail is not None:
            offset += len(elem.tail)
        return offset
Beispiel #2
0
def add_relations_to_closest(data,
                             source_type,
                             target_type,
                             relation_type,
                             relation_source_property_name,
                             relation_target_property_name,
                             relation_other_properties=None):
    """
    Links every `source_type` annotation to its nearest `target_type` annotation with a new relation.

    The distance between two annotations is the smallest absolute difference between any character offset of one and
    any character offset of the other. When `data` has no `target_type` annotations, no relations are created.
    `data.indent()` is called before returning in either case.

    :param anafora.AnaforaData data: the Anafora data to which the relations are added
    :param str source_type: the type of the annotations each relation starts from
    :param str target_type: the type of the annotations each relation may point to
    :param str relation_type: the type given to every relation that is created
    :param str relation_source_property_name: the relation property that receives the source annotation
    :param str relation_target_property_name: the relation property that receives the target annotation
    :param list relation_other_properties: (name, value) tuples of additional properties to set on every created
        relation
    """

    # character offsets of every source and target annotation, keyed by annotation id
    offsets_by_id = {
        entity.id: list(_flatten_to_ints(entity.spans))
        for entity_type in (source_type, target_type)
        for entity in data.annotations.select_type(entity_type)
    }

    candidates = list(data.annotations.select_type(target_type))
    if candidates:
        # snapshot the sources first because relations are appended to data.annotations inside the loop
        for origin in list(data.annotations.select_type(source_type)):
            origin_offsets = offsets_by_id[origin.id]

            # the closest candidate is the one with the smallest offset-to-offset gap
            closest = min(
                candidates,
                key=lambda candidate: min(
                    abs(here - there)
                    for here in origin_offsets
                    for there in offsets_by_id[candidate.id]))

            # build the relation as described by the arguments
            link = anafora.AnaforaRelation()
            link.id = "{0}@{1}@{2}".format(origin.id, relation_type,
                                           closest.id)
            data.annotations.append(link)
            link.type = relation_type
            link.properties[relation_source_property_name] = origin
            link.properties[relation_target_property_name] = closest
            if relation_other_properties is not None:
                for prop_name, prop_value in relation_other_properties:
                    link.properties[prop_name] = prop_value

    data.indent()
Beispiel #3
0
    def write_xml(self, predicted_relations):
        """
        Write predictions in anafora XML format.

        `self.out_dir` is removed if it already exists and is then recreated. For every entry produced by
        `anafora.walk(self.xml_dir, self.xml_regex)`, the first reference file is read, its EVENT, TIMEX3,
        SECTIONTIME and DOCTIME annotations are copied, one CONTAINS TLINK is added per predicted relation of that
        note, and the result is written under the same sub-directory and file name inside `self.out_dir`.

        :param predicted_relations: an iterable of (contained_id, container_id) pairs; the note a pair belongs to is
            the third '@'-separated field of the container id
        """

        # start from an empty output directory (makedirs also creates missing parents)
        if os.path.isdir(self.out_dir):
            shutil.rmtree(self.out_dir)
        os.makedirs(self.out_dir)

        # key: note, value: list of rel arg tuples
        note2rels = defaultdict(list)

        # map notes to relations in these notes
        # (pairs arrive as (contained, container) and are stored as (container, contained))
        for contained_id, container_id in predicted_relations:
            note_name = container_id.split('@')[2]
            note2rels[note_name].append((container_id, contained_id))

        # iterate over reference anafora xml files
        for sub_dir, text_name, file_names in anafora.walk(
                self.xml_dir, self.xml_regex):

            # only the first matching file of each entry is used
            ref_file_name = file_names[0]
            path = os.path.join(self.xml_dir, sub_dir, ref_file_name)
            ref_data = anafora.AnaforaData.from_file(path)

            # make a new XML file
            generated_data = anafora.AnaforaData()

            # copy gold events and times
            for annotation_type in ('EVENT', 'TIMEX3', 'SECTIONTIME',
                                    'DOCTIME'):
                copy_annotations(ref_data, generated_data, annotation_type)

            # add generated relations
            note_name = ref_file_name.split('.')[0]
            for container_id, contained_id in note2rels[note_name]:
                relation = anafora.AnaforaRelation()
                # NOTE: ids are the digits of a random float, so uniqueness is likely but not guaranteed
                relation.id = str(random.random())[2:]
                relation.type = 'TLINK'
                relation.parents_type = 'TemporalRelations'
                relation.properties['Source'] = container_id
                relation.properties['Type'] = 'CONTAINS'
                relation.properties['Target'] = contained_id
                generated_data.annotations.append(relation)

            # write xml to file; the sub-directory may be nested, or may already exist (e.g. when sub_dir is
            # empty), so create it with makedirs only when it is missing
            generated_data.indent()
            out_sub_dir = os.path.join(self.out_dir, sub_dir)
            if not os.path.isdir(out_sub_dir):
                os.makedirs(out_sub_dir)
            out_path = os.path.join(out_sub_dir, ref_file_name)
            generated_data.to_file(out_path)
def test_schema_validate():
    """
    Checks `Schema.errors` against data that is mutated step by step.

    The schema declares entity type X with properties A (choice of x,y), B and C (instanceOf Y,Z), entity types Y
    and Z without properties, and relation type R with optional properties D (instanceOf X) and E (instanceOf Y,Z).
    Properties are required by default.
    """
    schema_xml = """
        <schema>
        <defaultattribute>
            <required>True</required>
        </defaultattribute>
        <definition>
            <entities>
                <entity type="X">
                        <properties>
                                <property type="A" input="choice">x,y</property>
                                <property type="B" />
                                <property type="C" instanceOf="Y,Z" />
                        </properties>
                </entity>
                <entity type="Y" />
                <entity type="Z" />
            </entities>
            <relations>
                <relation type="R">
                    <properties>
                        <property type="D" instanceOf="X" required="False" />
                        <property type="E" instanceOf="Y,Z" required="False" />
                    </properties>
                </relation>
            </relations>
        </definition>
        </schema>
        """
    schema = anafora.validate.Schema(
        anafora.ElementTree.fromstring(schema_xml))
    data = anafora.AnaforaData()

    def invalid():
        # True when the schema reports at least one error for the current state of `data`
        return bool(schema.errors(data))

    # an X entity is filled in one property at a time; errors are expected until A, B and C are all acceptable
    first = anafora.AnaforaEntity()
    first.id = "@1@"
    first.type = "X"
    first.properties["A"] = "x"
    data.annotations.append(first)
    assert invalid()
    first.properties["B"] = "y"
    assert invalid()
    first.properties["C"] = "z"
    assert invalid()

    # pointing C at another X entity is still expected to be invalid; retyping that entity to Y is expected to pass
    second = anafora.AnaforaEntity()
    second.id = "@2@"
    second.type = "X"
    data.annotations.append(second)
    first.properties["C"] = second
    assert invalid()
    second.type = "Y"
    assert not invalid()

    # A accepts only the listed choices x and y
    first.properties["A"] = "y"
    assert not invalid()
    first.properties["A"] = "z"
    assert invalid()
    first.properties["A"] = "x"
    assert not invalid()

    # a relation with an empty type is expected to be rejected, type R to be accepted
    link = anafora.AnaforaRelation()
    link.id = "@3@"
    link.type = ""
    data.annotations.append(link)
    assert invalid()
    link.type = "R"
    assert not invalid()

    # D takes the X entity; E rejects the X entity and accepts the Y entity
    link.properties["D"] = first
    assert not invalid()
    link.properties["E"] = first
    assert invalid()
    link.properties["E"] = second
    assert not invalid()

    # a property that the schema does not declare for R is expected to be rejected
    link.properties["X"] = "Y"
    assert invalid()
def convert_thyme_qa_to_anafora_xml(input_dir, output_dir):
    """
    Converts THYME question-answer `*_qa.txt` files into Anafora XML files.

    Every file under `input_dir` (searched recursively) whose name ends with `_qa.txt` is parsed. Each answer text
    clip becomes an EVENT entity, and each question becomes a Question relation with one `Answer` element per entity
    created for it. The result for `<base>_qa.txt` is written to
    `<output_dir>/<base>/<base>.THYME_QA.preannotation.completed.xml`; the directory is created if it is missing.

    :param str input_dir: the directory that is searched for `*_qa.txt` files
    :param str output_dir: the directory under which one sub-directory per input file is written
    :raises ValueError: if a file does not split into exactly a header and a body, if an annotation does not match
        the expected Question/Answer/Confidence/Difficulty/DocTimeRel/Text Clip layout, or if its text clips cannot
        all be parsed
    """
    qa_suffix = "_qa.txt"
    header_sep_pattern = regex.compile(r'\s*=====+\s*')
    annotation_sep_pattern = regex.compile(r'\s*-----+\s*')
    annotation_pattern = regex.compile(
        r'^Question:(.*?)\nAnswer:(.*?)\nConfidence:(.*?)\n' +
        r'Difficulty:(.*?)\nDocTimeRel:(.*?)\n(Text Clip:.*)$', regex.DOTALL)
    text_clip_pattern = regex.compile(
        r'Text Clip:\s+\d[\w.]*\s+(\d+),(\d+) (Exact|Support)_Answer ' +
        r'Use_(Time_Span|DocTimeRel) ?(.*)\n(.*)(?:\n|$)')

    # iterate through all _qa.txt files in the input directory
    for input_root, _dir_names, input_file_names in os.walk(input_dir):
        for input_file_name in input_file_names:
            if not input_file_name.endswith(qa_suffix):
                continue
            file_base = input_file_name[:-len(qa_suffix)]
            input_path = os.path.join(input_root, input_file_name)

            # read bytes and decode explicitly: calling .decode() on the result of a text-mode read only works on
            # Python 2, whereas a binary read can be decoded on both Python 2 and Python 3
            with open(input_path, 'rb') as input_file:
                text = input_file.read().decode('ascii')

            # the file must consist of exactly a header and a body
            sections = header_sep_pattern.split(text)
            if len(sections) != 2:
                raise ValueError(
                    "Expected exactly one header separator in " + input_path)
            body_text = sections[1]

            # create one Anafora XML for each file
            data = anafora.AnaforaData()
            relation_count = 1
            entity_count = 1

            # parse the annotations from the THYME question-answer format
            for annotation_text in annotation_sep_pattern.split(
                    body_text.rstrip(" \n\r-")):
                match = annotation_pattern.match(annotation_text)
                if match is None:
                    raise ValueError("Invalid annotation text:\n" +
                                     annotation_text)
                (question, _answer, confidence, difficulty, doc_time_rel,
                 text_clip_text) = [s.strip() for s in match.groups()]

                # each text clip occupies two lines, so anything else means some clip failed to parse
                text_clip_matches = text_clip_pattern.findall(text_clip_text)
                if len(text_clip_text.splitlines()
                       ) != 2 * len(text_clip_matches):
                    raise ValueError(
                        "Invalid Text Clips in annotation text:\n" +
                        annotation_text)

                # create Anafora XML annotations for the answers
                entities = []
                for (begin_text, end_text, _answer_kind, time_or_doc_time_rel,
                     _type_text, _clip_text) in text_clip_matches:
                    entity_annotation = anafora.AnaforaEntity()
                    entity_annotation.id = '{0:d}@{1}@{2}@gold'.format(
                        entity_count, 'e', file_base)
                    entity_annotation.spans = ((int(begin_text),
                                                int(end_text)), )
                    entity_annotation.type = 'EVENT'
                    entity_annotation.parents_type = 'TemporalEntities'
                    if time_or_doc_time_rel == 'DocTimeRel':
                        entity_annotation.properties[
                            'DocTimeRel'] = doc_time_rel.upper()
                    entity_count += 1
                    data.annotations.append(entity_annotation)
                    entities.append(entity_annotation)

                # create an Anafora XML annotation for the question
                question_annotation = anafora.AnaforaRelation()
                question_annotation.id = '{0:d}@{1}@{2}@gold'.format(
                    relation_count, 'r', file_base)
                question_annotation.type = 'Question'
                question_annotation.parents_type = 'TemporalQuestions'
                question_annotation.properties['Question'] = question
                question_annotation.properties['Confidence'] = confidence
                question_annotation.properties['Difficulty'] = difficulty
                # FIXME: hacking XML here because current API doesn't allow properties with multiple values
                for entity in entities:
                    property_elem = anafora.ElementTree.SubElement(
                        question_annotation.properties.xml, 'Answer')
                    property_elem.text = entity.id
                data.annotations.append(question_annotation)
                relation_count += 1

            # write the Anafora data out as XML
            output_file_dir = os.path.join(output_dir, file_base)
            output_file_path = os.path.join(
                output_file_dir,
                file_base + ".THYME_QA.preannotation.completed.xml")
            if not os.path.exists(output_file_dir):
                os.makedirs(output_file_dir)
            data.indent()
            data.to_file(output_file_path)
def test_relation_to_closest():
    """
    Checks `anafora.heuristic.add_relations_to_closest` on three X entities and three Y entities.

    The expected Z relations pair X entity 1 with Y entity 4, X entity 2 with Y entity 5 and X entity 3 with Y
    entity 5, each carrying the extra property foo=bar. The function is also called with a source type and with a
    target type that have no annotations, to confirm that neither call raises.
    """
    def get_xml():
        # a fresh element tree on every call, so each AnaforaData below starts from the same unmodified input
        fixture = """
            <data>
                <annotations>
                    <entity>
                        <id>1</id>
                        <span>0,5</span>
                        <type>X</type>
                    </entity>
                    <entity>
                        <id>2</id>
                        <span>15,20</span>
                        <type>X</type>
                    </entity>
                    <entity>
                        <id>3</id>
                        <span>25,30</span>
                        <type>X</type>
                    </entity>
                    <entity>
                        <id>4</id>
                        <span>0,3</span>
                        <type>Y</type>
                    </entity>
                    <entity>
                        <id>5</id>
                        <span>21,24</span>
                        <type>Y</type>
                    </entity>
                    <entity>
                        <id>6</id>
                        <span>35,40</span>
                        <type>Y</type>
                    </entity>
                </annotations>
            </data>
            """
        return anafora.ElementTree.fromstring(fixture)

    def expect_relation(owner, relation_id, source_id, target_id):
        # build one hand-written Z relation inside `owner`, linking the two entities with the given ids
        expected = anafora.AnaforaRelation()
        expected.id = relation_id
        owner.annotations.append(expected)
        expected.type = "Z"
        expected.properties["source"] = owner.annotations.select_id(source_id)
        expected.properties["target"] = owner.annotations.select_id(target_id)
        expected.properties["foo"] = "bar"
        return expected

    # the relations the heuristic is expected to produce, built by hand on a separate copy of the data
    reference = anafora.AnaforaData(get_xml())
    expected_relations = [
        expect_relation(reference, relation_id, source_id, target_id)
        for relation_id, source_id, target_id in (("7", "1", "4"),
                                                  ("8", "2", "5"),
                                                  ("9", "3", "5"))
    ]

    generated = anafora.AnaforaData(get_xml())
    anafora.heuristic.add_relations_to_closest(generated, "X", "Y", "Z",
                                               "source", "target",
                                               [("foo", "bar")])
    assert set(generated.annotations.select_type("Z")) == set(
        expected_relations)

    # make sure it doesn't fail with 0 source and 0 target annotations
    no_sources = anafora.AnaforaData(get_xml())
    anafora.heuristic.add_relations_to_closest(no_sources, "A", "Y", "Z",
                                               "source", "target")
    no_targets = anafora.AnaforaData(get_xml())
    anafora.heuristic.add_relations_to_closest(no_targets, "X", "B", "Z",
                                               "source", "target")