コード例 #1
0
def test_recursive_entity():
    """Check self-reference detection on a self-loop and on a diamond graph."""
    # Case 1: an entity whose own property points back at itself must be
    # reported by the entity and by the collection that holds it.
    data = anafora.AnaforaData()
    looped = anafora.AnaforaEntity()
    looped.id = "@1@"
    data.annotations.append(looped)
    looped.properties["self"] = looped
    assert looped.is_self_referential()
    assert data.annotations.find_self_referential().id == looped.id

    # Case 2: a diamond (D -> B -> A and D -> C -> A). A is reachable from D
    # along two paths, but nothing refers back to itself, so D is not flagged.
    data = anafora.AnaforaData()
    by_id = {}
    for identifier in ("A", "B", "C", "D"):
        node = anafora.AnaforaEntity()
        node.id = identifier
        data.annotations.append(node)
        by_id[identifier] = node
    links = (("B", "x", "A"), ("C", "y", "A"), ("D", "1", "B"),
             ("D", "2", "C"))
    for owner, name, target in links:
        by_id[owner].properties[name] = by_id[target]
    assert not by_id["D"].is_self_referential()
コード例 #2
0
def add_entity(data, doc_name, label, offset):
    """Append a new entity for one labelled span to ``data``.

    Nothing is added when ``label`` is None.

    :param data: object exposing ``xml`` (an element tree that is searched
        for ``annotations/entity``) and ``annotations`` (supports ``append``)
        -- presumably an ``anafora.AnaforaData``; confirm against callers
    :param doc_name: value used as the suffix of the generated entity id
    :param label: label string, or None to skip; every occurrence of "B-"
        is removed from it to form the entity type
    :param offset: indexable whose elements 0 and 1 become the single span
    """
    if label is None:
        return
    # Exactly one entity is built; the original also constructed a second
    # instance and discarded it immediately.
    entity = anafora.AnaforaEntity()
    # The id prefix is the number of entities currently found in the XML.
    num_entities = len(data.xml.findall("annotations/entity"))
    entity.id = "%s@%s" % (num_entities, doc_name)
    entity.spans = ((offset[0], offset[1]), )
    entity.type = label.replace("B-", "")
    data.annotations.append(entity)
コード例 #3
0
def add_entity(data, doc_name, vote_one_result):
    """Append a new entity described by a "start:end:type" string to ``data``.

    :param data: object exposing ``xml`` (an element tree that is searched
        for ``annotations/entity``) and ``annotations`` (supports ``append``)
        -- presumably an ``anafora.AnaforaData``; confirm against callers
    :param doc_name: value used as the suffix of the generated entity id
    :param vote_one_result: colon-separated string; fields 0 and 1 are parsed
        as integers for the span and field 2 becomes the entity type
    :raises IndexError: if the string has fewer than three fields
    :raises ValueError: if field 0 or 1 is not an integer
    """
    # Exactly one entity is built; the original also constructed a second
    # instance and discarded it immediately.
    entity = anafora.AnaforaEntity()
    # The id prefix is the number of entities currently found in the XML.
    num_entities = len(data.xml.findall("annotations/entity"))
    entity.id = "%s@%s" % (num_entities, doc_name)
    fields = vote_one_result.split(":")
    entity.spans = ((int(fields[0]), int(fields[1])), )
    entity.type = fields[2]
    data.annotations.append(entity)
コード例 #4
0
    def add_annotations_from(elem, offset=0):
        """Recursively add annotations for ``elem`` and its descendants.

        Relies on names from the enclosing scope: ``tag_id_attrs``,
        ``entity_tags``, ``ref_id_attrs``, ``timeml_id_to_anafora_id``,
        ``data``, ``text`` and ``timeml_path``.

        :param elem: element exposing ``tag``, ``attrib``, ``text``, ``tail``
            and iteration over its children
        :param offset: character offset at which ``elem`` starts
        :return: the offset just past ``elem`` and its tail
        :raises ValueError: if an entity element's ``text`` differs from the
            slice of ``text`` covered by its span
        """
        start = offset
        annotation = None
        # Only tags listed in tag_id_attrs produce an annotation.
        if elem.tag in tag_id_attrs:
            annotation = anafora.AnaforaEntity(
            ) if elem.tag in entity_tags else anafora.AnaforaRelation()
            id_attr = tag_id_attrs[elem.tag]
            annotation.id = timeml_id_to_anafora_id[elem.attrib[id_attr]]
            annotation.type = elem.tag
            if isinstance(annotation, anafora.AnaforaEntity):
                # Provisional empty span; replaced below once the end offset
                # is known.
                annotation.spans = ((start, start), )
            # Every attribute except the id becomes a property; attributes
            # listed in ref_id_attrs are mapped through
            # timeml_id_to_anafora_id first.
            for name, value in elem.attrib.items():
                if name != id_attr:
                    if name in ref_id_attrs:
                        value = timeml_id_to_anafora_id[value]
                    annotation.properties[name] = value
            data.annotations.append(annotation)

        # Advance past this element's leading text, then past each child
        # (each recursive call also accounts for the child's tail).
        if elem.text is not None:
            offset += len(elem.text)
        for child in elem:
            offset = add_annotations_from(child, offset)

        if annotation is not None and isinstance(annotation,
                                                 anafora.AnaforaEntity):
            annotation.spans = ((start, offset), )
            # NOTE(review): elem.text only holds the text before the first
            # child, so an entity element that has children would appear to
            # fail this check -- confirm that entities are never nested.
            if elem.text != text[start:offset]:
                raise ValueError('{0}: "{1}" != "{2}"'.format(
                    timeml_path, elem.text, text[start:offset]))

        # The tail follows the closing tag, so it is counted after the span
        # has been fixed.
        if elem.tail is not None:
            offset += len(elem.tail)
        return offset
コード例 #5
0
  def write(self, predictions):
    """Write predictions in anafora XML format.

    The output directory ``self.out_dir`` is deleted and recreated. For each
    entry yielded by ``anafora.walk(self.xml_dir, self.xml_regex)`` the first
    XML file is read, its 'EVENT' annotations are copied (id, spans, type)
    and each copy gets a 'DocTimeRel' property taken from ``predictions``.

    :param predictions: indexable of keys into the module-level ``int2label``
        mapping, consumed in the order the events are visited across files
    :raises IndexError: if there are fewer predictions than events
    """

    # position of the next unused prediction; it runs across all files
    index = 0

    # start from an empty output directory
    if os.path.isdir(self.out_dir):
      shutil.rmtree(self.out_dir)
    os.mkdir(self.out_dir)

    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_dir, self.xml_regex):

      xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
      ref_data = anafora.AnaforaData.from_file(xml_path)

      data = anafora.AnaforaData()

      for event in ref_data.annotations.select_type('EVENT'):
        entity = anafora.AnaforaEntity()

        entity.id = event.id
        entity.spans = event.spans
        entity.type = event.type
        entity.properties['DocTimeRel'] = int2label[predictions[index]]

        data.annotations.append(entity)
        index += 1

      # os.mkdir fails when sub_dir has several path components or when the
      # directory already exists (e.g. an empty sub_dir); create it on demand.
      out_sub_dir = os.path.join(self.out_dir, sub_dir)
      if not os.path.isdir(out_sub_dir):
        os.makedirs(out_sub_dir)
      out_path = os.path.join(out_sub_dir, file_names[0])

      data.indent()
      data.to_file(out_path)
コード例 #6
0
def test_add_entity():
    """Follow the serialized XML while an entity is added and then edited."""
    empty_document = '<data />'
    typed_entity_open = ('<data><annotations><entity>' + '<id>1</id>' +
                         '<type>X</type>' + '<parentsType>Y</parentsType>')
    entity_close = '</entity></annotations></data>'

    data = anafora.AnaforaData()
    assert str(data) == empty_document

    # An entity without an id is rejected, the error mentions "id", and the
    # document stays empty.
    entity = anafora.AnaforaEntity()
    with pytest.raises(ValueError) as rejected:
        data.annotations.append(entity)
    assert "id" in str(rejected.value)
    assert str(data) == empty_document

    # Once it has an id the entity can be appended.
    entity.id = "1"
    data.annotations.append(entity)
    assert str(
        data
    ) == '<data><annotations><entity><id>1</id></entity></annotations></data>'

    # Changes made after appending show up in the serialized document.
    entity.type = "X"
    entity.parents_type = "Y"
    entity.properties["name1"] = "value1"
    assert str(data) == (typed_entity_open +
                         '<properties><name1>value1</name1></properties>' +
                         entity_close)

    # Deleting the only property removes the <properties> element as well.
    del entity.properties["name1"]
    assert str(data) == typed_entity_open + entity_close

    # Deleting a property that does not exist is an error.
    with pytest.raises(ValueError):
        del entity.properties["name2"]
コード例 #7
0
def build_an_entity(current_label: dict):
    """Build an Anafora entity from one label record.

    :param current_label: mapping with an "id" entry and a "value" mapping
        that holds "start", "end" and a non-empty "labels" sequence (the
        parameter was previously annotated as ``list`` although it is
        indexed by string keys)
    :return: a new ``anafora.AnaforaEntity`` whose id is the record's id,
        whose spans are the single pair (start, end) and whose type is the
        first label
    """
    value = current_label["value"]
    entity = anafora.AnaforaEntity()
    entity.id = current_label["id"]
    # spans is a tuple of (start, end) pairs; the outer tuple is now written
    # out explicitly instead of relying on a trailing comma.
    entity.spans = ((value["start"], value["end"]), )
    entity.type = value["labels"][0]
    return entity
コード例 #8
0
def copy_annotations(from_data, to_data, annot_type):
    """Add to ``to_data`` a new entity for each ``annot_type`` annotation.

    Only the id, spans and type of each selected annotation in ``from_data``
    are carried over; a fresh ``anafora.AnaforaEntity`` is created for each.
    """
    copied_fields = ("id", "spans", "type")
    for source in from_data.annotations.select_type(annot_type):
        duplicate = anafora.AnaforaEntity()
        for field in copied_fields:
            setattr(duplicate, field, getattr(source, field))
        to_data.annotations.append(duplicate)
コード例 #9
0
def test_duplicate_id():
    """Duplicate ids must be rejected when parsing XML and when appending."""
    duplicated_xml = '''
        <data>
            <annotations>
                <entity><id>1</id></entity>
                <entity><id>1</id></entity>
            </annotations>
        </data>'''
    # Loading a document that already contains two entities with one id.
    with pytest.raises(ValueError):
        anafora.AnaforaData(anafora.ElementTree.fromstring(duplicated_xml))

    # Appending a second entity whose id is already taken.
    data = anafora.AnaforaData()
    first, second = anafora.AnaforaEntity(), anafora.AnaforaEntity()
    first.id = second.id = "1"
    data.annotations.append(first)
    with pytest.raises(ValueError):
        data.annotations.append(second)
コード例 #10
0
def test_add_reference():
    """An entity-valued property needs both entities to be in the data.

    Assigning ``entity1`` as a property of ``entity2`` must raise ValueError
    (with "<annotations" in the message) while ``entity2`` has not been
    appended, and must serialize as the referenced id afterwards.
    """
    data = anafora.AnaforaData()
    entity1 = anafora.AnaforaEntity()
    entity1.id = "@1@"
    entity2 = anafora.AnaforaEntity()
    entity2.id = "@2@"

    # Neither entity has been appended yet.
    with pytest.raises(ValueError) as exception_info:
        entity2.properties["link"] = entity1
    assert "<annotations" in str(exception_info.value)

    # Only the referenced entity has been appended. The exception is captured
    # again here so the message check applies to this failure and not to the
    # stale one from the block above.
    data.annotations.append(entity1)
    with pytest.raises(ValueError) as exception_info:
        entity2.properties["link"] = entity1
    assert "<annotations" in str(exception_info.value)

    # With both entities appended the reference is accepted and is written
    # out as the id of the referenced entity.
    data.annotations.append(entity2)
    entity2.properties["link"] = entity1
    assert str(data) == (
        '<data><annotations>' + '<entity><id>@1@</id></entity>' +
        '<entity><id>@2@</id><properties><link>@1@</link></properties></entity>'
        + '</annotations></data>')
コード例 #11
0
ファイル: writer.py プロジェクト: dmitriydligach/Thyme
def test_add_entity():
    """Build a document with a single EVENT entity and write it to temp.xml."""

    document = anafora.AnaforaData()
    event = anafora.AnaforaEntity()
    event.id = '1@e@ID025_path_074@gold'
    document.annotations.append(event)

    # The remaining fields are filled in after the entity has been appended.
    for attribute, value in (('type', 'EVENT'),
                             ('parents_type', 'TemporalEntities')):
        setattr(event, attribute, value)
    event.properties['DocTimeRel'] = 'AFTER'

    document.indent()
    document.to_file('temp.xml')
コード例 #12
0
def span2xmlfiles(data_spans, file_name_simple):
    """Build indented Anafora data holding one entity per span record.

    :param data_spans: iterable of indexable records; elements 0 and 1 are
        converted with ``int`` and element 2 becomes the entity type
    :param str file_name_simple: suffix of every entity id, which has the
        form "<index>@e@<file_name_simple>"
    :return: the populated ``anafora.AnaforaData`` after ``indent()``
    """
    import anafora
    data = anafora.AnaforaData()
    # enumerate replaces a hand-maintained counter that shadowed builtin id()
    for entity_index, data_span in enumerate(data_spans):
        e = anafora.AnaforaEntity()
        # the stored end offset is the record's end plus one
        e.spans = ((int(data_span[0]), int(data_span[1]) + 1), )
        e.type = data_span[2]
        e.id = str(entity_index) + "@e@" + file_name_simple
        data.annotations.append(e)
    data.indent()
    return data
コード例 #13
0
def test_preannotated():
    """Run the regex annotator over data that already holds annotations.

    The pre-existing B entity on "bb" is expected to pick up the attributes
    of the 'bb' pattern, the pre-existing C entity is expected to receive the
    attributes given for type C in the second constructor argument, and the
    "aaa" match is expected to be added as a third, type A, annotation.
    """
    annotator = anafora.regex.RegexAnnotator(
        {
            'aa+': ('A', {'X': '2'}),
            'a': ('A', {}),
            'bb': ('B', {'Y': '1'}),
        },
        {'C': {'Z': '3'}})
    text = "bb aaa"

    # annotations present before the annotator runs: (id, type, spans)
    data = anafora.AnaforaData()
    preannotated = (
        ("1@preannotated", "B", ((0, 2), )),
        ("2@preannotated", "C", ((3, 6), )),
    )
    for identifier, entity_type, spans in preannotated:
        existing = anafora.AnaforaEntity()
        existing.id = identifier
        existing.type = entity_type
        existing.spans = spans
        data.annotations.append(existing)

    annotator.annotate(text, data)

    assert len(list(data.annotations)) == 3
    b_found, c_found, a_found = data.annotations
    expectations = (
        (b_found, "B", ((0, 2), ), {'Y': '1'}),
        (c_found, "C", ((3, 6), ), {'Z': '3'}),
        (a_found, "A", ((3, 6), ), {'X': '2'}),
    )
    for found, expected_type, expected_spans, expected_props in expectations:
        assert found.type == expected_type
        assert found.spans == expected_spans
        assert dict(found.properties.items()) == expected_props
コード例 #14
0
def test_remove():
    """Removing annotations updates both the collection and the XML."""
    document_open = '<data><annotations>'
    document_close = '</annotations></data>'
    second_xml = '<entity><id>@2@</id><properties><name>value</name></properties></entity>'

    data = anafora.AnaforaData()
    assert str(data) == '<data />'

    # populate the document with two entities
    first = anafora.AnaforaEntity()
    first.id = "@1@"
    data.annotations.append(first)
    second = anafora.AnaforaEntity()
    second.id = "@2@"
    second.properties["name"] = "value"
    data.annotations.append(second)
    assert list(data.annotations) == [first, second]
    assert str(data) == (document_open + '<entity><id>@1@</id></entity>' +
                         second_xml + document_close)

    # removing the first entity leaves only the second one
    data.annotations.remove(first)
    assert list(data.annotations) == [second]
    assert str(data) == document_open + second_xml + document_close

    # removing the last entity leaves an empty <annotations /> element
    data.annotations.remove(second)
    assert list(data.annotations) == []
    assert str(data) == '<data><annotations /></data>'
コード例 #15
0
    def write_xml(self, prediction_lookup):
        """Write predictions in anafora XML format.

        ``self.xml_out_dir`` is deleted and recreated. For each entry yielded
        by ``anafora.walk(self.xml_ref_dir, xml_regex)`` the first reference
        XML file is read, its 'EVENT' annotations are copied (id, spans,
        type) and each copy gets a 'DocTimeRel' property. The number of
        events that had no prediction is printed at the end.

        :param prediction_lookup: container mapping keys of the form
            "<sub_dir>|<start>|<end>" to DocTimeRel values; events whose key
            is absent are written with 'OVERLAP'
        """

        # make a directory to write anafora xml
        if os.path.isdir(self.xml_out_dir):
            shutil.rmtree(self.xml_out_dir)
        os.mkdir(self.xml_out_dir)

        # t5 occasionally fails to predict
        missing_predictions = []

        # iterate over reference xml files
        # look up the DTR prediction for each event
        # and write it in anafora format to the specified dir
        for sub_dir, text_name, file_names in \
                anafora.walk(self.xml_ref_dir, xml_regex):

            path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)
            data = anafora.AnaforaData()

            for event in ref_data.annotations.select_type('EVENT'):

                # make a new entity and copy some ref info
                entity = anafora.AnaforaEntity()
                entity.id = event.id
                start, end = event.spans[0]
                entity.spans = event.spans
                entity.type = event.type

                # lookup the prediction
                key = '|'.join((sub_dir, str(start), str(end)))
                if key in prediction_lookup:
                    entity.properties['DocTimeRel'] = prediction_lookup[key]
                else:
                    # use majority class for now
                    entity.properties['DocTimeRel'] = 'OVERLAP'
                    missing_predictions.append(key)

                data.annotations.append(entity)

            data.indent()
            # os.mkdir fails when sub_dir has several path components or when
            # the directory already exists (e.g. an empty sub_dir); create it
            # on demand instead.
            out_sub_dir = os.path.join(self.xml_out_dir, sub_dir)
            if not os.path.isdir(out_sub_dir):
                os.makedirs(out_sub_dir)
            out_path = os.path.join(out_sub_dir, file_names[0])
            data.to_file(out_path)

        print('number of missing predictions:', len(missing_predictions))
コード例 #16
0
    def annotate(self, text, data):
        """
        Runs the model's regular expressions over the text and records each match as an annotation.

        Annotations already present in ``data`` first receive any default attributes of their
        type that they do not have yet. A match whose (spans, type) pair equals that of an
        annotation that was present beforehand updates that annotation; any other match adds
        a new entity with an id of the form "<match number>@regex".

        :param str text: the text to be annotated
        :param anafora.AnaforaData data: the data to which the annotations should be added
        """

        # remember the annotations that are already there, keyed by (spans, type),
        # and fill in their missing default attributes
        existing = {}
        for annotation in data.annotations:
            existing[annotation.spans, annotation.type] = annotation
            defaults_by_type = self.default_type_attributes_map
            if defaults_by_type is None:
                continue
            if annotation.type not in defaults_by_type:
                continue
            for name, default in defaults_by_type[annotation.type].items():
                if name not in annotation.properties:
                    annotation.properties[name] = default

        # one alternation of capture groups, longest expression first
        # NOTE: the regex library is used instead of re because more than 100 groups are needed
        ordered_patterns = sorted(self.regex_type_attributes_map,
                                  key=len,
                                  reverse=True)
        alternatives = ['({0})'.format(source) for source in ordered_patterns]
        combined = regex.compile('|'.join(alternatives))

        # the index of the group that matched identifies the source expression
        for match_number, found in enumerate(combined.finditer(text)):
            source = ordered_patterns[found.lastindex - 1]
            entity_type, attributes = self.regex_type_attributes_map[source]
            spans = ((found.start(), found.end()), )
            lookup = (spans, entity_type)
            if lookup not in existing:
                entity = anafora.AnaforaEntity()
                entity.id = "{0}@regex".format(match_number)
                entity.type = entity_type
                entity.spans = spans
                data.annotations.append(entity)
            else:
                entity = existing[lookup]
            for name, value in attributes.items():
                entity.properties[name] = value
コード例 #17
0
ファイル: dtrdata.py プロジェクト: dmitriydligach/Thyme
    def write(self, predictions):
        """Write predictions in anafora XML format.

        ``self.xml_out_dir`` is deleted and recreated. For each entry yielded
        by ``anafora.walk(self.xml_ref_dir, xml_regex)`` the first reference
        XML file is read, its 'EVENT' annotations are copied (id, spans,
        type) and each copy gets a 'DocTimeRel' property. Events without a
        prediction are reported with ``print`` and left out of the output.

        :param predictions: iterable of keys into the module-level
            ``int2label`` mapping, aligned one-to-one with ``self.offsets``
        """

        # predictions are in the same order in which they were read
        prediction_lookup = dict(zip(self.offsets, predictions))

        # make a directory to write anafora xml
        if os.path.isdir(self.xml_out_dir):
            shutil.rmtree(self.xml_out_dir)
        os.mkdir(self.xml_out_dir)

        # iterate over reference xml files
        # look up the DTR prediction for each event
        # and write it in anafora format to the specified dir
        for sub_dir, text_name, file_names in \
                anafora.walk(self.xml_ref_dir, xml_regex):

            path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
            ref_data = anafora.AnaforaData.from_file(path)
            data = anafora.AnaforaData()

            for event in ref_data.annotations.select_type('EVENT'):

                # make a new entity and copy some ref info
                entity = anafora.AnaforaEntity()
                entity.id = event.id
                start, end = event.spans[0]
                entity.spans = event.spans
                entity.type = event.type

                # lookup the prediction; the key is built once and reused
                key = (sub_dir, start, end)
                if key not in prediction_lookup:
                    print('missing key:', key)
                    continue

                label = prediction_lookup[key]
                entity.properties['DocTimeRel'] = int2label[label]

                data.annotations.append(entity)

            data.indent()
            # os.mkdir fails when sub_dir has several path components or when
            # the directory already exists (e.g. an empty sub_dir); create it
            # on demand instead.
            out_sub_dir = os.path.join(self.xml_out_dir, sub_dir)
            if not os.path.isdir(out_sub_dir):
                os.makedirs(out_sub_dir)
            out_path = os.path.join(out_sub_dir, file_names[0])
            data.to_file(out_path)
コード例 #18
0
def span2xmlfiles(exp, target):
    """Write one Anafora XML file for each of the first ten documents.

    Span records are read from the "span_label_all<target>" resource under
    ``exp`` and document names from the 'raw_dir_simple' resource. Each
    document is printed and then saved as
    "<exp>/<name>/<name>.TimeNorm.gold.completed.xml".

    :param str exp: directory holding the span file; output is written below it
    :param str target: suffix appended to "span_label_all" to name the span file
    """
    import anafora

    raw_dir_simple = read1.read_from_json('raw_dir_simple')
    # The span file does not depend on the loop variable, so it is read once.
    # os.path.join replaces hard-coded "\\" separators, which only formed
    # real directory paths on Windows.
    all_spans = read1.read_json(os.path.join(exp, "span_label_all" + target))
    for data_id in range(0, 10):
        doc_name = raw_dir_simple[data_id]
        data = anafora.AnaforaData()
        # enumerate replaces a hand-maintained counter that shadowed id()
        for entity_index, data_span in enumerate(all_spans[data_id]):
            e = anafora.AnaforaEntity()
            # the stored end offset is the record's end plus one
            e.spans = ((int(data_span[0]), int(data_span[1]) + 1), )
            e.type = data_span[2]
            e.id = str(entity_index) + "@e@" + doc_name
            data.annotations.append(e)
        # call form of print: valid on Python 3 and same output on Python 2
        print(data)
        data.indent()

        output_dir = os.path.join(exp, doc_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        data.to_file(
            os.path.join(output_dir,
                         doc_name + ".TimeNorm.gold.completed.xml"))
コード例 #19
0
def convert_thyme_qa_to_anafora_xml(input_dir, output_dir):
    """Convert THYME question-answer files into Anafora XML files.

    Every file below ``input_dir`` whose name ends in "_qa.txt" is parsed.
    Each text clip of an annotation becomes an 'EVENT' entity and each
    question becomes a 'Question' relation with one 'Answer' element per
    entity. The result is written to
    "<output_dir>/<base>/<base>.THYME_QA.preannotation.completed.xml".

    :param str input_dir: directory tree searched for "*_qa.txt" files
    :param str output_dir: directory under which the XML files are written
    :raises ValueError: if a file does not split into exactly a header and a
        body, or if an annotation or its text clips do not match the
        expected format
    :raises UnicodeDecodeError: if an input file is not pure ASCII
    """
    # io.open decodes while reading on both Python 2 and Python 3; calling
    # .decode() on the result of read() only works on Python 2.
    import io

    _header_sep_pattern = regex.compile(r'\s*=====+\s*')
    _annotation_sep_pattern = regex.compile(r'\s*-----+\s*')
    _annotation_pattern = regex.compile(
        r'^Question:(.*?)\nAnswer:(.*?)\nConfidence:(.*?)\n' +
        r'Difficulty:(.*?)\nDocTimeRel:(.*?)\n(Text Clip:.*)$', regex.DOTALL)
    _text_clip_pattern = regex.compile(
        r'Text Clip:\s+\d[\w.]*\s+(\d+),(\d+) (Exact|Support)_Answer ' +
        r'Use_(Time_Span|DocTimeRel) ?(.*)\n(.*)(?:\n|$)')

    # iterate through all _qa.txt files in the input directory
    for input_root, dir_names, input_file_names in os.walk(input_dir):
        for input_file_name in input_file_names:
            if not input_file_name.endswith("_qa.txt"):
                continue
            file_base = input_file_name[:-7]

            # create one Anafora XML for each file
            data = anafora.AnaforaData()
            relation_count = 1
            entity_count = 1
            input_path = os.path.join(input_root, input_file_name)
            with io.open(input_path, encoding='ascii') as input_file:
                text = input_file.read()

            # parse the annotations from the THYME question-answer format
            _, body_text = _header_sep_pattern.split(text)
            for annotation_text in _annotation_sep_pattern.split(
                    body_text.rstrip(" \n\r-")):
                match = _annotation_pattern.match(annotation_text)
                if match is None:
                    raise ValueError("Invalid annotation text:\n" +
                                     annotation_text)
                groups = [s.strip() for s in match.groups()]
                (question, answer, confidence, difficulty, doc_time_rel,
                 text_clip_text) = groups
                text_clip_matches = _text_clip_pattern.findall(
                    text_clip_text)
                # every text clip takes up two lines, so any other line
                # count means that something was left unparsed
                if len(text_clip_text.splitlines()
                       ) != 2 * len(text_clip_matches):
                    raise ValueError(
                        "Invalid Text Clips in annotation text:\n" +
                        annotation_text)

                # create Anafora XML annotations for the answers
                entities = []
                for (begin_text, end_text, _, time_or_doc_time_rel,
                     type_text, clip_text) in text_clip_matches:
                    begin = int(begin_text)
                    end = int(end_text)
                    entity_annotation = anafora.AnaforaEntity()
                    entity_annotation.id = '{0:d}@{1}@{2}@gold'.format(
                        entity_count, 'e', file_base)
                    entity_annotation.spans = ((begin, end), )
                    entity_annotation.type = 'EVENT'
                    entity_annotation.parents_type = 'TemporalEntities'
                    if time_or_doc_time_rel == 'DocTimeRel':
                        entity_annotation.properties[
                            'DocTimeRel'] = doc_time_rel.upper()
                    entity_count += 1
                    data.annotations.append(entity_annotation)
                    entities.append(entity_annotation)

                # create an Anafora XML annotation for the question
                question_annotation = anafora.AnaforaRelation()
                question_annotation.id = '{0:d}@{1}@{2}@gold'.format(
                    relation_count, 'r', file_base)
                question_annotation.type = 'Question'
                question_annotation.parents_type = 'TemporalQuestions'
                question_annotation.properties['Question'] = question
                question_annotation.properties['Confidence'] = confidence
                question_annotation.properties['Difficulty'] = difficulty
                # FIXME: hacking XML here because current API doesn't allow properties with multiple values
                for entity in entities:
                    property_elem = anafora.ElementTree.SubElement(
                        question_annotation.properties.xml, 'Answer')
                    property_elem.text = entity.id
                data.annotations.append(question_annotation)
                relation_count += 1

            # write the Anafora data out as XML
            output_file_dir = os.path.join(output_dir, file_base)
            output_file_path = os.path.join(
                output_file_dir,
                file_base + ".THYME_QA.preannotation.completed.xml")
            if not os.path.exists(output_file_dir):
                os.makedirs(output_file_dir)
            data.indent()
            data.to_file(output_file_path)
def test_schema_validate():
    """Step through edits to the data and check schema.errors() each time.

    The schema declares entity types X (properties A, B, C), Y and Z, and a
    relation type R (properties D and E, both marked required="False").
    After every mutation the test asserts whether ``schema.errors(data)`` is
    truthy (problems reported) or falsy (data accepted).
    """
    schema = anafora.validate.Schema(anafora.ElementTree.fromstring("""
        <schema>
        <defaultattribute>
            <required>True</required>
        </defaultattribute>
        <definition>
            <entities>
                <entity type="X">
                        <properties>
                                <property type="A" input="choice">x,y</property>
                                <property type="B" />
                                <property type="C" instanceOf="Y,Z" />
                        </properties>
                </entity>
                <entity type="Y" />
                <entity type="Z" />
            </entities>
            <relations>
                <relation type="R">
                    <properties>
                        <property type="D" instanceOf="X" required="False" />
                        <property type="E" instanceOf="Y,Z" required="False" />
                    </properties>
                </relation>
            </relations>
        </definition>
        </schema>
        """))
    data = anafora.AnaforaData()
    entity1 = anafora.AnaforaEntity()
    entity1.id = "@1@"
    entity1.type = "X"
    # Only A is set: errors expected (presumably because B and C are still
    # missing and properties are required by default -- TODO confirm).
    entity1.properties["A"] = "x"
    data.annotations.append(entity1)
    assert schema.errors(data)
    # A and B set, C still missing: errors expected.
    entity1.properties["B"] = "y"
    assert schema.errors(data)
    # C set to a plain string rather than an annotation: errors expected.
    entity1.properties["C"] = "z"
    assert schema.errors(data)
    # C set to an entity of type X, which is not in C's instanceOf="Y,Z":
    # errors expected.
    entity2 = anafora.AnaforaEntity()
    entity2.id = "@2@"
    entity2.type = "X"
    data.annotations.append(entity2)
    entity1.properties["C"] = entity2
    assert schema.errors(data)
    # Retyping the referenced entity to Y makes the data acceptable.
    entity2.type = "Y"
    assert not schema.errors(data)
    # A accepts "x" and "y" (the values listed in the schema) but not "z".
    entity1.properties["A"] = "y"
    assert not schema.errors(data)
    entity1.properties["A"] = "z"
    assert schema.errors(data)
    entity1.properties["A"] = "x"
    assert not schema.errors(data)

    # A relation with an empty type is reported; type R is accepted even
    # with no properties set.
    relation = anafora.AnaforaRelation()
    relation.id = "@3@"
    relation.type = ""
    data.annotations.append(relation)
    assert schema.errors(data)
    relation.type = "R"
    assert not schema.errors(data)
    # D (instanceOf="X") accepts entity1, whose type is X.
    relation.properties["D"] = entity1
    assert not schema.errors(data)
    # E (instanceOf="Y,Z") rejects entity1 (type X) and accepts entity2
    # (type Y).
    relation.properties["E"] = entity1
    assert schema.errors(data)
    relation.properties["E"] = entity2
    assert not schema.errors(data)
    # A property name that the schema does not declare for R is reported.
    relation.properties["X"] = "Y"
    assert schema.errors(data)