def test_recursive_entity():
    """Check self-reference detection on a direct loop and on a diamond graph."""
    # Case 1: a single entity whose property points back at itself.
    looped_data = anafora.AnaforaData()
    looped = anafora.AnaforaEntity()
    looped.id = "@1@"
    looped_data.annotations.append(looped)
    looped.properties["self"] = looped
    assert looped.is_self_referential()
    offender = looped_data.annotations.find_self_referential()
    assert offender.id == looped.id

    # Case 2: D -> B -> A and D -> C -> A. A is reachable along two paths,
    # but no entity ever reaches itself.
    diamond_data = anafora.AnaforaData()
    by_id = {}
    for entity_id in ("A", "B", "C", "D"):
        node = anafora.AnaforaEntity()
        node.id = entity_id
        diamond_data.annotations.append(node)
        by_id[entity_id] = node
    links = (
        ("B", "x", "A"),
        ("C", "y", "A"),
        ("D", "1", "B"),
        ("D", "2", "C"),
    )
    for owner_id, property_name, target_id in links:
        by_id[owner_id].properties[property_name] = by_id[target_id]
    assert not by_id["D"].is_self_referential()
def add_entity(data, doc_name, label, offset):
    """Append a new entity of type ``label`` spanning ``offset`` to ``data``.

    :param data: Anafora data object; its ``xml`` is queried for existing
        entities and the new entity is appended to ``data.annotations``
    :param doc_name: document name used as the suffix of the entity id
    :param label: entity label, or None to add nothing; every ``"B-"``
        occurrence is removed from it before it is used as the entity type
    :param offset: indexable pair; ``offset[0]`` and ``offset[1]`` become the
        start and end of the entity's single span
    """
    if label is None:
        return
    # A second, discarded AnaforaEntity() used to be constructed here; it had
    # no effect on the result and has been dropped.
    entity = anafora.AnaforaEntity()
    # ids are numbered by how many <entity> elements the document already has
    num_entities = len(data.xml.findall("annotations/entity"))
    entity.id = "%s@%s" % (num_entities, doc_name)
    entity.spans = ((offset[0], offset[1]), )
    entity.type = label.replace("B-", "")
    data.annotations.append(entity)
def add_entity(data, doc_name, vote_one_result):
    """Append an entity described by a ``"start:end:type"`` string to ``data``.

    :param data: Anafora data object; its ``xml`` is queried for existing
        entities and the new entity is appended to ``data.annotations``
    :param doc_name: document name used as the suffix of the entity id
    :param vote_one_result: colon-separated string whose first two fields are
        parsed with ``int`` as the span start and end, and whose third field
        is used as the entity type
    :raises IndexError: if the string has fewer than three fields
    :raises ValueError: if the first two fields are not integers
    """
    # A second, discarded AnaforaEntity() used to be constructed here; it had
    # no effect on the result and has been dropped.
    entity = anafora.AnaforaEntity()
    # ids are numbered by how many <entity> elements the document already has
    num_entities = len(data.xml.findall("annotations/entity"))
    entity.id = "%s@%s" % (num_entities, doc_name)
    vote_one_result_items = vote_one_result.split(":")
    entity.spans = ((int(vote_one_result_items[0]),
                     int(vote_one_result_items[1])), )
    entity.type = vote_one_result_items[2]
    data.annotations.append(entity)
def add_annotations_from(elem, offset=0):
    """Walk ``elem`` depth-first, adding an annotation for each known tag.

    ``offset`` is the character position at which ``elem`` starts; the
    position just after ``elem`` and its tail is returned so the caller can
    continue with the next sibling. Entity annotations receive the span
    ``(start, end)`` covered by the element's text and children.

    :raises ValueError: if an entity element's own text differs from the
        slice of the enclosing ``text`` that its span covers
    """
    span_start = offset
    created = None
    is_entity = False

    if elem.tag in tag_id_attrs:
        id_attr = tag_id_attrs[elem.tag]
        if elem.tag in entity_tags:
            created = anafora.AnaforaEntity()
        else:
            created = anafora.AnaforaRelation()
        is_entity = isinstance(created, anafora.AnaforaEntity)
        created.id = timeml_id_to_anafora_id[elem.attrib[id_attr]]
        created.type = elem.tag
        if is_entity:
            # placeholder span; the real end is only known after the children
            created.spans = ((span_start, span_start), )
        for attr_name, attr_value in elem.attrib.items():
            if attr_name == id_attr:
                continue
            if attr_name in ref_id_attrs:
                # attributes that refer to other elements are translated to
                # the corresponding Anafora ids
                attr_value = timeml_id_to_anafora_id[attr_value]
            created.properties[attr_name] = attr_value
        data.annotations.append(created)

    # advance past this element's own text, then past every child in turn
    if elem.text is not None:
        offset += len(elem.text)
    for child in elem:
        offset = add_annotations_from(child, offset)

    if is_entity:
        created.spans = ((span_start, offset), )
        covered = text[span_start:offset]
        if elem.text != covered:
            raise ValueError('{0}: "{1}" != "{2}"'.format(
                timeml_path, elem.text, covered))

    # the tail belongs to the parent's text flow, after this element's span
    if elem.tail is not None:
        offset += len(elem.tail)
    return offset
def write(self, predictions):
    """Write predictions in anafora XML format.

    ``self.out_dir`` is wiped and recreated. For every directory yielded by
    ``anafora.walk(self.xml_dir, self.xml_regex)`` the first matching
    reference file is read, each of its ``EVENT`` annotations is copied
    (id, spans, type) and given a ``DocTimeRel`` property taken from
    ``predictions``, and the result is written under the same relative path
    in ``self.out_dir``.

    :param predictions: indexable sequence of label indices, consumed in the
        order in which events are encountered; each value is mapped through
        ``int2label``
    """
    if os.path.isdir(self.out_dir):
        shutil.rmtree(self.out_dir)
    os.makedirs(self.out_dir)

    # running position in `predictions`, shared across all documents
    index = 0
    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_dir, self.xml_regex):
        xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(xml_path)

        data = anafora.AnaforaData()
        for event in ref_data.annotations.select_type('EVENT'):
            entity = anafora.AnaforaEntity()
            entity.id = event.id
            entity.spans = event.spans
            entity.type = event.type
            entity.properties['DocTimeRel'] = int2label[predictions[index]]
            data.annotations.append(entity)
            index = index + 1

        # sub_dir may be nested or may already exist (e.g. when it is empty),
        # so create the whole chain and only when it is missing
        out_sub_dir = os.path.join(self.out_dir, sub_dir)
        if not os.path.isdir(out_sub_dir):
            os.makedirs(out_sub_dir)
        out_path = os.path.join(out_sub_dir, file_names[0])

        data.indent()
        data.to_file(out_path)
def test_add_entity():
    """Exercise appending an entity and editing its fields and properties."""

    def document(entity_body):
        # expected serialization of a document holding exactly one entity
        return ('<data><annotations><entity>' + entity_body +
                '</entity></annotations></data>')

    data = anafora.AnaforaData()
    assert str(data) == '<data />'

    # an entity without an id is rejected and leaves the document untouched
    entity = anafora.AnaforaEntity()
    with pytest.raises(ValueError) as exception_info:
        data.annotations.append(entity)
    assert "id" in str(exception_info.value)
    assert str(data) == '<data />'

    # once it has an id it can be added
    entity.id = "1"
    data.annotations.append(entity)
    assert str(data) == document('<id>1</id>')

    # later edits to the entity show up in the document
    entity.type = "X"
    entity.parents_type = "Y"
    entity.properties["name1"] = "value1"
    header = '<id>1</id>' + '<type>X</type>' + '<parentsType>Y</parentsType>'
    assert str(data) == document(
        header + '<properties><name1>value1</name1></properties>')

    # deleting the only property removes the <properties> element as well
    del entity.properties["name1"]
    assert str(data) == document(header)

    # deleting a property that was never set is an error
    with pytest.raises(ValueError):
        del entity.properties["name2"]
def build_an_entity(current_label: dict):
    """Build an ``AnaforaEntity`` from one label record.

    :param current_label: mapping with an ``"id"`` entry and a ``"value"``
        entry; the latter holds ``"start"``, ``"end"`` and a non-empty
        ``"labels"`` sequence whose first item becomes the entity type
    :return: the new entity (not yet attached to any data object)
    """
    value = current_label["value"]
    entity = anafora.AnaforaEntity()
    entity.id = current_label["id"]
    # spans is a tuple of (start, end) pairs; this entity has exactly one
    entity.spans = ((value["start"], value["end"]), )
    entity.type = value["labels"][0]
    return entity
def copy_annotations(from_data, to_data, annot_type):
    """Replicate the ``annot_type`` annotations of ``from_data`` in ``to_data``.

    Each copy is a fresh ``AnaforaEntity`` carrying only the id, spans and
    type of its source annotation.
    """
    for source in from_data.annotations.select_type(annot_type):
        clone = anafora.AnaforaEntity()
        clone.id, clone.spans, clone.type = (
            source.id, source.spans, source.type)
        to_data.annotations.append(clone)
def test_duplicate_id():
    """Duplicate ids must be rejected when parsing and when appending."""
    # two <entity> elements sharing id "1" in the parsed XML
    duplicated = anafora.ElementTree.fromstring(
        ''' <data> <annotations> <entity><id>1</id></entity> <entity><id>1</id></entity> </annotations> </data>''')
    with pytest.raises(ValueError):
        anafora.AnaforaData(duplicated)

    # the same clash, built through the API
    data = anafora.AnaforaData()
    first = anafora.AnaforaEntity()
    second = anafora.AnaforaEntity()
    first.id = second.id = "1"
    data.annotations.append(first)
    with pytest.raises(ValueError):
        data.annotations.append(second)
def test_add_reference():
    """Check that an entity-valued property needs its owner in the data first."""
    data = anafora.AnaforaData()
    entity1 = anafora.AnaforaEntity()
    entity1.id = "@1@"
    entity2 = anafora.AnaforaEntity()
    entity2.id = "@2@"

    # neither entity has been added to the data yet
    with pytest.raises(ValueError) as exception_info:
        entity2.properties["link"] = entity1
    assert "<annotations" in str(exception_info.value)

    # only the link target has been added; the property's owner has not
    data.annotations.append(entity1)
    # Capture this failure as well: previously the message check below
    # silently re-inspected the exception from the first attempt.
    with pytest.raises(ValueError) as exception_info:
        entity2.properties["link"] = entity1
    assert "<annotations" in str(exception_info.value)

    # with both entities in the data the reference is serialized by id
    data.annotations.append(entity2)
    entity2.properties["link"] = entity1
    assert str(data) == (
        '<data><annotations>' + '<entity><id>@1@</id></entity>' +
        '<entity><id>@2@</id><properties><link>@1@</link></properties></entity>'
        + '</annotations></data>')
def test_add_entity():
    """Build a one-entity document and write it, indented, to ``temp.xml``."""
    document = anafora.AnaforaData()
    event = anafora.AnaforaEntity()
    event.id = '1@e@ID025_path_074@gold'
    document.annotations.append(event)

    # the remaining fields are filled in after the entity has been attached
    for attribute, value in (('type', 'EVENT'),
                             ('parents_type', 'TemporalEntities')):
        setattr(event, attribute, value)
    event.properties['DocTimeRel'] = 'AFTER'

    document.indent()
    document.to_file('temp.xml')
def span2xmlfiles(data_spans, file_name_simple):
    """Turn ``(start, end, type, ...)`` records into an indented AnaforaData.

    Each record becomes one entity whose span is ``(int(start), int(end) + 1)``
    and whose id is ``"<position>@e@<file_name_simple>"``, with positions
    counted from 0 in input order.

    :return: the populated ``anafora.AnaforaData``
    """
    import anafora

    document = anafora.AnaforaData()
    for position, record in enumerate(data_spans):
        entity = anafora.AnaforaEntity()
        begin = int(record[0])
        # presumably converts an inclusive end offset to an exclusive one
        end = int(record[1]) + 1
        entity.spans = ((begin, end), )
        entity.type = record[2]
        entity.id = str(position) + "@e@" + file_name_simple
        document.annotations.append(entity)
    document.indent()
    return document
def test_preannotated():
    """Check that annotate() reuses, completes and extends existing entities."""
    regex_rules = {
        'aa+': ('A', {
            'X': '2'
        }),
        'a': ('A', {}),
        'bb': ('B', {
            'Y': '1'
        })
    }
    default_attributes = {'C': {'Z': '3'}}
    annotator = anafora.regex.RegexAnnotator(regex_rules, default_attributes)
    text = "bb aaa"

    # two entities exist before the annotator runs
    data = anafora.AnaforaData()
    preannotated = (
        ("1@preannotated", "B", ((0, 2), )),
        ("2@preannotated", "C", ((3, 6), )),
    )
    for entity_id, entity_type, entity_spans in preannotated:
        existing = anafora.AnaforaEntity()
        existing.id = entity_id
        existing.type = entity_type
        existing.spans = entity_spans
        data.annotations.append(existing)

    annotator.annotate(text, data)

    # B picks up the regex attributes, C picks up the defaults, and a new A
    # entity is created over the same span as C
    expected = (
        ("B", ((0, 2), ), {'Y': '1'}),
        ("C", ((3, 6), ), {'Z': '3'}),
        ("A", ((3, 6), ), {'X': '2'}),
    )
    annotations = list(data.annotations)
    assert len(annotations) == 3
    for annotation, (exp_type, exp_spans, exp_properties) in zip(
            annotations, expected):
        assert annotation.type == exp_type
        assert annotation.spans == exp_spans
        assert dict(annotation.properties.items()) == exp_properties
def test_remove():
    """Check that removing entities updates both the list and the XML."""
    first_xml = '<entity><id>@1@</id></entity>'
    second_xml = ('<entity><id>@2@</id>'
                  '<properties><name>value</name></properties></entity>')

    def document(*entity_xml):
        # expected serialization of a document holding the given entities
        return ('<data><annotations>' + ''.join(entity_xml) +
                '</annotations></data>')

    data = anafora.AnaforaData()
    assert str(data) == '<data />'

    first = anafora.AnaforaEntity()
    first.id = "@1@"
    data.annotations.append(first)
    second = anafora.AnaforaEntity()
    second.id = "@2@"
    second.properties["name"] = "value"
    data.annotations.append(second)
    assert list(data.annotations) == [first, second]
    assert str(data) == document(first_xml, second_xml)

    data.annotations.remove(first)
    assert list(data.annotations) == [second]
    assert str(data) == document(second_xml)

    # the now-empty <annotations> element stays in the document
    data.annotations.remove(second)
    assert list(data.annotations) == []
    assert str(data) == '<data><annotations /></data>'
def write_xml(self, prediction_lookup):
    """Write predictions in anafora XML format.

    ``self.xml_out_dir`` is wiped and recreated. For every directory yielded
    by ``anafora.walk(self.xml_ref_dir, xml_regex)`` the first matching
    reference file is read, each ``EVENT`` is copied (id, spans, type) and
    given a ``DocTimeRel`` property, and the result is written under the
    same relative path in ``self.xml_out_dir``.

    :param prediction_lookup: mapping from ``"<sub_dir>|<start>|<end>"``
        (using the event's first span) to the predicted DocTimeRel label;
        events without an entry get ``'OVERLAP'`` and are counted as missing
    """
    # make a directory to write anafora xml
    if os.path.isdir(self.xml_out_dir):
        shutil.rmtree(self.xml_out_dir)
    os.makedirs(self.xml_out_dir)

    # t5 occasionally fails to predict
    missing_predictions = []

    # iterate over reference xml files
    # look up the DTR prediction for each event
    # and write it in anafora format to the specified dir
    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_ref_dir, xml_regex):
        path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(path)

        data = anafora.AnaforaData()
        for event in ref_data.annotations.select_type('EVENT'):
            # make a new entity and copy some ref info
            entity = anafora.AnaforaEntity()
            entity.id = event.id
            start, end = event.spans[0]
            entity.spans = event.spans
            entity.type = event.type

            # lookup the prediction
            key = '|'.join((sub_dir, str(start), str(end)))
            if key in prediction_lookup:
                label = prediction_lookup[key]
            else:
                # use majority class for now
                label = 'OVERLAP'
                missing_predictions.append(key)
            entity.properties['DocTimeRel'] = label

            data.annotations.append(entity)

        data.indent()
        # sub_dir may be nested or may already exist (e.g. when it is empty),
        # so create the whole chain and only when it is missing
        out_sub_dir = os.path.join(self.xml_out_dir, sub_dir)
        if not os.path.isdir(out_sub_dir):
            os.makedirs(out_sub_dir)
        out_path = os.path.join(out_sub_dir, file_names[0])
        data.to_file(out_path)

    print('number of missing predictions:', len(missing_predictions))
def annotate(self, text, data):
    """
    Adds annotations by matching the model's regular expressions against the text.

    Existing annotations first receive any default attributes registered for
    their type that they do not already have. Then every regex match either
    reuses the existing annotation with the same spans and type or creates a
    new ``AnaforaEntity`` with id ``"<match index>@regex"``; in both cases the
    attributes associated with the matching expression are set on it.

    If the model has no regular expressions, only the default attributes are
    applied and no matching is attempted.

    :param str text: the text to be annotated
    :param anafora.AnaforaData data: the data to which the annotations should be added
    """
    # index any existing annotations so we can add to them if necessary
    span_type_annotation_map = {}
    default_attributes = self.default_type_attributes_map
    for annotation in data.annotations:
        span_type_annotation_map[annotation.spans, annotation.type] = annotation
        if default_attributes is not None and annotation.type in default_attributes:
            for key, value in default_attributes[annotation.type].items():
                if key not in annotation.properties:
                    annotation.properties[key] = value

    # create an overall regular expression where longest expressions are matched first
    patterns = sorted(self.regex_type_attributes_map, key=len, reverse=True)
    if not patterns:
        # Joining zero alternatives yields '', which matches the empty string
        # at every position with match.lastindex == None; there is nothing to
        # annotate in that case, so stop before the lookup below would fail.
        return

    # NOTE: we have to use the regex library, not the re library, because we need more than 100 groups
    # NOTE(review): patterns[match.lastindex - 1] relies on each expression
    # contributing exactly one group; an expression containing its own
    # capturing groups would shift the numbering -- confirm none do.
    combined = regex.compile('|'.join('({0})'.format(expression)
                                      for expression in patterns))

    # for each match, create an annotation with the appropriate type and attributes
    for i, match in enumerate(combined.finditer(text)):
        matched_pattern = patterns[match.lastindex - 1]
        entity_type, attributes = self.regex_type_attributes_map[matched_pattern]
        spans = ((match.start(), match.end()), )
        lookup_key = (spans, entity_type)
        if lookup_key in span_type_annotation_map:
            entity = span_type_annotation_map[lookup_key]
        else:
            entity = anafora.AnaforaEntity()
            entity.id = "{0}@regex".format(i)
            entity.type = entity_type
            entity.spans = spans
            data.annotations.append(entity)
        # regex attributes are applied to reused and new entities alike
        for name, value in attributes.items():
            entity.properties[name] = value
def write(self, predictions):
    """Write predictions in anafora XML format.

    ``self.xml_out_dir`` is wiped and recreated. For every directory yielded
    by ``anafora.walk(self.xml_ref_dir, xml_regex)`` the first matching
    reference file is read, each ``EVENT`` that has a prediction is copied
    (id, spans, type) with a ``DocTimeRel`` property, and the result is
    written under the same relative path in ``self.xml_out_dir``. Events
    without a prediction are reported on stdout and left out.

    :param predictions: label indices aligned one-to-one with
        ``self.offsets``, whose items are looked up here as
        ``(sub_dir, start, end)`` keys; each index is mapped through
        ``int2label``
    """
    # predictions are in the same order in which they were read
    prediction_lookup = dict(zip(self.offsets, predictions))

    # make a directory to write anafora xml
    if os.path.isdir(self.xml_out_dir):
        shutil.rmtree(self.xml_out_dir)
    os.makedirs(self.xml_out_dir)

    # iterate over reference xml files
    # look up the DTR prediction for each event
    # and write it in anafora format to the specified dir
    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_ref_dir, xml_regex):
        path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(path)

        data = anafora.AnaforaData()
        for event in ref_data.annotations.select_type('EVENT'):
            # lookup the prediction first so skipped events cost nothing
            start, end = event.spans[0]
            key = (sub_dir, start, end)
            if key not in prediction_lookup:
                print('missing key:', key)
                continue

            # make a new entity and copy some ref info
            entity = anafora.AnaforaEntity()
            entity.id = event.id
            entity.spans = event.spans
            entity.type = event.type
            entity.properties['DocTimeRel'] = int2label[prediction_lookup[key]]

            data.annotations.append(entity)

        data.indent()
        # sub_dir may be nested or may already exist (e.g. when it is empty),
        # so create the whole chain and only when it is missing
        out_sub_dir = os.path.join(self.xml_out_dir, sub_dir)
        if not os.path.isdir(out_sub_dir):
            os.makedirs(out_sub_dir)
        out_path = os.path.join(out_sub_dir, file_names[0])
        data.to_file(out_path)
def span2xmlfiles(exp, target):
    """Write the spans of the first ten documents as Anafora XML files.

    Span records are read from ``<exp>/span_label_all<target>`` and document
    names from the ``'raw_dir_simple'`` JSON resource. For each of the first
    ten documents an ``AnaforaData`` is built with one entity per record
    (span ``(int(start), int(end) + 1)``, type from the third field, id
    ``"<position>@e@<document name>"``), printed, indented and written to
    ``<exp>/<name>/<name>.TimeNorm.gold.completed.xml``.

    :param exp: experiment directory holding the span file and the outputs
    :param target: suffix appended to ``"span_label_all"`` to form the name
        of the span file
    """
    import anafora

    raw_dir_simple = read1.read_from_json('raw_dir_simple')
    # the span file does not depend on the document, so read it only once
    all_spans = read1.read_json(os.path.join(exp, "span_label_all" + target))

    for data_id in range(0, 10):
        doc_name = raw_dir_simple[data_id]
        data = anafora.AnaforaData()
        for position, data_span in enumerate(all_spans[data_id]):
            e = anafora.AnaforaEntity()
            e.spans = ((int(data_span[0]), int(data_span[1]) + 1), )
            e.type = data_span[2]
            e.id = str(position) + "@e@" + doc_name
            data.annotations.append(e)
        # single-argument call form prints identically on Python 2 and 3
        print(data)
        data.indent()

        # os.path.join instead of hard-coded backslashes keeps this portable
        output_dir = os.path.join(exp, doc_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        data.to_file(
            os.path.join(output_dir,
                         doc_name + ".TimeNorm.gold.completed.xml"))
def convert_thyme_qa_to_anafora_xml(input_dir, output_dir):
    """Convert THYME question-answer text files into Anafora XML files.

    Every file under ``input_dir`` whose name ends in ``"_qa.txt"`` is parsed
    and written to
    ``<output_dir>/<base>/<base>.THYME_QA.preannotation.completed.xml``,
    where ``<base>`` is the file name without the ``"_qa.txt"`` suffix.

    Each question block yields one ``EVENT`` entity per answer text clip and
    one ``Question`` relation that carries the question, its confidence and
    difficulty, and one ``Answer`` property per entity.

    :param input_dir: directory that is walked recursively for input files
    :param output_dir: directory under which the per-file output directories
        are created
    :raises ValueError: if an annotation block or its text clips do not match
        the expected layout, or if the header separator does not split the
        file into exactly two parts (the tuple unpacking fails)
    """
    # header and body are separated by a run of '=', annotations by a run of '-'
    _header_sep_pattern = regex.compile(r'\s*=====+\s*')
    _annotation_sep_pattern = regex.compile(r'\s*-----+\s*')
    # one annotation block: five labelled fields followed by the text clips
    _annotation_pattern = regex.compile(
        r'^Question:(.*?)\nAnswer:(.*?)\nConfidence:(.*?)\n' +
        r'Difficulty:(.*?)\nDocTimeRel:(.*?)\n(Text Clip:.*)$', regex.DOTALL)
    # one text clip: a header line with offsets and flags, then the clip text
    _text_clip_pattern = regex.compile(
        r'Text Clip:\s+\d[\w.]*\s+(\d+),(\d+) (Exact|Support)_Answer ' +
        r'Use_(Time_Span|DocTimeRel) ?(.*)\n(.*)(?:\n|$)')

    # iterate through all _qa.txt files in the input directory
    for input_root, dir_names, input_file_names in os.walk(input_dir):
        for input_file_name in input_file_names:
            if input_file_name.endswith("_qa.txt"):
                # strip the 7-character "_qa.txt" suffix
                file_base = input_file_name[:-7]

                # create one Anafora XML for each file
                data = anafora.AnaforaData()
                # id counters, unique within this file
                relation_count = 1
                entity_count = 1
                with open(os.path.join(input_root,
                                       input_file_name)) as input_file:
                    # NOTE(review): .decode() on the result of read() only
                    # exists for Python 2 byte strings; under Python 3 this
                    # would presumably raise AttributeError -- confirm the
                    # target interpreter.
                    text = input_file.read().decode('ascii')

                    # parse the annotations from the THYME question-answer format
                    _, body_text = _header_sep_pattern.split(text)
                    for annotation_text in _annotation_sep_pattern.split(
                            body_text.rstrip(" \n\r-")):
                        match = _annotation_pattern.match(annotation_text)
                        if match is None:
                            raise ValueError("Invalid annotation text:\n" +
                                             annotation_text)
                        groups = [s.strip() for s in match.groups()]
                        question, answer, confidence, difficulty, doc_time_rel, text_clip_text = groups
                        text_clip_matches = _text_clip_pattern.findall(
                            text_clip_text)
                        # every clip occupies two lines, so any other line
                        # count means some clip failed to parse
                        if len(text_clip_text.splitlines()
                               ) != 2 * len(text_clip_matches):
                            raise ValueError(
                                "Invalid Text Clips in annotation text:\n" +
                                annotation_text)

                        # create Anafora XML annotations for the answers
                        entities = []
                        for begin_text, end_text, _, time_or_doc_time_rel, type_text, clip_text in text_clip_matches:
                            begin = int(begin_text)
                            end = int(end_text)
                            entity_annotation = anafora.AnaforaEntity()
                            entity_annotation.id = '{0:d}@{1}@{2}@gold'.format(
                                entity_count, 'e', file_base)
                            entity_annotation.spans = ((begin, end), )
                            entity_annotation.type = 'EVENT'
                            entity_annotation.parents_type = 'TemporalEntities'
                            # only clips flagged Use_DocTimeRel carry the
                            # question's DocTimeRel value
                            if time_or_doc_time_rel == 'DocTimeRel':
                                entity_annotation.properties[
                                    'DocTimeRel'] = doc_time_rel.upper()
                            entity_count += 1
                            data.annotations.append(entity_annotation)
                            entities.append(entity_annotation)

                        # create an Anafora XML annotation for the question
                        question_annotation = anafora.AnaforaRelation()
                        question_annotation.id = '{0:d}@{1}@{2}@gold'.format(
                            relation_count, 'r', file_base)
                        question_annotation.type = 'Question'
                        question_annotation.parents_type = 'TemporalQuestions'
                        question_annotation.properties['Question'] = question
                        question_annotation.properties[
                            'Confidence'] = confidence
                        question_annotation.properties[
                            'Difficulty'] = difficulty
                        # FIXME: hacking XML here because current API doesn't allow properties with multiple values
                        for entity in entities:
                            property_elem = anafora.ElementTree.SubElement(
                                question_annotation.properties.xml, 'Answer')
                            property_elem.text = entity.id
                        data.annotations.append(question_annotation)
                        relation_count += 1

                    # write the Anafora data out as XML
                    output_file_dir = os.path.join(output_dir, file_base)
                    output_file_path = os.path.join(
                        output_file_dir,
                        file_base + ".THYME_QA.preannotation.completed.xml")
                    if not os.path.exists(output_file_dir):
                        os.makedirs(output_file_dir)
                    data.indent()
                    data.to_file(output_file_path)
def test_schema_validate():
    """Step a document through valid and invalid states against a schema."""
    schema_xml = (
        ' <schema>'
        ' <defaultattribute> <required>True</required> </defaultattribute>'
        ' <definition>'
        ' <entities>'
        ' <entity type="X">'
        ' <properties>'
        ' <property type="A" input="choice">x,y</property>'
        ' <property type="B" />'
        ' <property type="C" instanceOf="Y,Z" />'
        ' </properties>'
        ' </entity>'
        ' <entity type="Y" />'
        ' <entity type="Z" />'
        ' </entities>'
        ' <relations>'
        ' <relation type="R">'
        ' <properties>'
        ' <property type="D" instanceOf="X" required="False" />'
        ' <property type="E" instanceOf="Y,Z" required="False" />'
        ' </properties>'
        ' </relation>'
        ' </relations>'
        ' </definition>'
        ' </schema>'
        ' ')
    schema = anafora.validate.Schema(
        anafora.ElementTree.fromstring(schema_xml))
    data = anafora.AnaforaData()

    def errors():
        # re-validate the document in its current state
        return schema.errors(data)

    # entity of type X, filled in one property at a time
    first = anafora.AnaforaEntity()
    first.id = "@1@"
    first.type = "X"
    first.properties["A"] = "x"
    data.annotations.append(first)
    assert errors()
    first.properties["B"] = "y"
    assert errors()
    # C is declared instanceOf="Y,Z"; a plain string is expected to fail
    first.properties["C"] = "z"
    assert errors()

    # pointing C at another X entity is still expected to fail ...
    second = anafora.AnaforaEntity()
    second.id = "@2@"
    second.type = "X"
    data.annotations.append(second)
    first.properties["C"] = second
    assert errors()
    # ... until that entity becomes a Y
    second.type = "Y"
    assert not errors()

    # A is a choice between "x" and "y"
    first.properties["A"] = "y"
    assert not errors()
    first.properties["A"] = "z"
    assert errors()
    first.properties["A"] = "x"
    assert not errors()

    # a relation needs a type the schema knows about
    relation = anafora.AnaforaRelation()
    relation.id = "@3@"
    relation.type = ""
    data.annotations.append(relation)
    assert errors()
    relation.type = "R"
    assert not errors()

    # D accepts an X entity; E accepts only Y or Z
    relation.properties["D"] = first
    assert not errors()
    relation.properties["E"] = first
    assert errors()
    relation.properties["E"] = second
    assert not errors()

    # a property the schema does not declare for R
    relation.properties["X"] = "Y"
    assert errors()