def diff_files(first_name, second_name, result_name):
    """Diff two annotation documents and record the outcome in a third.

    Every argument is first reduced with name_without_extension(). The
    result document is created by copy_annotations() from the second
    document's files, and AnnotationDiff(...).diff() is run while the
    result object is active as a context manager.
    """
    bare_first, bare_second, bare_result = [
        name_without_extension(path)
        for path in (first_name, second_name, result_name)
    ]

    lhs = annotation.TextAnnotations(bare_first)
    rhs = annotation.TextAnnotations(bare_second)
    # The output document starts out as a copy of the second input.
    output = copy_annotations(bare_second, bare_result)

    with output:
        differ = AnnotationDiff(lhs, rhs, output)
        differ.diff()
def merge_annotations_only_linking(identifier, result_dir, base_dir,
                                   linking_dir):
    """Write the base annotations for 'identifier' to the correction file.

    Before writing, transfer_comments() is called once per entity of the
    base document, with the linking document as the only source.
    The output is written as UTF-8 to the path returned by
    create_correction_file().
    """
    output_path = create_correction_file(identifier, result_dir, base_dir)

    base_doc = annotation.TextAnnotations(os.path.join(base_dir, identifier))
    linking_doc = annotation.TextAnnotations(
        os.path.join(linking_dir, identifier))

    for ent in base_doc.get_entities():
        transfer_comments(base_doc, ent, [linking_doc])

    with codecs.open(output_path, mode="w", encoding="utf-8") as sink:
        sink.write(str(base_doc))
def get_annotator_brats(annotator_dirs, identifier):
    """Load the annotations for 'identifier' from every annotator directory.

    Returns a dict keyed by the base name of each directory in
    'annotator_dirs'; each value is the TextAnnotations object opened at
    '<directory>/<identifier>'.
    """
    return {
        os.path.basename(annotator_dir): annotation.TextAnnotations(
            os.path.join(annotator_dir, identifier))
        for annotator_dir in annotator_dirs
    }
def main(argv=None):
    """Verify each annotation file given on the command line.

    Args:
        argv: full argument vector including the program name; defaults
            to sys.argv when None.

    Every issue found is printed to stdout as "<file>:\\t<issue>".
    Files that cannot be found or parsed are reported on stderr and do
    not stop processing of the remaining files.
    """
    import sys
    import os

    if argv is None:
        argv = sys.argv
    arg = argparser().parse_args(argv[1:])

    for fn in arg.files:
        try:
            projectconf = ProjectConfiguration(os.path.dirname(fn))
            # remove ".a2" or ".rel" suffixes for Annotations to prompt
            # parsing of .a1 also.
            # (TODO: temporarily removing .ann also to work around a
            # bug in TextAnnotations, but this should not be necessary.)
            # Only a *trailing* suffix is stripped: str.replace() would
            # also mangle paths that merely contain the text somewhere
            # else, e.g. a directory called "corpus.annotated".
            nosuff_fn = fn
            for suffix in (".a2", ".rel", ".ann"):
                if nosuff_fn.endswith(suffix):
                    nosuff_fn = nosuff_fn[:-len(suffix)]

            with annotation.TextAnnotations(nosuff_fn) as ann_obj:
                for issue in verify_annotation(ann_obj, projectconf):
                    print("%s:\t%s" % (fn, issue.human_readable_str()))
        except annotation.AnnotationFileNotFoundError:
            print("%s:\tFailed check: file not found" % fn, file=sys.stderr)
        except annotation.AnnotationNotFoundError as e:
            print("%s:\tFailed check: %s" % (fn, e), file=sys.stderr)

    if arg.verbose:
        print("Check complete.", file=sys.stderr)
def correct_annotations(orig_fn, ann_fn, change_fn):
    """Re-align text-bound annotation offsets after the document text changed.

    The document text belonging to 'ann_fn' is diffed against the text
    read from 'change_fn'; every span boundary of every text-bound
    annotation is shifted by the length changes located at or before it,
    and the cached annotation text is re-extracted from the changed text.
    Finally 'change_fn' is copied over 'orig_fn'.

    NOTE(review): 'copy' is not defined in this block -- presumably
    shutil.copy; confirm against the file's imports.
    """
    with annotation.TextAnnotations(ann_fn) as anns:
        orig_text = anns.get_document_text()
        with annotation.open_textfile(change_fn, 'r') as f:
            changed_text = f.read()
        diffs = diff_match_patch().diff_main(orig_text, changed_text)
        orig_offset = 0
        # NOTE(review): change_offset is assigned but never read below.
        change_offset = 0
        # (position in the original text, signed length change) per diff.
        offsets = []
        for diff in diffs:
            kind = diff[0]
            text = diff[1]
            size = len(text)
            # Presumably kind is diff_match_patch's op code (-1 delete,
            # 0 equal, 1 insert), which makes delta the signed change in
            # length -- confirm against the library in use.
            delta = size * kind
            offsets.append((orig_offset, delta))
            # Every kind except 1 advances the position in the original.
            if kind != 1:
                orig_offset += size
        # Apply the changes from the end of the document backwards.
        offsets = offsets[::-1]
        tbs = list(anns.get_textbounds())
        # One entry per span boundary:
        # (position, textbound index, span index, 0 for start / 1 for end).
        indices = []
        for tbi, tb in enumerate(tbs):
            for spani, span in enumerate(tb.spans):
                indices.append((span[0], tbi, spani, 0))
                indices.append((span[1], tbi, spani, 1))
        indices.sort(reverse=True)
        for orig_offset, delta in offsets:
            for index in indices:
                # Boundaries are sorted in descending order, so the first
                # one lying before this change ends the scan. index[0]
                # always holds the boundary's original position.
                if index[0] < orig_offset:
                    break
                frag = list(tbs[index[1]].spans[index[2]])
                frag[index[3]] += delta
                tbs[index[1]].spans[index[2]] = tuple(frag)
        for tb in tbs:
            if isinstance(tb, annotation.TextBoundAnnotationWithText):
                # Rebuild the stored text from the shifted spans.
                tb.text = annotation.DISCONT_SEP.join((changed_text[start:end] for start, end in tb.spans))
    copy(change_fn, orig_fn)
def copy_annotations(original_name, new_name):
    """Copy a document's files to a new base name and open the copy.

    For every suffix in KNOWN_FILE_SUFF, '<original_name>.<suffix>' is
    copied to '<new_name>.<suffix>'. A copy that fails with IOError is
    skipped. Returns annotation.TextAnnotations(new_name).
    """
    import shutil

    for suffix in KNOWN_FILE_SUFF:
        source_path = '%s.%s' % (original_name, suffix)
        target_path = '%s.%s' % (new_name, suffix)
        try:
            shutil.copyfile(source_path, target_path)
        except IOError:
            # that extension file does not exist
            continue

    return annotation.TextAnnotations(new_name)
def load_doc(identifier):
    """Load the document named by 'identifier'.

    An identifier is the path to a pair of a .ann file and a .txt file,
    minus the extension for either.

    Returns an EnhancedAnnotatedDoc, or None when loading trips an
    AssertionError (the failure is reported on stderr).
    """
    try:
        brat_doc = annotation.TextAnnotations(identifier)
        return EnhancedAnnotatedDoc(brat_doc)
    except AssertionError as err:
        message = "Failed to load doc {} with error {}\n".format(
            identifier, err)
        sys.stderr.write(message)
    return None
def convert(doc_bare, result, ssplitter):
    """Convert the annotations at 'doc_bare' to an XML tree in 'result'.

    Builds a <document> element with <sentences> and <annotations>
    children, fills them through collect_sentences_and_get_words() and
    collect_annotations(), and writes the tree to 'result'.
    """
    brat_doc = annotation.TextAnnotations(doc_bare)

    root = ET.Element("document")
    sentences_node = ET.SubElement(root, "sentences")
    annotations_node = ET.SubElement(root, "annotations")

    tokens = collect_sentences_and_get_words(
        sentences_node, brat_doc, ssplitter)
    collect_annotations(annotations_node, brat_doc, tokens)

    ET.ElementTree(root).write(result)
def ent2event(anntype, fn):
    """Map every entity of type 'anntype' in file 'fn' into an event.

    For each targeted entity a new event annotation of the same type is
    created with the entity as its trigger. Event arguments and one-line
    comments that referred to the entity are re-pointed at the new event.

    Args:
        anntype: entity type to convert.
        fn: annotation file name, with or without a trailing ".ann".

    Missing or unparsable files are reported on stderr, not raised.
    """
    global options

    mapped = 0

    try:
        # remove possible .ann suffix to make TextAnnotations happy.
        # Only a trailing ".ann" is removed; str.replace() would also
        # strip the text from the middle of the path.
        nosuff_fn = fn[:-len(".ann")] if fn.endswith(".ann") else fn

        with annotation.TextAnnotations(nosuff_fn) as ann_obj:
            # Snapshot the entities first: annotations are added to
            # ann_obj inside the loop.
            for ann in list(ann_obj.get_entities()):
                if ann.type != anntype:
                    # not targeted
                    continue

                # map the entity annotation ann into an event.

                # first, create a new event annotation of the
                # same type for which ann is the trigger
                new_id = ann_obj.get_new_id('E')
                eann = annotation.EventAnnotation(
                    ann.id, [], new_id, ann.type, '')

                # next, process existing event annotations, remapping ID
                # references to the source annotation into references to
                # the new annotation
                for e in ann_obj.get_events():
                    for i, (role, argid) in enumerate(e.args):
                        if argid == ann.id:
                            # need to remap
                            e.args[i] = role, new_id
                for c in ann_obj.get_oneline_comments():
                    if c.target == ann.id:
                        # need to remap
                        c.target = new_id

                # finally, add in the new event annotation
                ann_obj.add_annotation(eann)

                mapped += 1

            if options.verbose:
                print(mapped, 'mapped in', fn, file=sys.stderr)

    except annotation.AnnotationFileNotFoundError:
        print("%s:\tFailed: file not found" % fn, file=sys.stderr)
    except annotation.AnnotationNotFoundError as e:
        print("%s:\tFailed: %s" % (fn, e), file=sys.stderr)
def ent2event(anntype, fn):
    """Map every entity of type 'anntype' in file 'fn' into an event.

    For each targeted entity a new event annotation of the same type is
    created with the entity as its trigger. Event arguments and one-line
    comments that referred to the entity are re-pointed at the new event.

    Args:
        anntype: entity type to convert.
        fn: annotation file name, with or without a trailing ".ann".

    Missing or unparsable files are reported on stderr, not raised.
    Written with syntax that is valid on both Python 2.6+ and Python 3.
    """
    global options

    mapped = 0

    try:
        # Remove a trailing ".ann" only; str.replace() would also strip
        # the text from the middle of the path.
        nosuff_fn = fn[:-len(".ann")] if fn.endswith(".ann") else fn

        with annotation.TextAnnotations(nosuff_fn) as ann_obj:
            # Snapshot the entities first: annotations are added to
            # ann_obj inside the loop.
            for ann in list(ann_obj.get_entities()):
                if ann.type != anntype:
                    continue

                # The new event uses the entity itself as its trigger.
                new_id = ann_obj.get_new_id('E')
                eann = annotation.EventAnnotation(
                    ann.id, [], new_id, ann.type, '')

                # Re-point existing references from the entity to the
                # new event.
                for e in ann_obj.get_events():
                    for i, (role, argid) in enumerate(e.args):
                        if argid == ann.id:
                            e.args[i] = role, new_id
                for c in ann_obj.get_oneline_comments():
                    if c.target == ann.id:
                        c.target = new_id

                ann_obj.add_annotation(eann)

                mapped += 1

            if options.verbose:
                sys.stderr.write("%s mapped in %s\n" % (mapped, fn))

    except annotation.AnnotationFileNotFoundError:
        sys.stderr.write("%s:\tFailed: file not found\n" % fn)
    except annotation.AnnotationNotFoundError as e:
        sys.stderr.write("%s:\tFailed: %s\n" % (fn, e))
def verify(args):
    """Verify that there are no contested annotations remaining in
    'args.correction_dir'.

    For every identifier found there, the typed annotations for which
    is_annotation_contested() holds are counted and logged at debug
    level. If any file still has contested annotations, a warning with
    the number of such files is logged and the process exits with
    status 1.
    """
    identifiers = ai2_common.get_identifiers(args.correction_dir)

    not_finished = []
    for identifier in identifiers:
        brat = annotation.TextAnnotations(identifier)
        unresolved = [
            a for a in brat
            if isinstance(a, annotation.TypedAnnotation)
            and is_annotation_contested(a)
        ]
        logging.debug("{} has {} unresolved conflicts".format(
            identifier, len(unresolved)))
        if unresolved:
            not_finished.append((identifier, unresolved))

    if not_finished:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling.
        logging.warning("{} files with unresolved annotations".format(
            len(not_finished)))
        sys.exit(1)
def display_discontinuous(files, verbose):
    """Print how many entities in 'files' are discontinuous.

    Each file is opened as TextAnnotations and passed to
    find_discontinuous(). With 'verbose' set, files that have at least
    one hit are listed with their mentions. Two summary lines with the
    overall totals are always printed at the end.
    """
    entity_total = 0
    discontinuous_total = 0

    for path in files:
        doc = annotation.TextAnnotations(path)
        n_entities = len(list(doc.get_entities()))
        found = find_discontinuous(doc)
        n_found = len(found)

        entity_total += n_entities
        discontinuous_total += n_found

        # Per-file detail is shown only in verbose mode, and only once
        # some entity has been seen and this file has a hit.
        if verbose and entity_total != 0 and n_found != 0:
            print("\n" + "-" * 60)
            print("{} discontinous entities out of {} in {}".format(
                n_found, n_entities, path))
            for mention in found:
                print_entity_mention(mention, doc.get_document_text())

    print("Total number of entities: {}".format(entity_total))
    print("Total number of discontinous entites: {}".format(
        discontinuous_total))
def __filenames_to_annotations(filenames):
    """
    Given file names, returns corresponding Annotations objects.

    Files that cannot be found or parsed are reported on stderr and
    left out of the result, so the returned list may be shorter than
    'filenames'. Each file is opened with read_only=True.
    """

    # TODO: error output should be done via messager to allow
    # both command-line and GUI invocations

    global REPORT_SEARCH_TIMINGS
    if REPORT_SEARCH_TIMINGS:
        # NOTE(review): process_start is not read anywhere in the
        # visible part of this function.
        process_start = datetime.now()

    anns = []
    for fn in filenames:
        try:
            # remove suffixes for Annotations to prompt parsing of .a1
            # also. Only trailing suffixes are removed; str.replace()
            # would also strip the text from the middle of the path.
            nosuff_fn = fn
            for suffix in (".ann", ".a2", ".rel"):
                if nosuff_fn.endswith(suffix):
                    nosuff_fn = nosuff_fn[:-len(suffix)]
            ann_obj = annotation.TextAnnotations(nosuff_fn, read_only=True)
            anns.append(ann_obj)
        except annotation.AnnotationFileNotFoundError:
            sys.stderr.write("%s:\tFailed: file not found\n" % fn)
        except annotation.AnnotationNotFoundError as e:
            sys.stderr.write("%s:\tFailed: %s\n" % (fn, e))

    # The docstring promises the collected objects; hand them back.
    return anns
def display_overlapping(files, verbose):
    """Print how many overlapping entity pairs occur in 'files'.

    Each file is opened as TextAnnotations and passed to
    find_overlapping(). With 'verbose' set, files that have at least
    one overlapping pair are listed with both mentions of each pair.
    Two summary lines with the overall totals are always printed at the
    end.
    """
    entity_total = 0
    overlapping_total = 0

    for path in files:
        doc = annotation.TextAnnotations(path)
        n_entities = len(list(doc.get_entities()))
        pairs = find_overlapping(doc)
        n_pairs = len(pairs)

        entity_total += n_entities
        overlapping_total += n_pairs

        # Per-file detail is shown only in verbose mode, and only once
        # some entity has been seen and this file has a hit.
        if verbose and entity_total != 0 and n_pairs != 0:
            print("\n" + "-" * 60)
            print("{} overlapping pairs out of {} entities in {}".format(
                n_pairs, n_entities, path))
            for pair in pairs:
                print_overlapping_entity_mentions(
                    pair[0], pair[1], doc.get_document_text())

    print("Total number of entities: {}".format(entity_total))
    print("Total number of overlapping entites: {}".format(
        overlapping_total))
def name_without_extension(file_name):
    """Return 'file_name' with every match of EXTENSIONS_RE removed."""
    import re
    return re.sub(EXTENSIONS_RE, '', file_name)


def copy_annotations(original_name, new_name):
    """Copy a document's files to a new base name and open the copy.

    For every suffix in KNOWN_FILE_SUFF, '<original_name>.<suffix>' is
    copied to '<new_name>.<suffix>'; suffixes whose copy fails with
    IOError are skipped. Returns annotation.TextAnnotations(new_name).
    """
    import shutil
    for extension in KNOWN_FILE_SUFF:
        try:
            shutil.copyfile('%s.%s' % (original_name, extension),
                            '%s.%s' % (new_name, extension))
        except IOError:
            pass  # that extension file does not exist
    return annotation.TextAnnotations(new_name)


def delete_annotations(name):
    """Delete every known file belonging to the document 'name'.

    'name' may be given with or without one of the known extensions.
    Files that do not exist are silently skipped.
    """
    # Build the paths from the bare name: with the raw 'name', an
    # argument such as "doc.ann" would look for "doc.ann.ann" and never
    # delete anything.
    bare_name = name_without_extension(name)
    for extension in KNOWN_FILE_SUFF:
        try:
            os.remove('%s.%s' % (bare_name, extension))
        except OSError:
            pass  # that extension file does not exist


def diff_files(first_name, second_name, result_name):
    # NOTE(review): only these three statements of the function are
    # visible at this point of the file; kept unchanged.
    first_bare = name_without_extension(first_name)
    second_bare = name_without_extension(second_name)
    result_bare = name_without_extension(result_name)
def merge_annotations(identifier, correction_dir, annotator_dirs):
    """Combines the brat annotations for 'identifier' from each dir in
    'annotator_dirs'.

    Overwrites any existing file in 'correction_dir'.

    Works according to the scheme laid out in:
    docs.google.com/document/d/1zj5WAAykZfrPJwaKtv-AUD0m9BrVH6ybcl17PunnIgc

    It is a significant invariant that there will only be one entity with
    any given set of spans.

    Outline of what the code below does:
      * An entity matched in every annotator's document is added once;
        if the matches disagree on type it is typed "FIX_TYPE".
      * Any other entity is added with its type prefixed "FIX_SPAN_"
        (more than one overlap reported) or "VERIFY_" (otherwise).
      * Relations are handled the same way, using "FIX_RELATION_TYPE"
        and "VERIFY_", after being translated onto the merged entities.
      * The merged document is written as UTF-8 to the correction file.
    """
    annotator_brats = get_annotator_brats(annotator_dirs, identifier)
    annotators = annotator_brats.keys()
    brats = annotator_brats.values()
    # The first annotator directory is handed to create_correction_file.
    correction_file = create_correction_file(identifier, correction_dir,
                                             annotator_dirs[0])
    corrected = annotation.TextAnnotations(
        os.path.join(correction_dir, identifier))

    # Lazy stream of (entity, owning document) pairs over all annotators.
    all_entities = itertools.chain.from_iterable(
        (((e, b) for e in b.get_entities()) for b in brats))

    accounted_for = set()
    no_perfect_match = []
    # Entities with perfect span matches
    for (entity, from_brat) in all_entities:
        # Already covered as a match of an earlier entity.
        if entity in accounted_for:
            continue

        matches = get_entity_matches(entity, brats)
        accounted_for.update(set(matches))

        # Fewer matches than annotators: defer to the second pass.
        if len(matches) < len(annotators):
            no_perfect_match.append((entity, from_brat))
        else:
            ann = suffix_annotation_id(get_annotator(from_brat), entity)
            types = set((e.type for e in matches))
            if len(types) > 1:
                # Type of the entity is contested
                ann = set_annotation_type(ann, "FIX_TYPE")
            corrected.add_annotation(ann)

    for (entity, from_brat) in no_perfect_match:
        id_prefixed = suffix_annotation_id(get_annotator(from_brat), entity)
        if len(get_entity_overlaps(entity, brats)) > 1:
            # With some overlap (other than itself)
            ann = prefix_annotation_type(id_prefixed, "FIX_SPAN_")
        else:
            # With no overlap
            ann = prefix_annotation_type(id_prefixed, "VERIFY_")
        corrected.add_annotation(ann)

    # Transfer comments on entities
    for entity in corrected.get_entities():
        transfer_comments(corrected, entity, brats)

    # Lazy stream of (relation, owning document) pairs over all annotators.
    all_relations = itertools.chain.from_iterable(
        (((r, b) for r in b.get_relations()) for b in brats))

    # Fresh bookkeeping for the relation pass.
    accounted_for = set()
    no_perfect_match = []
    # Relations for which the arguments have perfect span matches
    for (relation, from_brat) in all_relations:
        if relation in accounted_for:
            continue

        matches = get_relation_matches(relation, from_brat, brats)
        accounted_for.update(set(matches))

        if len(matches) < len(annotators):
            no_perfect_match.append((relation, from_brat))
        else:
            # Relation needs to refer to entities in the new set
            ann = translate_relation(relation, from_brat, corrected)
            ann = suffix_annotation_id(get_annotator(from_brat), ann)
            types = set((r.type for r in matches))
            if len(types) > 1:
                # Type of the relation is contested
                ann = set_annotation_type(ann, "FIX_RELATION_TYPE")
            corrected.add_annotation(ann)

    # Relations without a match from every annotator are kept for review.
    for (relation, from_brat) in no_perfect_match:
        ann = prefix_annotation_type(
            suffix_annotation_id(
                get_annotator(from_brat),
                translate_relation(relation, from_brat, corrected)),
            "VERIFY_")
        corrected.add_annotation(ann)

    with codecs.open(correction_file, mode="w", encoding="utf-8") as outputFile:
        outputFile.write(str(corrected))