Ejemplo n.º 1
0
def diff_files(first_name, second_name, result_name):
    """Diff two annotation documents and record the outcome in a third.

    Every name is first passed through ``name_without_extension``.  The
    result document is produced by ``copy_annotations`` from the second
    document, and ``AnnotationDiff(...).diff()`` is run while the result
    object is active as a context manager.
    """
    bare_first, bare_second, bare_result = (
        name_without_extension(name)
        for name in (first_name, second_name, result_name)
    )

    first = annotation.TextAnnotations(bare_first)
    second = annotation.TextAnnotations(bare_second)
    # The result starts out as a copy of the second document's files.
    result = copy_annotations(bare_second, bare_result)

    with result:
        differ = AnnotationDiff(first, second, result)
        differ.diff()
Ejemplo n.º 2
0
def merge_annotations_only_linking(identifier, result_dir, base_dir,
                                   linking_dir):
    """Write the base annotations of 'identifier' after transferring comments.

    The annotations are loaded from 'base_dir' and 'linking_dir'.  For every
    entity of the base document ``transfer_comments`` is called with the
    linking document as the only source, and the base document is then
    serialized with ``str()`` into the file returned by
    ``create_correction_file``.
    """
    output_path = create_correction_file(identifier, result_dir, base_dir)

    base_path = os.path.join(base_dir, identifier)
    base = annotation.TextAnnotations(base_path)
    linking_path = os.path.join(linking_dir, identifier)
    linking = annotation.TextAnnotations(linking_path)

    for base_entity in base.get_entities():
        transfer_comments(base, base_entity, [linking])

    with codecs.open(output_path, mode="w", encoding="utf-8") as out_file:
        out_file.write(str(base))
Ejemplo n.º 3
0
def get_annotator_brats(annotator_dirs, identifier):
    """Load the annotations of 'identifier' from every annotator directory.

    Returns a dict keyed by the base name of each directory in
    'annotator_dirs'; each value is the ``TextAnnotations`` object opened at
    ``<directory>/<identifier>``.
    """
    return {
        os.path.basename(annotator_dir): annotation.TextAnnotations(
            os.path.join(annotator_dir, identifier))
        for annotator_dir in annotator_dirs
    }
def main(argv=None):
    """Run ``verify_annotation`` on each file given on the command line.

    'argv' defaults to ``sys.argv``; its first element is skipped before
    parsing.  Issues are printed to stdout as ``<file>:\\t<issue>``, while
    missing files or annotations are reported on stderr.
    """
    import os
    import sys

    parsed = argparser().parse_args((sys.argv if argv is None else argv)[1:])

    for path in parsed.files:
        try:
            conf = ProjectConfiguration(os.path.dirname(path))
            # Drop ".a2" / ".rel" so that Annotations also parses the .a1
            # file.  (TODO from the original: ".ann" is dropped as well as a
            # temporary workaround for a TextAnnotations bug; this should
            # not be necessary.)
            bare_path = path
            for suffix in (".a2", ".rel", ".ann"):
                bare_path = bare_path.replace(suffix, "")
            with annotation.TextAnnotations(bare_path) as ann_obj:
                for issue in verify_annotation(ann_obj, conf):
                    print("%s:\t%s" % (path, issue.human_readable_str()))
        except annotation.AnnotationFileNotFoundError:
            print("%s:\tFailed check: file not found" % path, file=sys.stderr)
        except annotation.AnnotationNotFoundError as err:
            print("%s:\tFailed check: %s" % (path, err), file=sys.stderr)

    if parsed.verbose:
        print("Check complete.", file=sys.stderr)
def correct_annotations(orig_fn, ann_fn, change_fn):
    """Re-anchor the text-bound annotations of 'ann_fn' onto an edited text.

    The document text of the annotations is diffed against the contents of
    'change_fn'.  Every span boundary located at or after the starting
    offset of a diff segment is shifted by that segment's signed size
    (segment length multiplied by its operation code), and the text of each
    ``TextBoundAnnotationWithText`` is rebuilt from the changed text.
    Afterwards ``copy(change_fn, orig_fn)`` is called.

    Args:
        orig_fn: destination passed to the final ``copy`` call.
        ann_fn: name handed to ``annotation.TextAnnotations``.
        change_fn: text file holding the edited document text.
    """
    with annotation.TextAnnotations(ann_fn) as anns:
        orig_text = anns.get_document_text()
        with annotation.open_textfile(change_fn, 'r') as f:
            changed_text = f.read()
        # Each diff is indexed below as (operation, text).
        # NOTE(review): presumably diff_match_patch's codes, -1 = delete,
        # 0 = equal, 1 = insert -- confirm against the library in use.
        diffs = diff_match_patch().diff_main(orig_text, changed_text)
        orig_offset = 0
        change_offset = 0  # never read in this function
        offsets = []
        for diff in diffs:
            kind = diff[0]
            text = diff[1]
            size = len(text)
            # Signed shift: zero when kind is 0, negative when kind is -1.
            delta = size * kind
            offsets.append((orig_offset, delta))
            # Only segments with kind != 1 advance the original-text offset.
            if kind != 1:
                orig_offset += size
        # Process the segments from the last one to the first.
        offsets = offsets[::-1]
        tbs = list(anns.get_textbounds())
        # One entry per span boundary:
        # (position, textbound index, span index, 0 for start / 1 for end).
        indices = []
        for tbi, tb in enumerate(tbs):
            for spani, span in enumerate(tb.spans):
                indices.append((span[0], tbi, spani, 0))
                indices.append((span[1], tbi, spani, 1))
        # Highest positions first, so the inner loop below can stop early.
        indices.sort(reverse=True)
        for orig_offset, delta in offsets:
            for index in indices:
                # 'indices' keeps the original positions; everything from
                # here on lies before this segment and is left untouched.
                if index[0] < orig_offset: break
                # Rebuild the span via a list and store it back as a tuple.
                frag = list(tbs[index[1]].spans[index[2]])
                frag[index[3]] += delta
                tbs[index[1]].spans[index[2]] = tuple(frag)
        for tb in tbs:
            if isinstance(tb, annotation.TextBoundAnnotationWithText):
                # Re-read the covered text from the changed document.
                tb.text = annotation.DISCONT_SEP.join((changed_text[start:end] for start, end in tb.spans))
    # Runs after the annotations context manager has exited.
    copy(change_fn, orig_fn)
Ejemplo n.º 6
0
def copy_annotations(original_name, new_name):
    """Copy the files of one annotated document and open the copy.

    For each suffix in ``KNOWN_FILE_SUFF`` the file
    ``<original_name>.<suffix>`` is copied to ``<new_name>.<suffix>``;
    copies that fail with ``IOError`` are skipped.  Returns
    ``annotation.TextAnnotations(new_name)``.
    """
    import shutil
    for suffix in KNOWN_FILE_SUFF:
        source_path = '%s.%s' % (original_name, suffix)
        target_path = '%s.%s' % (new_name, suffix)
        try:
            shutil.copyfile(source_path, target_path)
        except IOError:
            # Typically there is simply no file with this suffix.
            continue
    return annotation.TextAnnotations(new_name)
Ejemplo n.º 7
0
def load_doc(identifier):
    """Load the document named by 'identifier' as an EnhancedAnnotatedDoc.

    An identifier is the path shared by a .ann file and a .txt file, without
    either extension.  If loading raises an AssertionError, a message is
    written to stderr and None is returned.
    """
    try:
        brat = annotation.TextAnnotations(identifier)
        doc = EnhancedAnnotatedDoc(brat)
    except AssertionError as err:
        message = "Failed to load doc {} with error {}\n".format(
            identifier, err)
        sys.stderr.write(message)
        return None
    return doc
Ejemplo n.º 8
0
def convert(doc_bare, result, ssplitter):
    """Write the annotations of 'doc_bare' to 'result' as an XML tree.

    The tree has a ``document`` root with ``sentences`` and ``annotations``
    children.  They are filled in by ``collect_sentences_and_get_words``
    (which receives 'ssplitter') and ``collect_annotations`` respectively.
    """
    brat = annotation.TextAnnotations(doc_bare)

    root = ET.Element("document")
    sentences_node = ET.SubElement(root, "sentences")
    annotations_node = ET.SubElement(root, "annotations")

    # The words returned for the sentences are needed for the annotations.
    tokens = collect_sentences_and_get_words(sentences_node, brat, ssplitter)
    collect_annotations(annotations_node, brat, tokens)

    ET.ElementTree(root).write(result)
Ejemplo n.º 9
0
def ent2event(anntype, fn):
    """Turn every entity of type 'anntype' in 'fn' into an event.

    For each targeted entity a new event of the same type is created with
    the entity as its trigger.  Event arguments and one-line comment targets
    that referred to the entity are re-pointed at the new event.  Missing
    files or annotations are reported on stderr; the number of mapped
    entities is printed there too when ``options.verbose`` is set.
    """
    global options

    converted = 0

    try:
        # TextAnnotations wants the name without a possible ".ann" suffix.
        bare_fn = fn.replace(".ann", "")

        with annotation.TextAnnotations(bare_fn) as ann_obj:

            for entity in ann_obj.get_entities():
                if entity.type != anntype:
                    # Not one of the entities we were asked to map.
                    continue

                # Step 1: build an event of the same type that uses the
                # entity as its trigger.
                event_id = ann_obj.get_new_id('E')
                event = annotation.EventAnnotation(entity.id, [], event_id,
                                                   entity.type, '')

                # Step 2: references to the entity held by existing events
                # and comments must now refer to the new event instead.
                for existing in ann_obj.get_events():
                    for position, (role, argid) in enumerate(existing.args):
                        if argid == entity.id:
                            existing.args[position] = role, event_id
                for comment in ann_obj.get_oneline_comments():
                    if comment.target == entity.id:
                        comment.target = event_id

                # Step 3: register the new event itself.
                ann_obj.add_annotation(event)

                converted += 1

            if options.verbose:
                print(converted, 'mapped in', fn, file=sys.stderr)

    except annotation.AnnotationFileNotFoundError:
        print("%s:\tFailed: file not found" % fn, file=sys.stderr)
    except annotation.AnnotationNotFoundError as err:
        print("%s:\tFailed: %s" % (fn, err), file=sys.stderr)
Ejemplo n.º 10
0
def ent2event(anntype, fn):
    """Turn every entity of type 'anntype' in 'fn' into an event.

    For each matching entity a new event annotation of the same type is
    created, with the entity's id as its first constructor argument
    (presumably the trigger -- confirm against ``EventAnnotation``).
    Arguments of existing events and targets of one-line comments that
    referenced the entity are re-pointed at the new event id.

    Args:
        anntype: entity type to convert; other entities are left alone.
        fn: annotation file name; any ".ann" substring is removed before the
            name is handed to ``annotation.TextAnnotations``.

    Missing files or annotations are reported on stderr instead of being
    raised.  When the module-level ``options.verbose`` is set, the number of
    mapped entities is written to stderr as well.
    """
    global options

    mapped = 0

    try:
        nosuff_fn = fn.replace(".ann", "")

        with annotation.TextAnnotations(nosuff_fn) as ann_obj:

            for ann in ann_obj.get_entities():
                if ann.type != anntype:
                    continue

                new_id = ann_obj.get_new_id('E')
                eann = annotation.EventAnnotation(ann.id, [], new_id,
                                                  ann.type, '')

                # Re-point references to the entity at the new event.
                for e in ann_obj.get_events():
                    for i, (role, argid) in enumerate(e.args):
                        if argid == ann.id:
                            e.args[i] = role, new_id
                for c in ann_obj.get_oneline_comments():
                    if c.target == ann.id:
                        c.target = new_id

                ann_obj.add_annotation(eann)

                mapped += 1

            if options.verbose:
                # sys.stderr.write is valid on both Python 2 and Python 3,
                # unlike the "print >>" statement it replaces.
                sys.stderr.write("%s mapped in %s\n" % (mapped, fn))

    except annotation.AnnotationFileNotFoundError:
        sys.stderr.write("%s:\tFailed: file not found\n" % fn)
    except annotation.AnnotationNotFoundError as e:
        sys.stderr.write("%s:\tFailed: %s\n" % (fn, e))
Ejemplo n.º 11
0
def verify(args):
    """Verify that there are no contested annotations remaining in 'args.correction_dir'.

    Every identifier returned by ``ai2_common.get_identifiers`` is loaded and
    its typed annotations are checked with ``is_annotation_contested``.  The
    per-document count is logged at debug level.  If any document still has
    contested annotations, a warning with the number of such documents is
    logged and the process exits with status 1.
    """
    identifiers = ai2_common.get_identifiers(args.correction_dir)

    not_finished = []
    for identifier in identifiers:
        brat = annotation.TextAnnotations(identifier)
        unresolved = [
            a for a in brat
            if isinstance(a, annotation.TypedAnnotation)
            and is_annotation_contested(a)
        ]
        logging.debug("{} has {} unresolved conflicts".format(
            identifier, len(unresolved)))
        if unresolved:
            not_finished.append((identifier, unresolved))

    if not_finished:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling.
        logging.warning("{} files with unresolved annotations".format(
            len(not_finished)))
        sys.exit(1)
Ejemplo n.º 12
0
def display_discontinuous(files, verbose):
    """Print how many entities in 'files' are reported by find_discontinuous.

    With 'verbose' set, each file that has at least one such entity gets its
    own section listing them.  The overall totals are always printed last.
    """
    entity_total = 0
    discontinuous_total = 0
    for path in files:
        doc = annotation.TextAnnotations(path)
        n_entities = sum(1 for _ in doc.get_entities())
        found = find_discontinuous(doc)
        n_found = len(found)
        entity_total += n_entities
        discontinuous_total += n_found
        # Per-file details only when asked for and when there is something
        # to show.
        if verbose and entity_total != 0 and n_found != 0:
            print("\n" + "-" * 60)
            print("{} discontinous entities out of {} in {}".format(
                n_found, n_entities, path))
            for entity in found:
                print_entity_mention(entity, doc.get_document_text())

    print("Total number of entities: {}".format(entity_total))
    print("Total number of discontinous entites: {}".format(
        discontinuous_total))
Ejemplo n.º 13
0
def __filenames_to_annotations(filenames):
    """
    Open a read-only TextAnnotations object for each of the given file names.

    Names whose file or annotation cannot be found are reported on stderr
    and left out of the collected list.

    NOTE(review): no return statement is visible and 'process_start' is
    never read here -- the definition looks truncated; confirm against the
    full source before relying on a return value.
    """
    
    # TODO: error output should be done via messager to allow
    # both command-line and GUI invocations

    global REPORT_SEARCH_TIMINGS
    if REPORT_SEARCH_TIMINGS:
        # Start time, recorded only when timing reports are enabled.
        process_start = datetime.now()

    anns = []
    for fn in filenames:
        try:
            # remove suffixes for Annotations to prompt parsing of .a1
            # also.
            nosuff_fn = fn.replace(".ann","").replace(".a2","").replace(".rel","")
            ann_obj = annotation.TextAnnotations(nosuff_fn, read_only=True)
            anns.append(ann_obj)
        except annotation.AnnotationFileNotFoundError:
            print >> sys.stderr, "%s:\tFailed: file not found" % fn
        except annotation.AnnotationNotFoundError, e:
            print >> sys.stderr, "%s:\tFailed: %s" % (fn, e)
Ejemplo n.º 14
0
def display_overlapping(files, verbose):
    """Print how many entity pairs in 'files' are reported by find_overlapping.

    With 'verbose' set, each file that has at least one such pair gets its
    own section listing them.  The overall totals are always printed last.
    """
    entity_total = 0
    overlapping_total = 0
    for path in files:
        doc = annotation.TextAnnotations(path)
        n_entities = sum(1 for _ in doc.get_entities())
        pairs = find_overlapping(doc)
        n_pairs = len(pairs)
        entity_total += n_entities
        overlapping_total += n_pairs
        # Per-file details only when asked for and when there is something
        # to show.
        if verbose and entity_total != 0 and n_pairs != 0:
            print("\n" + "-" * 60)
            print("{} overlapping pairs out of {} entities in {}".format(
                n_pairs, n_entities, path))
            for pair in pairs:
                print_overlapping_entity_mentions(
                    pair[0], pair[1], doc.get_document_text())

    print("Total number of entities: {}".format(entity_total))
    print("Total number of overlapping entites: {}".format(
        overlapping_total))
Ejemplo n.º 15
0

def name_without_extension(file_name):
    """Return 'file_name' with every match of EXTENSIONS_RE removed."""
    import re
    extension_pattern = re.compile(EXTENSIONS_RE)
    return extension_pattern.sub('', file_name)


def copy_annotations(original_name, new_name):
    """Copy the files of one annotated document and open the copy.

    For each suffix in ``KNOWN_FILE_SUFF`` the file
    ``<original_name>.<suffix>`` is copied to ``<new_name>.<suffix>``.
    Copies that fail with ``IOError`` (typically because no file with that
    suffix exists) are skipped on purpose.

    Returns:
        ``annotation.TextAnnotations(new_name)``.
    """
    import shutil
    for extension in KNOWN_FILE_SUFF:
        try:
            shutil.copyfile('%s.%s' % (original_name, extension),
                            '%s.%s' % (new_name, extension))
        except IOError:
            # Best effort: not every suffix has a file.  The unused
            # Python-2-only "except IOError, e" binding is dropped so the
            # function parses on Python 3 as well.
            pass
    return annotation.TextAnnotations(new_name)


def delete_annotations(name):
    """Delete every known annotation file belonging to 'name'.

    'name' is first reduced with ``name_without_extension``, then one
    removal is attempted for each suffix in ``KNOWN_FILE_SUFF``.  Removals
    that fail with ``OSError`` (typically because the file does not exist)
    are skipped on purpose.
    """
    bare_name = name_without_extension(name)
    for extension in KNOWN_FILE_SUFF:
        try:
            # Build the path from the bare name: using 'name' directly would
            # target e.g. "doc.ann.ann" when the caller passes "doc.ann".
            os.remove('%s.%s' % (bare_name, extension))
        except OSError:
            pass


def diff_files(first_name, second_name, result_name):
    """Reduce the three given names with ``name_without_extension``.

    NOTE(review): only these three assignments are visible; the rest of the
    function appears to be cut off here -- confirm against the full source.
    """
    first_bare = name_without_extension(first_name)
    second_bare = name_without_extension(second_name)
    result_bare = name_without_extension(result_name)
Ejemplo n.º 16
0
def merge_annotations(identifier, correction_dir, annotator_dirs):
    """Combines the brat annotations for 'identifier' from each dir in 'annotator_dirs'.

    Overwrites any existing file in 'correction_dir'.

    Works according to the scheme laid out in:
    docs.google.com/document/d/1zj5WAAykZfrPJwaKtv-AUD0m9BrVH6ybcl17PunnIgc

    It is a significant invariant that there will only be one entity with any
    given set of spans.

    Entities and relations matched across all annotators are added to the
    correction document (retyped to "FIX_TYPE" / "FIX_RELATION_TYPE" when
    their types disagree).  The remaining ones are added with a "FIX_SPAN_"
    or "VERIFY_" type prefix.  The merged document is finally serialized
    with ``str()`` into the correction file.
    """
    annotator_brats = get_annotator_brats(annotator_dirs, identifier)
    # Only the number of annotators is used below.
    annotators = annotator_brats.keys()
    brats = annotator_brats.values()
    # The first annotator directory is the one handed to
    # create_correction_file.
    correction_file = create_correction_file(identifier, correction_dir,
                                             annotator_dirs[0])
    corrected = annotation.TextAnnotations(
        os.path.join(correction_dir, identifier))

    # Lazily pair every entity with the annotations object it came from.
    all_entities = itertools.chain.from_iterable(
        (((e, b) for e in b.get_entities()) for b in brats))
    # Entities already covered by an earlier match are skipped.
    accounted_for = set()
    no_perfect_match = []

    # Entities with perfect span matches
    for (entity, from_brat) in all_entities:
        if entity in accounted_for:
            continue
        matches = get_entity_matches(entity, brats)
        accounted_for.update(set(matches))
        # Fewer matches than annotators: handled in the second pass below.
        if len(matches) < len(annotators):
            no_perfect_match.append((entity, from_brat))
        else:
            ann = suffix_annotation_id(get_annotator(from_brat), entity)
            types = set((e.type for e in matches))
            if len(types) > 1:
                # Type of the entity is contested
                ann = set_annotation_type(ann, "FIX_TYPE")
            corrected.add_annotation(ann)

    for (entity, from_brat) in no_perfect_match:
        id_prefixed = suffix_annotation_id(get_annotator(from_brat), entity)
        if len(get_entity_overlaps(entity, brats)) > 1:
            # With some overlap (other than itself)
            ann = prefix_annotation_type(id_prefixed, "FIX_SPAN_")
        else:
            # With no overlap
            ann = prefix_annotation_type(id_prefixed, "VERIFY_")
        corrected.add_annotation(ann)

    # Transfer comments on entities
    for entity in corrected.get_entities():
        transfer_comments(corrected, entity, brats)

    # Same two-pass scheme as above, now for relations; the bookkeeping
    # containers are reset.
    all_relations = itertools.chain.from_iterable(
        (((r, b) for r in b.get_relations()) for b in brats))
    accounted_for = set()
    no_perfect_match = []

    # Relations for which the arguments have perfect span matches
    for (relation, from_brat) in all_relations:
        if relation in accounted_for:
            continue
        matches = get_relation_matches(relation, from_brat, brats)
        accounted_for.update(set(matches))
        if len(matches) < len(annotators):
            no_perfect_match.append((relation, from_brat))
        else:
            # Relation needs to refer to entities in the new set
            ann = translate_relation(relation, from_brat, corrected)
            ann = suffix_annotation_id(get_annotator(from_brat), ann)
            types = set((r.type for r in matches))
            if len(types) > 1:
                # Type of the relation is contested
                ann = set_annotation_type(ann, "FIX_RELATION_TYPE")
            corrected.add_annotation(ann)

    # Relations without a full set of matches are translated, id-suffixed
    # and flagged for verification.
    for (relation, from_brat) in no_perfect_match:
        ann = prefix_annotation_type(
            suffix_annotation_id(
                get_annotator(from_brat),
                translate_relation(relation, from_brat, corrected)), "VERIFY_")
        corrected.add_annotation(ann)

    # Serialize the merged document into the correction file as UTF-8.
    with codecs.open(correction_file, mode="w",
                     encoding="utf-8") as outputFile:
        outputFile.write(str(corrected))