def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python; for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from httplib import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()
def create_comment(collection, document, id, comment=None):
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)
    document = path_join(real_dir, document)

    projectconf = ProjectConfiguration(real_dir)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    # XXX what is this doing here?
    # path_split(document)[0]

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        _set_special_comments(ann_obj, id, comment, mods,
                              undo_resp=undo_resp)

        mods_json = mods.json_response()

        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)

        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # if there is a previous annotation and the arcs aren't in
        # the same category (e.g. relation vs. event arg), process
        # as delete + create instead of update.
        if old_type is not None and (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type)):
            _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                 ann_obj, projectconf)
            old_target, old_type = None, None

        if projectconf.is_equiv_type(type):
            ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
                                type, attributes, old_type, old_target)
        elif projectconf.is_relation_type(type):
            ann = _create_relation(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        else:
            ann = _create_argument(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)

        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)
        elif comment is not None:
            Messager.warning(
                'create_arc: non-empty comment for None annotation (unsupported type for comment?)')

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
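# Illustrative note on create_arc's delete+create branch (example values are
# made up): retyping an 'Equiv' arc between T1 and T2 into an event argument
# such as 'Theme' flips projectconf.is_equiv_type() between the old and new
# types, so the old arc is deleted via _delete_arc_with_ann and a fresh
# argument is created, rather than attempting an in-place update across
# annotation categories.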
def getAnnObject2(collection, document):
    '''Newest version of the getAnnObject method.'''
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    ann = None
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        import os
        import simplejson as json
        import session
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                from verify_annotations import verify_annotation
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
def getAnnObject(collection, document):
    try:
        real_dir = real_directory(collection)
    except:
        real_dir = collection
    app_path = WORK_DIR + "/application/"
    full_name = collection + document
    full_name = full_name.replace("/", "")
    if os.path.isfile(app_path + full_name):
        temp = open(app_path + full_name, 'rb')
        ann = pickle_load(temp)
        temp.close()
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        ann.folia = {}
        try:
            # TODO: good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    # Validation:
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        # validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    temp = open(app_path + full_name, 'wb')
    pickle_dump(ann, temp)
    temp.close()
    return ann
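# Sketch (not brat code) of the cache-key scheme the two functions above use:
# collection and document are concatenated and '/' is stripped, so the key is
# not injective and distinct (collection, document) pairs can share one pickle.
def _cache_key(collection, document):
    return (collection + document).replace("/", "")

# _cache_key("/corpus/news/", "doc1") == _cache_key("/corpus/", "newsdoc1")
# -> both yield "corpusnewsdoc1"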
def get_status(directory, document):
    with TextAnnotations(path_join(real_directory(directory), document),
                         read_only=True) as ann:

        # XXX: Assume the last one is correct if we have more
        #      than one (which is a violation of protocol anyway)
        statuses = [c for c in ann.get_statuses()]

        if statuses:
            status = statuses[-1].target
        else:
            status = None

    json_dic = {'status': status}
    return json_dic
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                conn.request(
                    'POST',
                    service_url,
                    # The document text as body
                    ann_obj.get_document_text().encode('utf8'),
                    headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
        finally:
            if conn is not None:
                conn.close()
def _document_json_dict(document):
    # TODO: DOC!

    # pointing at directory instead of document?
    if isdir(document):
        raise IsDirectoryError(document)

    j_dic = {}
    _enrich_json_with_base(j_dic)

    # TODO: We don't check if the files exist, let's be more error friendly
    # Read in the textual data to make it ready to push
    _enrich_json_with_text(j_dic, document + '.' + TEXT_FILE_SUFFIX)
    # _enrich_json_with_xml(j_dic, document + '.xml')

    # Fetch the offsets of the document's annotations
    with TextAnnotations(document) as ann_obj:
        # Note: At this stage the sentence offsets can conflict with the
        # annotations, we thus merge any sentence offsets that lie within
        # annotations
        # XXX: ~O(tb_ann * sentence_breaks), can be optimised
        # XXX: The merge strategy can lead to unforeseen consequences if two
        # sentences are not adjacent (the format allows for this:
        # S_1: [0, 10], S_2: [15, 20])
        s_breaks = j_dic['sentence_offsets']
        for tb_ann in ann_obj.get_textbounds():
            s_i = 0
            while s_i < len(s_breaks):
                s_start, s_end = s_breaks[s_i]

                # Does any subspan of the annotation stretch over the
                # end of the sentence?
                found_spanning = False
                for tb_start, tb_end in tb_ann.spans:
                    if tb_start < s_end and tb_end > s_end:
                        found_spanning = True
                        break

                if found_spanning:
                    # Merge this sentence and the next sentence
                    s_breaks[s_i] = (s_start, s_breaks[s_i + 1][1])
                    del s_breaks[s_i + 1]
                else:
                    s_i += 1

        _enrich_json_with_data(j_dic, ann_obj)

    return j_dic
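# Illustrative sketch (not brat code) of the sentence-offset merge above,
# isolated on plain offset lists so it can be tested on its own. Like the
# loop it mirrors, it assumes sentences are adjacent (see the XXX note).
def _merge_sentence_breaks(s_breaks, ann_spans):
    """Merge a sentence with its successor whenever an annotation span
    crosses the sentence end."""
    s_breaks = list(s_breaks)
    for tb_start, tb_end in ann_spans:
        s_i = 0
        while s_i < len(s_breaks):
            s_start, s_end = s_breaks[s_i]
            if tb_start < s_end and tb_end > s_end:
                # span stretches over the sentence end: merge with the next
                s_breaks[s_i] = (s_start, s_breaks[s_i + 1][1])
                del s_breaks[s_i + 1]
            else:
                s_i += 1
    return s_breaks

# _merge_sentence_breaks([(0, 10), (11, 20)], [(8, 15)]) -> [(0, 20)]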
def set_status(directory, document, status=None):
    real_dir = real_directory(directory)

    with TextAnnotations(path_join(real_dir, document)) as ann:
        # Erase all old status annotations
        for status_ann in ann.get_statuses():
            ann.del_annotation(status_ann)

        if status is not None:
            # XXX: This could work, not sure if it can induce an id collision
            new_status_id = ann.get_new_id('#')
            ann.add_annotation(
                OnelineCommentAnnotation(status, new_status_id, 'STATUS', ''))

    json_dic = {'status': status}
    return json_dic
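# Note: set_status and get_status (above) implement a tiny protocol on top of
# comment annotations: document status is stored as a '#'-id
# OnelineCommentAnnotation of type 'STATUS'. set_status first erases every
# existing status annotation, so at most one should normally remain;
# get_status nevertheless tolerates several by taking the last one.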
def _document_json_dict(document):
    # TODO: DOC!

    # pointing at directory instead of document?
    if isdir(document):
        raise IsDirectoryError(document)

    j_dic = {}
    _enrich_json_with_base(j_dic)

    # TODO: We don't check if the files exist, let's be more error friendly
    # Read in the textual data to make it ready to push
    _enrich_json_with_text(j_dic, document + '.' + TEXT_FILE_SUFFIX)

    with TextAnnotations(document) as ann_obj:
        _enrich_json_with_data(j_dic, ann_obj)

    return j_dic
def get_doc(tokens, text):
    doc = TextAnnotations(text=text)
    accu = []
    for token in tokens:
        tag = token[1]
        if tag in "BSO" and accu:
            make_annotation(doc, accu)
            accu = []
        if tag in "BESI":
            accu.append(token)
        if tag in "ES":
            make_annotation(doc, accu)
            accu = []
    if accu:
        make_annotation(doc, accu)
    return doc
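# Hypothetical usage sketch for get_doc (make_annotation is assumed to be the
# module's helper that turns the accumulated tokens into one textbound
# annotation). Tags follow the usual BIESO scheme: Begin/Inside/End/Single
# mark entity tokens, O marks tokens outside any entity.
#
#   text = "Barack Obama visited Paris"
#   tokens = [("Barack", "B"), ("Obama", "E"),
#             ("visited", "O"), ("Paris", "S")]
#   doc = get_doc(tokens, text)
#   # -> one annotation for ("Barack", "Obama") and one for ("Paris",)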
def delete_arc(collection, document, origin, target, type):
    directory = collection

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        _delete_arc_with_ann(origin, target, type, mods, ann_obj, projectconf)

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def delete_span(collection, document, id):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # TODO: Handle a failure to find it
        # XXX: Slow, O(2N)
        ann = ann_obj.get_ann_by_id(id)
        try:
            # Note: need to pass the tracker to del_annotation to track
            # recursive deletes. TODO: make usage consistent.
            ann_obj.del_annotation(ann, mods)
            try:
                trig = ann_obj.get_ann_by_id(ann.trigger)
                try:
                    ann_obj.del_annotation(trig, mods)
                except DependingAnnotationDeleteError:
                    # Someone else depended on that trigger
                    pass
            except AttributeError:
                pass
        except DependingAnnotationDeleteError, e:
            Messager.error(e.html_error_str())
            return {
                'exception': True,
            }

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def convert(path, fname):
    from message import Messager
    try:
        from os.path import join as path_join
        from document import real_directory
        real_dir = real_directory(path)
    except:
        real_dir = path
    full_path = path_join(real_dir, fname)
    entity_ids = {}
    try:
        doc = folia.Document(file=full_path + ".xml")
        temp = open(full_path + ".ann", 'w')
        txt = open(full_path + ".txt", 'w')
    except IOError as e:
        Messager.error("IOError " + str(e))
        return {
            'result': False,
        }
    ann_obj = TextAnnotations(full_path)
    text, offsets = parse_text(doc)
    with SimpleAnnotations(ann_obj) as ann:
        add_entities(doc, ann, entity_ids, offsets, text)
        add_relations(doc, ann, entity_ids, offsets)
        add_attributes(doc, ann, entity_ids)
        add_comments(doc, ann, entity_ids)
        try:
            ann.folia = get_extra_info(path, fname)
        except:
            Messager.error("get_extra_info() from folia failed")
            ann.folia = {}
        txt.write(text)
        txt.close()
        #~ temp.write(str(ann))
        #~ temp.close()
        make_conf_file(real_dir, ann)
    # return is needed for the client, so it can see the function is done;
    # this can take a few seconds
    return {
        'result': True,
    }
def reverse_arc(collection, document, origin, target, type, attributes=None):
    directory = collection
    # undo_resp = {}  # TODO
    real_dir = real_directory(directory)
    # mods = ModificationTracker()  # TODO
    projectconf = ProjectConfiguration(real_dir)
    document = urllib.parse.unquote(document)
    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            found = None
            # TODO: more sensible lookup
            for ann in ann_obj.get_relations():
                if (ann.arg1 == origin and ann.arg2 == target and
                        ann.type == type):
                    found = ann
                    break

            if found is None:
                Messager.error(
                    'reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' %
                    (str(origin), str(target), str(type)))
            else:
                # found it; just adjust this
                found.arg1, found.arg2 = found.arg2, found.arg1
                # TODO: modification tracker

        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
        try:
            found = fdoc[attr.target]
            if not any(fattr.cls == str(attr.value) and
                       fattr.subset == attr.type
                       for fattr in found.select(folia.Feature)):
                print "error: not found attr"
                print attr
                print
                return False
        except KeyError:
            print "error: not found attr"
            print rel
            return False
    print "file " + path + doc + " is OK"
    return True


if __name__ == '__main__':
    #convert("/home/sander/Documenten/Masterproef/pythontest/","folia")
    convert("/home/sander/Documents/Masterproef-v2/brat/data/brat_vb/sentiment/", "sporza")
    ann = TextAnnotations("/home/sander/Documents/Masterproef-v2/brat/data/brat_vb/sentiment/sporza")
    #print ann._document_text
    #~ compare("/home/sander/Downloads/brat/data/testen/sentiment/","detijd_other_Bekaert_12-05-05")
    #~ compare("/home/sander/Downloads/brat/data/testen/sentiment/","detijd_other_bedrijfMulti_06-05-05")
    #~ compare("/home/sander/Downloads/brat/data/testen/vertaling/","1722_all_+DM")
    #~ compare("/home/sander/Downloads/brat/data/testen/sentiment/","testdoc")
    print "saved"
    #folia.validate("/home/sander/Documenten/Masterproef/pythontest/detijd_other_Bekaert_12-05-05.xml",deep="true")
def delete_arc(collection, document, origin, target, type):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # This can be an event or an equiv
        # TODO: Check for None!
        try:
            event_ann = ann_obj.get_ann_by_id(origin)
            # Try if it is an event
            arg_tup = (type, unicode(target))
            if arg_tup in event_ann.args:
                before = unicode(event_ann)
                event_ann.args.remove(arg_tup)
                mods.change(before, event_ann)

                '''
                if not event_ann.args:
                    # It was the last argument tuple, remove it all
                    try:
                        ann_obj.del_annotation(event_ann)
                        mods.deletion(event_ann)
                    except DependingAnnotationDeleteError, e:
                        #XXX: Old message api
                        print 'Content-Type: application/json\n'
                        print dumps(e.json_error_response())
                        return
                '''
            else:
                # What we were to remove did not even exist in the first
                # place
                pass
        except AttributeError:
            projectconf = ProjectConfiguration(real_dir)
            if projectconf.is_equiv_type(type):
                # It is an equiv then?
                # XXX: Slow hack! Should have a better accessor! O(eq_ann)
                for eq_ann in ann_obj.get_equivs():
                    # We don't assume that the ids only occur in one Equiv, we
                    # keep on going since the data "could" be corrupted
                    if (unicode(origin) in eq_ann.entities
                            and unicode(target) in eq_ann.entities):
                        before = unicode(eq_ann)
                        eq_ann.entities.remove(unicode(origin))
                        eq_ann.entities.remove(unicode(target))
                        mods.change(before, eq_ann)

                        if len(eq_ann.entities) < 2:
                            # We need to delete this one
                            try:
                                ann_obj.del_annotation(eq_ann)
                                mods.deletion(eq_ann)
                            except DependingAnnotationDeleteError, e:
                                # TODO: This should never happen, dep on equiv
                                #print 'Content-Type: application/json\n'
                                # TODO: Proper exception here!
                                Messager.error(e.json_error_response())
                                return {}
            elif type in projectconf.get_relation_types():
                for ann in ann_obj.get_relations():
                    if (ann.type == type and ann.arg1 == origin
                            and ann.arg2 == target):
                        ann_obj.del_annotation(ann)
                        mods.deletion(ann)
                        break
            else:
                # unknown arc type; nothing we can delete
                pass

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def _create_span(collection, document, offsets, _type, attributes=None,
                 normalizations=None, _id=None, comment=None):

    if _offset_overlaps(offsets):
        raise SpanOffsetOverlapError(offsets)

    directory = collection
    undo_resp = {}

    _attributes = _parse_attributes(attributes)
    _normalizations = _parse_span_normalizations(normalizations)

    # log_info('ATTR: %s' %(_attributes, ))

    real_dir = real_directory(directory)
    document = path_join(real_dir, document)

    projectconf = ProjectConfiguration(real_dir)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    path_split(document)[0]

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        if _id is not None:
            # We are to edit an existing annotation
            tb_ann, e_ann = _edit_span(ann_obj, mods, _id, offsets,
                                       projectconf, _attributes, _type,
                                       undo_resp=undo_resp)
        else:
            # We are to create a new annotation
            tb_ann, e_ann = __create_span(
                ann_obj, mods, _type, offsets, txt_file_path, projectconf,
                _attributes)

            undo_resp['action'] = 'add_tb'
            if e_ann is not None:
                undo_resp['id'] = e_ann.id
            else:
                undo_resp['id'] = tb_ann.id

        # Determine which annotation attributes, normalizations,
        # comments etc. should be attached to. If there's an event,
        # attach to that; otherwise attach to the textbound.
        if e_ann is not None:
            # Assign to the event, not the trigger
            target_ann = e_ann
        else:
            target_ann = tb_ann

        # Set attributes
        _set_attributes(ann_obj, target_ann, _attributes, mods,
                        undo_resp=undo_resp)

        # Set normalizations
        _set_normalizations(ann_obj, target_ann, _normalizations, mods,
                            undo_resp=undo_resp)

        # Set comments
        if tb_ann is not None:
            _set_comments(ann_obj, target_ann, comment, mods,
                          undo_resp=undo_resp)

        if tb_ann is not None:
            mods_json = mods.json_response()
        else:
            # Hack, probably we had a new-line in the span
            mods_json = {}
            Messager.error('Text span contained new-line, rejected',
                           duration=3)

        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)

        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def split_span(collection, document, args, id):
    directory = collection

    real_dir = real_directory(directory)

    document = path_join(real_dir, document)

    # TODO don't know how to pass an array directly, so doing extra catenate
    # and split
    tosplit_args = json_loads(args)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        ann = ann_obj.get_ann_by_id(id)

        # currently only allowing splits for events
        if not isinstance(ann, EventAnnotation):
            raise AnnotationSplitError(
                "Cannot split an annotation of type %s" % ann.type)

        # group event arguments into ones that will be split on and
        # ones that will not, placing the former into a dict keyed by
        # the argument without trailing numbers (e.g. "Theme1" ->
        # "Theme") and the latter in a straight list.
        split_args = {}
        nonsplit_args = []
        import re
        for arg, aid in ann.args:
            m = re.match(r'^(.*?)\d*$', arg)
            if m:
                arg = m.group(1)
            if arg in tosplit_args:
                if arg not in split_args:
                    split_args[arg] = []
                split_args[arg].append(aid)
            else:
                nonsplit_args.append((arg, aid))

        # verify that split is possible
        for a in tosplit_args:
            acount = len(split_args.get(a, []))
            if acount < 2:
                raise AnnotationSplitError(
                    "Cannot split %s on %s: only %d %s arguments (need two or more)" %
                    (ann.id, a, acount, a))

        # create all combinations of the args on which to split
        argument_combos = [[]]
        for a in tosplit_args:
            new_combos = []
            for aid in split_args[a]:
                for c in argument_combos:
                    new_combos.append(c + [(a, aid)])
            argument_combos = new_combos

        # create the new events (first combo will use the existing event)
        from copy import deepcopy
        new_events = []
        for i, arg_combo in enumerate(argument_combos):
            # tweak args
            if i == 0:
                ann.args = nonsplit_args[:] + arg_combo
            else:
                newann = deepcopy(ann)
                # TODO: avoid hard-coding ID prefix
                newann.id = ann_obj.get_new_id("E")
                newann.args = nonsplit_args[:] + arg_combo
                ann_obj.add_annotation(newann)
                new_events.append(newann)
                mods.addition(newann)

        # then, go through all the annotations referencing the original
        # event, and create appropriate copies
        for a in ann_obj:
            soft_deps, hard_deps = a.get_deps()
            refs = soft_deps | hard_deps
            if ann.id in refs:
                # Referenced; make duplicates appropriately
                if isinstance(a, EventAnnotation):
                    # go through args and make copies for referencing
                    new_args = []
                    for arg, aid in a.args:
                        if aid == ann.id:
                            for newe in new_events:
                                new_args.append((arg, newe.id))
                    a.args.extend(new_args)
                elif isinstance(a, AttributeAnnotation):
                    for newe in new_events:
                        newmod = deepcopy(a)
                        newmod.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newmod.id = ann_obj.get_new_id("A")
                        ann_obj.add_annotation(newmod)
                        mods.addition(newmod)
                elif isinstance(a, BinaryRelationAnnotation):
                    # TODO
                    raise AnnotationSplitError(
                        "Cannot adjust annotation referencing split: not implemented for relations! (WARNING: annotations may be in inconsistent state, please reload!) (Please complain to the developers to fix this!)")
                elif isinstance(a, OnelineCommentAnnotation):
                    for newe in new_events:
                        newcomm = deepcopy(a)
                        newcomm.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newcomm.id = ann_obj.get_new_id("#")
                        ann_obj.add_annotation(newcomm)
                        mods.addition(newcomm)
                elif isinstance(a, NormalizationAnnotation):
                    for newe in new_events:
                        newnorm = deepcopy(a)
                        newnorm.target = newe.id
                        # TODO: avoid hard-coding ID prefix
                        newnorm.id = ann_obj.get_new_id("N")
                        ann_obj.add_annotation(newnorm)
                        mods.addition(newnorm)
                else:
                    raise AnnotationSplitError(
                        "Cannot adjust annotation referencing split: not implemented for %s! (Please complain to the lazy developers to fix this!)" %
                        a.__class__)

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
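# Illustrative sketch (not brat code): the argument_combos construction in
# split_span above is a cartesian product over the to-split argument slots;
# itertools.product expresses the same thing directly.
from itertools import product

def _split_combos(split_args, tosplit_args):
    """split_args maps a base role (e.g. 'Theme') to its argument ids;
    returns one [(role, id), ...] combination per resulting event."""
    return [list(zip(tosplit_args, combo))
            for combo in product(*(split_args[a] for a in tosplit_args))]

# _split_combos({'Theme': ['T1', 'T2']}, ['Theme'])
# -> [[('Theme', 'T1')], [('Theme', 'T2')]]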
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    print("tagger", tagger, file=sys.stderr)

    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)
    # print("path_join(real_directory(collection), document)", doc_path, file=sys.stderr)
    # print("tagger_token", tagger_token, file=sys.stderr)

    with TextAnnotations(doc_path) as ann_obj:
        # print("ann_obj", document, file=sys.stderr)

        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
            # print("HTTPConnection", HTTPConnection, file=sys.stderr)
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python; for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                # Note: Trout slapping for anyone sending Unicode objects here
                data = str(doc_path) + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                # print("data", type(data), data, file=sys.stderr)
                # print("data", ann_obj, file=sys.stderr)
                req_headers['Content-length'] = len(data)
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
                # httpConnection = http.client.HTTPConnection(url_soup.netloc)
                # httpConnection.request('GET', str(service_url), headers=req_headers)
                # response = httpConnection.getresponse()
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()
            # print("resp-------------", resp.read(), file=sys.stderr)

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
            # print("json_resp", json_resp, file=sys.stderr)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        cidmap = {}

        # print("json_resp.items:::::::::::::", json_resp.items(), file=sys.stderr)
        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            # print("json_resp.items:::::::::::::", offsets, file=sys.stderr)
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), \
                'Tagger response has different numbers of offsets and texts'

            start, end = offsets[0]
            text = texts[0]
            # print("offsets, _type, texts, text:", offsets, _type, texts, text, file=sys.stderr)

            _id = ann_obj.get_new_id('T')
            print("_id", _id, file=sys.stderr)
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
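# For reference, a minimal tagger response accepted by the loops above (the
# shape is inferred from the assertions; ids and values here are made up):
#
#   {
#       "T1": {"offsets": [[0, 6]], "type": "Person", "texts": ["Barack"]},
#       "N1": {"type": "Reference", "target": "T1",
#              "refdb": "Wikipedia", "refid": "534366"}
#   }
#
# Textbound entries must carry equally long 'offsets' and 'texts' lists;
# normalization entries point at a textbound via 'target', which is remapped
# through cidmap to the id the server actually assigned.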