Beispiel #1
0
def _set_normalizations(ann_obj, ann, normalizations, mods, undo_resp={}):
    # Find existing normalizations (if any)
    existing_norm_anns = set(
        (a for a in ann_obj.get_normalizations() if a.target == ann.id))

    # Note the existing annotations for undo
    undo_resp['normalizations'] = json_dumps([(n.refdb, n.refid, n.reftext)
                                              for n in existing_norm_anns])

    # Organize into dictionaries for easier access
    old_norms = dict([((n.refdb, n.refid), n) for n in existing_norm_anns])
    new_norms = dict([((n[0], n[1]), n[2]) for n in normalizations])

    #Messager.info("Old norms: "+str(old_norms))
    #Messager.info("New norms: "+str(new_norms))

    # sanity check
    for refdb, refid, refstr in normalizations:
        # TODO: less aggressive failure
        assert refdb is not None and refdb.strip(
        ) != '', "Error: client sent empty norm DB"
        assert refid is not None and refid.strip(
        ) != '', "Error: client sent empty norm ID"
        # (the reference string is allwed to be empty)

    # Process deletions and updates of existing normalizations
    for old_norm_id, old_norm in list(old_norms.items()):
        if old_norm_id not in new_norms:
            # Delete IDs that were referenced previously but not anymore
            ann_obj.del_annotation(old_norm)
            mods.deletion(old_norm)
        else:
            # If the text value of the normalizations is different, update
            # (this shouldn't happen on a stable norm DB, but anyway)
            new_reftext = new_norms[old_norm_id]
            if old_norm.reftext != new_reftext:
                old = str(old_norm)
                old_norm.reftext = new_reftext
                mods.change(old, old_norm)

    # Process new normalizations
    for new_norm_id, new_reftext in list(new_norms.items()):
        if new_norm_id not in old_norms:
            new_id = ann_obj.get_new_id('N')
            # TODO: avoid magic string value
            norm_type = 'Reference'
            new_norm = NormalizationAnnotation(new_id, norm_type, ann.id,
                                               new_norm_id[0], new_norm_id[1],
                                               '\t' + new_reftext)
            ann_obj.add_annotation(new_norm)
            mods.addition(new_norm)
Beispiel #2
0
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    print("tagger", tagger, file=sys.stderr)
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    path_join(real_directory(collection), document)

    # print("path_join(real_directory(collection), document)", path_join(real_directory(collection), document), file=sys.stderr)
    # print("tagger_token", tagger_token, file=sys.stderr)
    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:
        # print("ann_obj", document, file=sys.stderr)

        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
            # print("HTTPConnection", HTTPConnection, file=sys.stderr)
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            #   missing if you roll your own Python, for once we should not
            #   fail early since tagging is currently an edge case and we
            #   can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            service_url = url_soup.path + ('?' + url_soup.query
                                           if url_soup.query else '')
            try:
                # Note: Trout slapping for anyone sending Unicode objects here

                data = str(path_join(
                    real_directory(collection),
                    document)) + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                # print("data", type(data),data, file=sys.stderr)
                # print("data", ann_obj, file=sys.stderr)
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
                # httpConnection = http.client.HTTPConnection(url_soup.netloc)
                # httpConnection.request('GET', str(service_url), headers=req_headers)
                # response = httpConnection.getresponse()

            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()
            # print("resp-------------", resp.read(), file=sys.stderr)

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
            # print("json_resp", json_resp, file=sys.stderr)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        cidmap = {}

        # print("json_resp.items:::::::::::::", json_resp.items(), file=sys.stderr)
        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            # print("json_resp.items:::::::::::::", offsets, file=sys.stderr)
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(
                offsets
            ), 'Tagger response has different numbers of offsets and texts'

            start, end = offsets[0]
            text = texts[0]
            # print("offsets, _type, texts, text:", offsets, _type, texts, text, file=sys.stderr)
            _id = ann_obj.get_new_id('T')
            print("_id", _id, file=sys.stderr)
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
Beispiel #3
0
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.itervalues()
                     if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError, e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp


def link(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for linker_token, _, _, linker_service_url in pconf.get_linker_config():
        if tagger == linker_token:
            break
    else: