Beispiel #1
0
def search_event(collection, document, scope="collection",
                 concordancing="false", context_length=50,
                 text_match="word", match_case="false",
                 type=None, trigger=DEFAULT_EMPTY_STRING, args={}):
    """Search the given scope for events matching the trigger and
    argument constraints, returning formatted search results."""

    directory = collection

    # JSON booleans arrive as strings; coerce them to real bools
    concordancing = _to_bool(concordancing)
    match_case = _to_bool(match_case)

    ann_objs = __doc_or_dir_to_annotations(directory, document, scope)

    # Restrict to a single event type when one was supplied
    restrict_types = [] if type is None or type == "" else [type]

    # The dispatcher lacks JSON object parsing, so decode args here.
    # TODO: parse JSON in the dispatcher; this is far from the right place
    from jsonwrap import loads
    args = loads(args)

    matches = search_anns_for_event(ann_objs, trigger, args,
                                    restrict_types=restrict_types,
                                    text_match=text_match,
                                    match_case=match_case)

    results = format_results(matches, concordancing, context_length)
    results['collection'] = directory
    return results
Beispiel #2
0
def suggest_span_types(collection, document, start, end, text, model):
    """Ask the SimSem disambiguation service configured for ``model``
    for span-type suggestions covering ``text``."""

    pconf = ProjectConfiguration(real_directory(collection))
    # Pick the service URL registered under this model name
    urls = (url for _, _, name, url in pconf.get_disambiguator_config()
            if name == model)
    try:
        model_url = next(urls)
    except StopIteration:
        # We were unable to find a matching model
        raise SimSemConnectionNotConfiguredError

    try:
        resp = urlopen(model_url % quote_plus(text), None, QUERY_TIMEOUT)
    except URLError:
        # TODO: Could give more details
        raise SimSemConnectionError

    json = loads(resp.read())
    preds = json['result'][text.decode('utf-8')]

    # Keep the highest-ranked predictions until their cumulative
    # confidence reaches the cut-off
    selected_preds = []
    conf_sum = 0
    for cat, conf in preds:
        selected_preds.append((cat, conf))
        conf_sum += conf
        if conf_sum >= CUT_OFF:
            break

    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text, selected_preds])

    # array so that server can control presentation order in UI
    # independently from scores if needed
    return {
        'types': selected_preds,
        'collection': collection,  # echo for reference
        'document': document,
        'start': start,
        'end': end,
        'text': text,
    }
Beispiel #3
0
def tag(collection, document, tagger):
    """Run the configured tagger service over ``document`` and record the
    text-bound annotations it returns."""
    pconf = ProjectConfiguration(real_directory(collection))
    # Locate the service URL configured for this tagger token
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:

        try:
            # Note: Can we actually fit a whole document in here?
            quoted_doc_text = quote_plus(ann_obj.get_document_text())
            service_query = tagger_service_url % quoted_doc_text
            resp = urlopen(service_query, None, QUERY_TIMEOUT)
        except URLError:
            raise TaggerConnectionError(tagger_token)

        # TODO: Check for errors
        json_resp = loads(resp.read())

        mods = ModificationTracker()

        for ann_data in json_resp.itervalues():
            offsets = ann_data['offsets']
            # Note: We do not support discontinuous spans at this point
            assert len(offsets) == 1, 'discontinuous/null spans'
            start, end = offsets[0]
            new_id = ann_obj.get_new_id('T')
            tb = TextBoundAnnotationWithText(start, end, new_id,
                                             ann_data['type'],
                                             ann_data['text'])
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
Beispiel #4
0
def save_web_page_import(url, docid, overwrite, collection=None):
    '''
    Import a web page into the collection by fetching ``url`` through the
    Yahoo information-extraction web service.

    Verifies that the target collection directory is writable and that no
    existing document named ``docid`` would be clobbered (unless
    ``overwrite`` is truthy), then fetches and parses the service response.

    Raises InvalidDirError, NoWritePermissionError, FileExistsError or
    FormatError on failure.
    '''

    directory = collection

    if directory is None:
        # No collection given: import into the data root
        dir_path = DATA_DIR
    else:
        #XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)

        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            # 'false' is the JSON string form of a disabled overwrite flag
            if not overwrite or overwrite == 'false':
                raise FileExistsError(path)
            remove(path)

    apiUrl = 'http://api-ie.qna.bf2.yahoo.com:4080/ie_ws/v1/ie_ws?url=' + url
    data = getApiData(apiUrl)

    # location = join_path(dir_path, 'input.json')
    # data = getFileData(location)

    try:
        json_resp = loads(data)
    except ValueError, e:
        # The service did not return valid JSON
        raise FormatError(apiUrl, e)
Beispiel #5
0
def suggest_span_types(collection, document, start, end, text, model):
    """Return span-type suggestions for ``text`` from the disambiguation
    service registered under ``model``."""

    pconf = ProjectConfiguration(real_directory(collection))
    for _, _, model_str, model_url in pconf.get_disambiguator_config():
        if model_str == model:
            break
    else:
        # No disambiguator service is configured under this model name
        raise SimSemConnectionNotConfiguredError

    try:
        quoted = quote_plus(text)
        resp = urlopen(model_url % quoted, None, QUERY_TIMEOUT)
    except URLError:
        # TODO: could report more details about the failure
        raise SimSemConnectionError

    predictions = loads(resp.read())['result'][text.decode('utf-8')]

    # Accumulate predictions until their total confidence passes CUT_OFF
    selected_preds = []
    total_conf = 0
    for category, confidence in predictions:
        selected_preds.append((category, confidence))
        total_conf += confidence
        if total_conf >= CUT_OFF:
            break

    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text] + [selected_preds])

    # array so that server can control presentation order in UI
    # independently from scores if needed
    return {'types': selected_preds,
            'collection': collection,  # echoed back for reference
            'document': document,
            'start': start,
            'end': end,
            'text': text}
Beispiel #6
0
def search_event(collection, type=None, trigger=DEFAULT_EMPTY_STRING, args={}):
    """Search every document in ``collection`` for events matching the
    trigger and argument constraints."""
    ann_objs = __directory_to_annotations(collection)

    # Restrict to a single event type when one was supplied
    restrict_types = [] if type is None or type == "" else [type]

    # to get around lack of JSON object parsing in dispatcher, parse
    # args here.
    # TODO: parse JSON in dispatcher; this is far from the right place to do this..
    from jsonwrap import loads
    args = loads(args)

    matches = search_anns_for_event(ann_objs, trigger, args,
                                    restrict_types=restrict_types)

    results = format_results(matches)
    results['collection'] = collection
    return results
Beispiel #7
0
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(tagger_token,
                        '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()

        for ann_data in json_resp.itervalues():
            assert 'offsets' in ann_data, 'Tagger response lacks offsets'
            offsets = ann_data['offsets']
            assert 'type' in ann_data, 'Tagger response lacks type'
            _type = ann_data['type']
            assert 'texts' in ann_data, 'Tagger response lacks texts'
            texts = ann_data['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
Beispiel #8
0
def tag(collection, document, tagger):
    """Tag ``document`` in ``collection`` using the external tagger
    service configured under the ``tagger`` token, and add the returned
    annotations to the document.

    Args:
        collection: collection (directory) the document belongs to.
        document: document name within the collection.
        tagger: token identifying the tagger in the annotator config.

    Returns:
        The ModificationTracker JSON response, extended with the full
        annotation state of the document under 'annotations'.

    Raises:
        UnknownTaggerError: ``tagger`` is not configured.
        InvalidConnectionSchemeError: the service URL is not http(s).
        TaggerConnectionError: the service could not be reached or
            returned a non-200 status.
        InvalidTaggerResponseError: the service response was not JSON.
    """
    pconf = ProjectConfiguration(real_directory(collection))
    # Locate the service URL configured for this tagger token
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:

        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            #   missing if you roll your own Python, for once we should not
            #   fail early since tagging is currently an edge case and we
            #   can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            service_url = url_soup.path + ('?' + url_soup.query
                                           if url_soup.query else '')
            try:
                # The service expects "<doc path>#*^$#<doc text>" as the
                #   request body, UTF-8 encoded
                data = str(doc_path) + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        # Maps tagger-assigned ids to the ids we allocate, so that
        #   normalizations can refer to the new text-bound annotations
        cidmap = {}

        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(
                offsets
            ), 'Tagger response has different numbers of offsets and texts'

            # Validates that the first offset is a (start, end) pair
            start, end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError:
                raise  # TODO: report malformed normalization to the client

            _id = ann_obj.get_new_id('N')
            # Translate the tagger's id into the one we assigned above
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
Beispiel #9
0
                    headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
        finally:
            if conn is not None:
                conn.close()

        resp_data = resp.read()
        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()

        for ann_data in json_resp.itervalues():
            assert 'offsets' in ann_data, 'Tagger response lacks offsets'
            offsets = ann_data['offsets']
            assert 'type' in ann_data, 'Tagger response lacks type'
            _type = ann_data['type']
            assert 'texts' in ann_data, 'Tagger response lacks texts'
            texts = ann_data['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
Beispiel #10
0
def tag(collection, document, tagger):
    """Run the external tagger service named ``tagger`` over ``document``
    and add the annotations it returns to the document.

    Returns the ModificationTracker JSON response, extended with the full
    annotation state of the document under 'annotations'.

    Raises UnknownTaggerError, InvalidConnectionSchemeError,
    TaggerConnectionError or InvalidTaggerResponseError on failure.
    """
    pconf = ProjectConfiguration(real_directory(collection))
    # Locate the service URL configured for this tagger token
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    # NOTE(review): result is discarded; presumably a leftover statement
    path_join(real_directory(collection), document)

    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:

        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            #   missing if you roll your own Python, for once we should not
            #   fail early since tagging is currently an edge case and we
            #   can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                # The request body is the raw document text, UTF-8 encoded
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request('POST',
                             # As per: http://bugs.python.org/issue11898
                             # Force the url to be an ascii string
                             str(service_url),
                             data,
                             headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' %
                    (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            # Always release the connection, even on failure
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        # Maps tagger-assigned ids to the ids we allocate, so that
        # normalizations can refer to the new text-bound annotations
        cidmap = {}

        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(
                offsets), 'Tagger response has different numbers of offsets and texts'

            # NOTE(review): start/end are unused below; the unpack also
            # validates that the first offset is a (start, end) pair
            start, end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            # Texts past the first are joined into the trailing comment field
            tb = TextBoundAnnotationWithText(
                offsets, _id, _type, text, " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values()
                     if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            # Translate the tagger's id into the one we assigned above
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp