def search_event(collection, document, scope="collection",
                 concordancing="false", context_length=50,
                 text_match="word", match_case="false",
                 type=None, trigger=DEFAULT_EMPTY_STRING, args={}):
    directory = collection

    # Interpret JSON booleans
    concordancing = _to_bool(concordancing)
    match_case = _to_bool(match_case)

    ann_objs = __doc_or_dir_to_annotations(directory, document, scope)

    restrict_types = []
    if type is not None and type != "":
        restrict_types.append(type)

    # to get around lack of JSON object parsing in dispatcher, parse
    # args here.
    # TODO: parse JSON in dispatcher; this is far from the right place to do this..
    from jsonwrap import loads
    args = loads(args)

    matches = search_anns_for_event(ann_objs, trigger, args,
                                    restrict_types=restrict_types,
                                    text_match=text_match,
                                    match_case=match_case)

    results = format_results(matches, concordancing, context_length)
    results['collection'] = directory

    return results
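
# _to_bool is used above but not defined in this excerpt. A minimal sketch,
# assuming the dispatcher hands JSON booleans through as the strings
# 'true'/'false' (as the default argument values suggest); this is not the
# verbatim original helper.
def _to_bool(value):
    # Accept both real booleans and their JSON string forms
    if value is True or value == 'true':
        return True
    if value is False or value == 'false':
        return False
    assert False, 'Failed to interpret "%s" as a boolean' % value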
def suggest_span_types(collection, document, start, end, text, model):
    pconf = ProjectConfiguration(real_directory(collection))
    for _, _, model_str, model_url in pconf.get_disambiguator_config():
        if model_str == model:
            break
    else:
        # We were unable to find a matching model
        raise SimSemConnectionNotConfiguredError

    try:
        quoted_text = quote_plus(text)
        resp = urlopen(model_url % quoted_text, None, QUERY_TIMEOUT)
    except URLError:
        # TODO: Could give more details
        raise SimSemConnectionError

    json = loads(resp.read())

    preds = json['result'][text.decode('utf-8')]

    selected_preds = []
    conf_sum = 0
    for cat, conf in preds:
        selected_preds.append((cat, conf, ))
        conf_sum += conf
        if conf_sum >= CUT_OFF:
            break

    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text, ] + [selected_preds, ])

    # array so that server can control presentation order in UI
    # independently from scores if needed
    return {
        'types': selected_preds,
        'collection': collection,  # echo for reference
        'document': document,
        'start': start,
        'end': end,
        'text': text,
    }
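
# Hypothetical caller sketch: the collection, document, span and model name
# below are illustrative values, not taken from this codebase. It shows how
# the truncated, order-preserving prediction list comes back to the caller
# once cumulative confidence reaches CUT_OFF.
def _print_suggestions():
    suggestions = suggest_span_types('/example', 'PMID-123', 0, 7,
                                     'protein', 'example-model')
    for cat, conf in suggestions['types']:
        print('%s\t%.3f' % (cat, conf))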
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(doc_path) as ann_obj:
        try:
            # Note: Can we actually fit a whole document in here?
            quoted_doc_text = quote_plus(ann_obj.get_document_text())
            resp = urlopen(tagger_service_url % quoted_doc_text, None,
                           QUERY_TIMEOUT)
        except URLError:
            raise TaggerConnectionError(tagger_token)

        # TODO: Check for errors
        json_resp = loads(resp.read())

        mods = ModificationTracker()

        for ann_data in json_resp.itervalues():
            offsets = ann_data['offsets']
            # Note: We do not support discontinuous spans at this point
            assert len(offsets) == 1, 'discontinuous/null spans'
            start, end = offsets[0]
            _id = ann_obj.get_new_id('T')
            tb = TextBoundAnnotationWithText(
                start, end, _id, ann_data['type'], ann_data['text'])
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
def save_web_page_import(url, docid, overwrite, collection=None):
    '''
    TODO: DOC:
    '''
    directory = collection

    if directory is None:
        dir_path = DATA_DIR
    else:
        # XXX: These "security" measures can surely be fooled
        if (directory.count('../') or directory == '..'):
            raise InvalidDirError(directory)
        dir_path = real_directory(directory)

    # Is the directory a directory and are we allowed to write?
    if not isdir(dir_path):
        raise InvalidDirError(dir_path)
    if not access(dir_path, W_OK):
        raise NoWritePermissionError(dir_path)

    base_path = join_path(dir_path, docid)
    txt_path = base_path + '.' + TEXT_FILE_SUFFIX
    ann_path = base_path + '.' + JOINED_ANN_FILE_SUFF

    # Before we proceed, verify that we are not overwriting
    for path in (txt_path, ann_path):
        if isfile(path):
            if not overwrite or overwrite == 'false':
                raise FileExistsError(path)
            remove(path)

    apiUrl = 'http://api-ie.qna.bf2.yahoo.com:4080/ie_ws/v1/ie_ws?url=' + url
    data = getApiData(apiUrl)
    # location = join_path(dir_path, 'input.json')
    # data = getFileData(location)

    try:
        json_resp = loads(data)
    except ValueError as e:
        raise FormatError(apiUrl, e)
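
# getApiData is called above but not defined in this excerpt. A minimal
# sketch, assuming it simply returns the response body of the IE web service;
# the name and contract are inferred from the call site, not confirmed.
def getApiData(url):
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2
    resp = urlopen(url)
    try:
        return resp.read()
    finally:
        resp.close()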
def search_event(collection, type=None, trigger=DEFAULT_EMPTY_STRING, args={}):
    directory = collection

    ann_objs = __directory_to_annotations(directory)

    restrict_types = []
    if type is not None and type != "":
        restrict_types.append(type)

    # to get around lack of JSON object parsing in dispatcher, parse
    # args here.
    # TODO: parse JSON in dispatcher; this is far from the right place to do this..
    from jsonwrap import loads
    args = loads(args)

    matches = search_anns_for_event(ann_objs, trigger, args,
                                    restrict_types=restrict_types)

    results = format_results(matches)
    results['collection'] = directory

    return results
        except SocketError, e:
            raise TaggerConnectionError(tagger_token, e)

        resp = conn.getresponse()

        # Did the request succeed?
        if resp.status != 200:
            raise TaggerConnectionError(tagger_token,
                                        '%s %s' % (resp.status, resp.reason))
        # Finally, we can read the response data
        resp_data = resp.read()
    finally:
        if conn is not None:
            conn.close()

    try:
        json_resp = loads(resp_data)
    except ValueError:
        raise InvalidTaggerResponseError(tagger_token, resp_data)

    mods = ModificationTracker()

    for ann_data in json_resp.itervalues():
        assert 'offsets' in ann_data, 'Tagger response lacks offsets'
        offsets = ann_data['offsets']
        assert 'type' in ann_data, 'Tagger response lacks type'
        _type = ann_data['type']
        assert 'texts' in ann_data, 'Tagger response lacks texts'
        texts = ann_data['texts']

        # sanity
        assert len(offsets) != 0, 'Tagger response has empty offsets'
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python, for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                # This variant prefixes the payload with the document path,
                # joined to the text by a "#*^$#" sentinel that the service
                # is presumably expected to split on.
                data = str(path_join(real_directory(collection),
                                     document)) + "#*^$#" + \
                    ann_obj.get_document_text()
                data = data.encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request('POST',
                             # As per: http://bugs.python.org/issue11898
                             # Force the url to be an ascii string
                             str(service_url),
                             data,
                             headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))

            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        cidmap = {}

        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), \
                'Tagger response has different numbers of offsets and texts'

            start, end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')
            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
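
# For reference, the response shape the parsing code above expects, inferred
# from its assertions rather than from any tagger documentation: a JSON
# object mapping tagger-local ids to annotation records, e.g.
#
#     {
#         "T1": {"type": "Protein", "offsets": [[0, 7]], "texts": ["example"]},
#         "N1": {"type": "Reference", "target": "T1",
#                "refdb": "ExampleDB", "refid": "12345"}
#     }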
def tag(collection, document, tagger):
    pconf = ProjectConfiguration(real_directory(collection))
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            # missing if you roll your own Python, for once we should not
            # fail early since tagging is currently an edge case and we
            # can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            # a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request('POST',
                             # As per: http://bugs.python.org/issue11898
                             # Force the url to be an ascii string
                             str(service_url),
                             data,
                             headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)

            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))

            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        cidmap = {}

        for cid, ann in ((i, a) for i, a in json_resp.items()
                         if _is_textbound(a)):
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(offsets), \
                'Tagger response has different numbers of offsets and texts'

            start, end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in (a for a in json_resp.values() if _is_normalization(a)):
            try:
                _type = norm['type']
                target = norm['target']
                refdb = norm['refdb']
                refid = norm['refid']
            except KeyError as e:
                raise  # TODO

            _id = ann_obj.get_new_id('N')
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')
            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
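
# _is_textbound and _is_normalization are referenced in tag() but not defined
# in this excerpt. Plausible sketches, assuming annotation records are told
# apart by the keys the loops above rely on; these are assumptions, not the
# verbatim originals.
def _is_textbound(ann):
    # Text-bound annotations carry character offsets and surface texts
    return 'offsets' in ann and 'texts' in ann


def _is_normalization(ann):
    # Normalizations point at another annotation and an external DB entry
    return 'target' in ann and 'refdb' in ann and 'refid' in ann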